diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 20 | ||||
-rw-r--r-- | mm/dmapool.c | 10 | ||||
-rw-r--r-- | mm/huge_memory.c | 3 | ||||
-rw-r--r-- | mm/kasan/Makefile | 3 | ||||
-rw-r--r-- | mm/kasan/kasan.c | 356 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 51 | ||||
-rw-r--r-- | mm/kasan/quarantine.c | 328 | ||||
-rw-r--r-- | mm/kasan/report.c | 262 | ||||
-rw-r--r-- | mm/kmemcheck.c | 3 | ||||
-rw-r--r-- | mm/kmemleak.c | 18 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memblock.c | 31 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 3 | ||||
-rw-r--r-- | mm/mempolicy.c | 7 | ||||
-rw-r--r-- | mm/mempool.c | 24 | ||||
-rw-r--r-- | mm/mlock.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 55 | ||||
-rw-r--r-- | mm/mprotect.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 3 | ||||
-rw-r--r-- | mm/page_alloc.c | 69 | ||||
-rw-r--r-- | mm/percpu.c | 4 | ||||
-rw-r--r-- | mm/shmem.c | 13 | ||||
-rw-r--r-- | mm/slab.c | 367 | ||||
-rw-r--r-- | mm/slab.h | 2 | ||||
-rw-r--r-- | mm/slab_common.c | 20 | ||||
-rw-r--r-- | mm/slub.c | 237 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 8 | ||||
-rw-r--r-- | mm/sparse.c | 8 | ||||
-rw-r--r-- | mm/swapfile.c | 3 | ||||
-rw-r--r-- | mm/usercopy.c | 278 | ||||
-rw-r--r-- | mm/util.c | 1 | ||||
-rw-r--r-- | mm/vmalloc.c | 4 | ||||
-rw-r--r-- | mm/vmstat.c | 69 |
33 files changed, 1819 insertions, 447 deletions
diff --git a/mm/Makefile b/mm/Makefile index 2ed43191fc3b..ec91e951da28 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,8 +3,27 @@ # KASAN_SANITIZE_slab_common.o := n +KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n +# Since __builtin_frame_address does work as used, disable the warning. +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address) + +# These files are disabled because they produce non-interesting and/or +# flaky coverage that is not a function of syscall inputs. E.g. slab is out of +# free pages, or a task is migrated between nodes. +KCOV_INSTRUMENT_slab_common.o := n +KCOV_INSTRUMENT_slob.o := n +KCOV_INSTRUMENT_slab.o := n +KCOV_INSTRUMENT_slub.o := n +KCOV_INSTRUMENT_page_alloc.o := n +KCOV_INSTRUMENT_debug-pagealloc.o := n +KCOV_INSTRUMENT_kmemleak.o := n +KCOV_INSTRUMENT_kmemcheck.o := n +KCOV_INSTRUMENT_memcontrol.o := n +KCOV_INSTRUMENT_mmzone.o := n +KCOV_INSTRUMENT_vmstat.o := n + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ @@ -81,3 +100,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o diff --git a/mm/dmapool.c b/mm/dmapool.c index 57312b5d6e12..2821500e8123 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -452,13 +452,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, "dma_pool_free %s, dma %Lx " - "already free\n", pool->name, - (unsigned long long)dma); + dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); else - printk(KERN_ERR "dma_pool_free %s, dma %Lx " - "already free\n", pool->name, - (unsigned long long)dma); + printk(KERN_ERR "dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); return; } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 465786cd6490..b2d56b1c3276 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -135,8 +135,7 @@ static void set_recommended_min_free_kbytes(void) if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) - pr_info("raising min_free_kbytes from %d to %lu " - "to help transparent hugepage allocations\n", + pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); min_free_kbytes = recommended_min; diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 64710148941e..e1100433cefe 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,8 +1,9 @@ KASAN_SANITIZE := n +KCOV_INSTRUMENT := n CFLAGS_REMOVE_kasan.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -obj-y := kasan.o report.o kasan_init.o +obj-y := kasan.o report.o kasan_init.o quarantine.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index b7397b459960..2bfdb3c23e72 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -17,9 +17,12 @@ #define DISABLE_BRANCH_PROFILING #include <linux/export.h> +#include <linux/interrupt.h> #include <linux/init.h> +#include <linux/kasan.h> #include <linux/kernel.h> #include <linux/kmemleak.h> +#include <linux/linkage.h> #include <linux/memblock.h> #include <linux/memory.h> #include <linux/mm.h> @@ -31,11 +34,21 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/vmalloc.h> -#include <linux/kasan.h> +#include <linux/bug.h> #include "kasan.h" #include "../slab.h" +void kasan_enable_current(void) +{ + current->kasan_depth++; +} + +void kasan_disable_current(void) +{ + current->kasan_depth--; +} + /* * Poisons the shadow memory for 'size' bytes starting from 'addr'. * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. @@ -60,6 +73,47 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } +static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) +{ + void *base = task_stack_page(task); + size_t size = sp - base; + + kasan_unpoison_shadow(base, size); +} + +/* Unpoison the entire stack for a task. */ +void kasan_unpoison_task_stack(struct task_struct *task) +{ + __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE); +} + +/* Unpoison the stack for the current task beyond a watermark sp value. */ +asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) +{ + /* + * Calculate the task stack base address. Avoid using 'current' + * because this function is called by early resume code which hasn't + * yet set up the percpu register (%gs). + */ + void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); + + kasan_unpoison_shadow(base, watermark - base); +} + +/* + * Clear all poison for the region between the current SP and a provided + * watermark value, as is sometimes required prior to hand-crafted asm function + * returns in the middle of functions. + */ +void kasan_unpoison_stack_above_sp_to(const void *watermark) +{ + const void *sp = __builtin_frame_address(0); + size_t size = watermark - sp; + + if (WARN_ON(sp > watermark)) + return; + kasan_unpoison_shadow(sp, size); +} /* * All functions below always inlined so compiler could @@ -252,32 +306,48 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) return memory_is_poisoned_n(addr, size); } - -static __always_inline void check_memory_region(unsigned long addr, - size_t size, bool write) +static __always_inline void check_memory_region_inline(unsigned long addr, + size_t size, bool write, + unsigned long ret_ip) { if (unlikely(size == 0)) return; if (unlikely((void *)addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - kasan_report(addr, size, write, _RET_IP_); + kasan_report(addr, size, write, ret_ip); return; } if (likely(!memory_is_poisoned(addr, size))) return; - kasan_report(addr, size, write, _RET_IP_); + kasan_report(addr, size, write, ret_ip); } -void __asan_loadN(unsigned long addr, size_t size); -void __asan_storeN(unsigned long addr, size_t size); +static void check_memory_region(unsigned long addr, + size_t size, bool write, + unsigned long ret_ip) +{ + check_memory_region_inline(addr, size, write, ret_ip); +} + +void kasan_check_read(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, false, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_read); + +void kasan_check_write(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, true, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_write); #undef memset void *memset(void *addr, int c, size_t len) { - __asan_storeN((unsigned long)addr, len); + check_memory_region((unsigned long)addr, len, true, _RET_IP_); return __memset(addr, c, len); } @@ -285,8 +355,8 @@ void *memset(void *addr, int c, size_t len) #undef memmove void *memmove(void *dest, const void *src, size_t len) { - __asan_loadN((unsigned long)src, len); - __asan_storeN((unsigned long)dest, len); + check_memory_region((unsigned long)src, len, false, _RET_IP_); + check_memory_region((unsigned long)dest, len, true, _RET_IP_); return __memmove(dest, src, len); } @@ -294,8 +364,8 @@ void *memmove(void *dest, const void *src, size_t len) #undef memcpy void *memcpy(void *dest, const void *src, size_t len) { - __asan_loadN((unsigned long)src, len); - __asan_storeN((unsigned long)dest, len); + check_memory_region((unsigned long)src, len, false, _RET_IP_); + check_memory_region((unsigned long)dest, len, true, _RET_IP_); return __memcpy(dest, src, len); } @@ -314,6 +384,80 @@ void kasan_free_pages(struct page *page, unsigned int order) KASAN_FREE_PAGE); } +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. + */ +static size_t optimal_redzone(size_t object_size) +{ + int rz = + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; + return rz; +} + +void kasan_cache_create(struct kmem_cache *cache, size_t *size, + unsigned long *flags) +{ + int redzone_adjust; + int orig_size = *size; + + /* Add alloc meta. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* Add free meta. */ + if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + } + redzone_adjust = optimal_redzone(cache->object_size) - + (*size - cache->object_size); + + if (redzone_adjust > 0) + *size += redzone_adjust; + + *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size + + optimal_redzone(cache->object_size))); + + /* + * If the metadata doesn't fit, don't enable KASAN at all. + */ + if (*size <= cache->kasan_info.alloc_meta_offset || + *size <= cache->kasan_info.free_meta_offset) { + cache->kasan_info.alloc_meta_offset = 0; + cache->kasan_info.free_meta_offset = 0; + *size = orig_size; + return; + } + + *flags |= SLAB_KASAN; +} + +void kasan_cache_shrink(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + +void kasan_cache_shutdown(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + +size_t kasan_metadata_size(struct kmem_cache *cache) +{ + return (cache->kasan_info.alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + (cache->kasan_info.free_meta_offset ? + sizeof(struct kasan_free_meta) : 0); +} + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -333,12 +477,84 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) KASAN_KMALLOC_REDZONE); } -void kasan_slab_alloc(struct kmem_cache *cache, void *object) +static inline int in_irqentry_text(unsigned long ptr) { - kasan_kmalloc(cache, object, cache->object_size); + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); } -void kasan_slab_free(struct kmem_cache *cache, void *object) +static inline void filter_irq_stacks(struct stack_trace *trace) +{ + int i; + + if (!trace->nr_entries) + return; + for (i = 0; i < trace->nr_entries; i++) + if (in_irqentry_text(trace->entries[i])) { + /* Include the irqentry function into the stack. */ + trace->nr_entries = i + 1; + break; + } +} + +static inline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[KASAN_STACK_DEPTH]; + struct stack_trace trace = { + .nr_entries = 0, + .entries = entries, + .max_entries = KASAN_STACK_DEPTH, + .skip = 0 + }; + + save_stack_trace(&trace); + filter_irq_stacks(&trace); + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries-1] == ULONG_MAX) + trace.nr_entries--; + + return depot_save_stack(&trace, flags); +} + +static inline void set_track(struct kasan_track *track, gfp_t flags) +{ + track->pid = current->pid; + track->stack = save_stack(flags); +} + +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + return (void *)object + cache->kasan_info.free_meta_offset; +} + +void kasan_init_slab_obj(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_info; + + if (!(cache->flags & SLAB_KASAN)) + return; + + alloc_info = get_alloc_info(cache, object); + __memset(alloc_info, 0, sizeof(*alloc_info)); +} + +void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) +{ + kasan_kmalloc(cache, object, cache->object_size, flags); +} + +static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) { unsigned long size = cache->object_size; unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); @@ -350,11 +566,40 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } -void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) +bool kasan_slab_free(struct kmem_cache *cache, void *object) +{ + s8 shadow_byte; + + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + return false; + + shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); + if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { + kasan_report_double_free(cache, object, + __builtin_return_address(1)); + return true; + } + + kasan_poison_slab_free(cache, object); + + if (unlikely(!(cache->flags & SLAB_KASAN))) + return false; + + set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); + quarantine_put(get_free_info(cache, object), cache); + return true; +} + +void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, + gfp_t flags) { unsigned long redzone_start; unsigned long redzone_end; + if (gfpflags_allow_blocking(flags)) + quarantine_reduce(); + if (unlikely(object == NULL)) return; @@ -366,15 +611,21 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) kasan_unpoison_shadow(object, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); + + if (cache->flags & SLAB_KASAN) + set_track(&get_alloc_info(cache, object)->alloc_track, flags); } EXPORT_SYMBOL(kasan_kmalloc); -void kasan_kmalloc_large(const void *ptr, size_t size) +void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { struct page *page; unsigned long redzone_start; unsigned long redzone_end; + if (gfpflags_allow_blocking(flags)) + quarantine_reduce(); + if (unlikely(ptr == NULL)) return; @@ -388,7 +639,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size) KASAN_PAGE_REDZONE); } -void kasan_krealloc(const void *object, size_t size) +void kasan_krealloc(const void *object, size_t size, gfp_t flags) { struct page *page; @@ -398,12 +649,12 @@ void kasan_krealloc(const void *object, size_t size) page = virt_to_head_page(object); if (unlikely(!PageSlab(page))) - kasan_kmalloc_large(object, size); + kasan_kmalloc_large(object, size, flags); else - kasan_kmalloc(page->slab_cache, object, size); + kasan_kmalloc(page->slab_cache, object, size, flags); } -void kasan_kfree(void *ptr) +void kasan_poison_kfree(void *ptr) { struct page *page; @@ -413,7 +664,7 @@ void kasan_kfree(void *ptr) kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), KASAN_FREE_PAGE); else - kasan_slab_free(page->slab_cache, ptr); + kasan_poison_slab_free(page->slab_cache, ptr); } void kasan_kfree_large(const void *ptr) @@ -484,22 +735,22 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size) } EXPORT_SYMBOL(__asan_unregister_globals); -#define DEFINE_ASAN_LOAD_STORE(size) \ - void __asan_load##size(unsigned long addr) \ - { \ - check_memory_region(addr, size, false); \ - } \ - EXPORT_SYMBOL(__asan_load##size); \ - __alias(__asan_load##size) \ - void __asan_load##size##_noabort(unsigned long); \ - EXPORT_SYMBOL(__asan_load##size##_noabort); \ - void __asan_store##size(unsigned long addr) \ - { \ - check_memory_region(addr, size, true); \ - } \ - EXPORT_SYMBOL(__asan_store##size); \ - __alias(__asan_store##size) \ - void __asan_store##size##_noabort(unsigned long); \ +#define DEFINE_ASAN_LOAD_STORE(size) \ + void __asan_load##size(unsigned long addr) \ + { \ + check_memory_region_inline(addr, size, false, _RET_IP_);\ + } \ + EXPORT_SYMBOL(__asan_load##size); \ + __alias(__asan_load##size) \ + void __asan_load##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_load##size##_noabort); \ + void __asan_store##size(unsigned long addr) \ + { \ + check_memory_region_inline(addr, size, true, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__asan_store##size); \ + __alias(__asan_store##size) \ + void __asan_store##size##_noabort(unsigned long); \ EXPORT_SYMBOL(__asan_store##size##_noabort) DEFINE_ASAN_LOAD_STORE(1); @@ -510,7 +761,7 @@ DEFINE_ASAN_LOAD_STORE(16); void __asan_loadN(unsigned long addr, size_t size) { - check_memory_region(addr, size, false); + check_memory_region(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_loadN); @@ -520,7 +771,7 @@ EXPORT_SYMBOL(__asan_loadN_noabort); void __asan_storeN(unsigned long addr, size_t size) { - check_memory_region(addr, size, true); + check_memory_region(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__asan_storeN); @@ -532,6 +783,25 @@ EXPORT_SYMBOL(__asan_storeN_noabort); void __asan_handle_no_return(void) {} EXPORT_SYMBOL(__asan_handle_no_return); +/* Emitted by compiler to poison large objects when they go out of scope. */ +void __asan_poison_stack_memory(const void *addr, size_t size) +{ + /* + * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded + * by redzones, so we simply round up size to simplify logic. + */ + kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), + KASAN_USE_AFTER_SCOPE); +} +EXPORT_SYMBOL(__asan_poison_stack_memory); + +/* Emitted by compiler to unpoison large objects when they go into scope. */ +void __asan_unpoison_stack_memory(const void *addr, size_t size) +{ + kasan_unpoison_shadow(addr, size); +} +EXPORT_SYMBOL(__asan_unpoison_stack_memory); + #ifdef CONFIG_MEMORY_HOTPLUG static int kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) @@ -541,8 +811,8 @@ static int kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - pr_err("WARNING: KASAN doesn't support memory hot-add\n"); - pr_err("Memory hot-add will be disabled\n"); + pr_info("WARNING: KASAN doesn't support memory hot-add\n"); + pr_info("Memory hot-add will be disabled\n"); hotplug_memory_notifier(kasan_mem_notifier, 0); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 37ff0ab6a8ff..1229298cce64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -2,6 +2,7 @@ #define __MM_KASAN_KASAN_H #include <linux/kasan.h> +#include <linux/stackdepot.h> #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) @@ -20,6 +21,7 @@ #define KASAN_STACK_MID 0xF2 #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 +#define KASAN_USE_AFTER_SCOPE 0xF8 /* Don't break randconfig/all*config builds */ #ifndef KASAN_ABI_VERSION @@ -57,18 +59,57 @@ struct kasan_global { #endif }; +/** + * Structures to keep alloc and free tracks * + */ + +#define KASAN_STACK_DEPTH 64 + +struct kasan_track { + u32 pid; + depot_stack_handle_t stack; +}; + +struct kasan_alloc_meta { + struct kasan_track alloc_track; + struct kasan_track free_track; +}; + +struct qlist_node { + struct qlist_node *next; +}; +struct kasan_free_meta { + /* This field is used while the object is in the quarantine. + * Otherwise it might be used for the allocator freelist. + */ + struct qlist_node quarantine_link; +}; + +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object); + static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool kasan_report_enabled(void) -{ - return !current->kasan_depth; -} - void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); +void kasan_report_double_free(struct kmem_cache *cache, void *object, + void *ip); + +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); +void quarantine_reduce(void); +void quarantine_remove_cache(struct kmem_cache *cache); +#else +static inline void quarantine_put(struct kasan_free_meta *info, + struct kmem_cache *cache) { } +static inline void quarantine_reduce(void) { } +static inline void quarantine_remove_cache(struct kmem_cache *cache) { } +#endif #endif diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c new file mode 100644 index 000000000000..3a8ddf8baf7d --- /dev/null +++ b/mm/kasan/quarantine.c @@ -0,0 +1,328 @@ +/* + * KASAN quarantine. + * + * Author: Alexander Potapenko <glider@google.com> + * Copyright (C) 2016 Google, Inc. + * + * Based on code by Dmitry Chernenkov. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#include <linux/gfp.h> +#include <linux/hash.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/percpu.h> +#include <linux/printk.h> +#include <linux/shrinker.h> +#include <linux/slab.h> +#include <linux/srcu.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "../slab.h" +#include "kasan.h" + +/* Data structure and operations for quarantine queues. */ + +/* + * Each queue is a signle-linked list, which also stores the total size of + * objects inside of it. + */ +struct qlist_head { + struct qlist_node *head; + struct qlist_node *tail; + size_t bytes; +}; + +#define QLIST_INIT { NULL, NULL, 0 } + +static bool qlist_empty(struct qlist_head *q) +{ + return !q->head; +} + +static void qlist_init(struct qlist_head *q) +{ + q->head = q->tail = NULL; + q->bytes = 0; +} + +static void qlist_put(struct qlist_head *q, struct qlist_node *qlink, + size_t size) +{ + if (unlikely(qlist_empty(q))) + q->head = qlink; + else + q->tail->next = qlink; + q->tail = qlink; + qlink->next = NULL; + q->bytes += size; +} + +static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) +{ + if (unlikely(qlist_empty(from))) + return; + + if (qlist_empty(to)) { + *to = *from; + qlist_init(from); + return; + } + + to->tail->next = from->head; + to->tail = from->tail; + to->bytes += from->bytes; + + qlist_init(from); +} + +#define QUARANTINE_PERCPU_SIZE (1 << 20) +#define QUARANTINE_BATCHES \ + (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS) + +/* + * The object quarantine consists of per-cpu queues and a global queue, + * guarded by quarantine_lock. + */ +static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); + +/* Round-robin FIFO array of batches. */ +static struct qlist_head global_quarantine[QUARANTINE_BATCHES]; +static int quarantine_head; +static int quarantine_tail; +/* Total size of all objects in global_quarantine across all batches. */ +static unsigned long quarantine_size; +static DEFINE_SPINLOCK(quarantine_lock); +DEFINE_STATIC_SRCU(remove_cache_srcu); + +/* Maximum size of the global queue. */ +static unsigned long quarantine_max_size; + +/* + * Target size of a batch in global_quarantine. + * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM. + */ +static unsigned long quarantine_batch_size; + +/* + * The fraction of physical memory the quarantine is allowed to occupy. + * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep + * the ratio low to avoid OOM. + */ +#define QUARANTINE_FRACTION 32 + +static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) +{ + return virt_to_head_page(qlink)->slab_cache; +} + +static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) +{ + struct kasan_free_meta *free_info = + container_of(qlink, struct kasan_free_meta, + quarantine_link); + + return ((void *)free_info) - cache->kasan_info.free_meta_offset; +} + +static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) +{ + void *object = qlink_to_object(qlink, cache); + unsigned long flags; + + if (IS_ENABLED(CONFIG_SLAB)) + local_irq_save(flags); + + ___cache_free(cache, object, _THIS_IP_); + + if (IS_ENABLED(CONFIG_SLAB)) + local_irq_restore(flags); +} + +static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) +{ + struct qlist_node *qlink; + + if (unlikely(qlist_empty(q))) + return; + + qlink = q->head; + while (qlink) { + struct kmem_cache *obj_cache = + cache ? cache : qlink_to_cache(qlink); + struct qlist_node *next = qlink->next; + + qlink_free(qlink, obj_cache); + qlink = next; + } + qlist_init(q); +} + +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head *q; + struct qlist_head temp = QLIST_INIT; + + /* + * Note: irq must be disabled until after we move the batch to the + * global quarantine. Otherwise quarantine_remove_cache() can miss + * some objects belonging to the cache if they are in our local temp + * list. quarantine_remove_cache() executes on_each_cpu() at the + * beginning which ensures that it either sees the objects in per-cpu + * lists or in the global quarantine. + */ + local_irq_save(flags); + + q = this_cpu_ptr(&cpu_quarantine); + qlist_put(q, &info->quarantine_link, cache->size); + if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { + qlist_move_all(q, &temp); + + spin_lock(&quarantine_lock); + WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); + qlist_move_all(&temp, &global_quarantine[quarantine_tail]); + if (global_quarantine[quarantine_tail].bytes >= + READ_ONCE(quarantine_batch_size)) { + int new_tail; + + new_tail = quarantine_tail + 1; + if (new_tail == QUARANTINE_BATCHES) + new_tail = 0; + if (new_tail != quarantine_head) + quarantine_tail = new_tail; + } + spin_unlock(&quarantine_lock); + } + + local_irq_restore(flags); +} + +void quarantine_reduce(void) +{ + size_t total_size, new_quarantine_size, percpu_quarantines; + unsigned long flags; + int srcu_idx; + struct qlist_head to_free = QLIST_INIT; + + if (likely(READ_ONCE(quarantine_size) <= + READ_ONCE(quarantine_max_size))) + return; + + /* + * srcu critical section ensures that quarantine_remove_cache() + * will not miss objects belonging to the cache while they are in our + * local to_free list. srcu is chosen because (1) it gives us private + * grace period domain that does not interfere with anything else, + * and (2) it allows synchronize_srcu() to return without waiting + * if there are no pending read critical sections (which is the + * expected case). + */ + srcu_idx = srcu_read_lock(&remove_cache_srcu); + spin_lock_irqsave(&quarantine_lock, flags); + + /* + * Update quarantine size in case of hotplug. Allocate a fraction of + * the installed memory to quarantine minus per-cpu queue limits. + */ + total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + QUARANTINE_FRACTION; + percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); + new_quarantine_size = (total_size < percpu_quarantines) ? + 0 : total_size - percpu_quarantines; + WRITE_ONCE(quarantine_max_size, new_quarantine_size); + /* Aim at consuming at most 1/2 of slots in quarantine. */ + WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE, + 2 * total_size / QUARANTINE_BATCHES)); + + if (likely(quarantine_size > quarantine_max_size)) { + qlist_move_all(&global_quarantine[quarantine_head], &to_free); + WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes); + quarantine_head++; + if (quarantine_head == QUARANTINE_BATCHES) + quarantine_head = 0; + } + + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, NULL); + srcu_read_unlock(&remove_cache_srcu, srcu_idx); +} + +static void qlist_move_cache(struct qlist_head *from, + struct qlist_head *to, + struct kmem_cache *cache) +{ + struct qlist_node *curr; + + if (unlikely(qlist_empty(from))) + return; + + curr = from->head; + qlist_init(from); + while (curr) { + struct qlist_node *next = curr->next; + struct kmem_cache *obj_cache = qlink_to_cache(curr); + + if (obj_cache == cache) + qlist_put(to, curr, obj_cache->size); + else + qlist_put(from, curr, obj_cache->size); + + curr = next; + } +} + +static void per_cpu_remove_cache(void *arg) +{ + struct kmem_cache *cache = arg; + struct qlist_head to_free = QLIST_INIT; + struct qlist_head *q; + + q = this_cpu_ptr(&cpu_quarantine); + qlist_move_cache(q, &to_free, cache); + qlist_free_all(&to_free, cache); +} + +/* Free all quarantined objects belonging to cache. */ +void quarantine_remove_cache(struct kmem_cache *cache) +{ + unsigned long flags, i; + struct qlist_head to_free = QLIST_INIT; + + /* + * Must be careful to not miss any objects that are being moved from + * per-cpu list to the global quarantine in quarantine_put(), + * nor objects being freed in quarantine_reduce(). on_each_cpu() + * achieves the first goal, while synchronize_srcu() achieves the + * second. + */ + on_each_cpu(per_cpu_remove_cache, cache, 1); + + spin_lock_irqsave(&quarantine_lock, flags); + for (i = 0; i < QUARANTINE_BATCHES; i++) { + if (qlist_empty(&global_quarantine[i])) + continue; + qlist_move_cache(&global_quarantine[i], &to_free, cache); + /* Scanning whole quarantine can take a while. */ + spin_unlock_irqrestore(&quarantine_lock, flags); + cond_resched(); + spin_lock_irqsave(&quarantine_lock, flags); + } + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, cache); + + synchronize_srcu(&remove_cache_srcu); +} diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b4e31f78ae69..04bb1d3eb9ec 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -13,12 +13,15 @@ * */ +#include <linux/bitops.h> #include <linux/ftrace.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/printk.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/stackdepot.h> #include <linux/stacktrace.h> #include <linux/string.h> #include <linux/types.h> @@ -48,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size) return first_bad_addr; } -static void print_error_description(struct kasan_access_info *info) +static bool addr_has_shadow(struct kasan_access_info *info) +{ + return (info->access_addr >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +} + +static const char *get_shadow_bug_type(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; u8 *shadow_addr; @@ -90,14 +99,44 @@ static void print_error_description(struct kasan_access_info *info) case KASAN_KMALLOC_FREE: bug_type = "use-after-free"; break; + case KASAN_USE_AFTER_SCOPE: + bug_type = "use-after-scope"; + break; } - pr_err("BUG: KASAN: %s in %pS at addr %p\n", - bug_type, (void *)info->ip, - info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); + return bug_type; +} + +static const char *get_wild_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +static const char *get_bug_type(struct kasan_access_info *info) +{ + if (addr_has_shadow(info)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + +static void print_error_description(struct kasan_access_info *info) +{ + const char *bug_type = get_bug_type(info); + + pr_err("BUG: KASAN: %s in %pS\n", + bug_type, (void *)info->ip); + pr_err("%s of size %zu at addr %p by task %s/%d\n", + info->is_write ? "Write" : "Read", info->access_size, + info->access_addr, current->comm, task_pid_nr(current)); } static inline bool kernel_or_module_addr(const void *addr) @@ -116,39 +155,119 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -static void print_address_description(struct kasan_access_info *info) +static DEFINE_SPINLOCK(report_lock); + +static void kasan_start_report(unsigned long *flags) +{ + /* + * Make sure we don't end up in loop. + */ + kasan_disable_current(); + spin_lock_irqsave(&report_lock, *flags); + pr_err("==================================================================\n"); +} + +static void kasan_end_report(unsigned long *flags) +{ + pr_err("==================================================================\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + spin_unlock_irqrestore(&report_lock, *flags); + if (panic_on_warn) + panic("panic_on_warn set ...\n"); + kasan_enable_current(); +} + +static void print_track(struct kasan_track *track, const char *prefix) { - const void *addr = info->access_addr; + pr_err("%s by task %u:\n", prefix, track->pid); + if (track->stack) { + struct stack_trace trace; + depot_fetch_stack(track->stack, &trace); + print_stack_trace(&trace, 0); + } else { + pr_err("(stack is not available)\n"); + } +} + +static struct page *addr_to_page(const void *addr) +{ if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) { - struct page *page = virt_to_head_page(addr); + (addr < high_memory)) + return virt_to_head_page(addr); + return NULL; +} - if (PageSlab(page)) { - void *object; - struct kmem_cache *cache = page->slab_cache; - void *last_object; +static void describe_object_addr(struct kmem_cache *cache, void *object, + const void *addr) +{ + unsigned long access_addr = (unsigned long)addr; + unsigned long object_addr = (unsigned long)object; + const char *rel_type; + int rel_bytes; - object = virt_to_obj(cache, page_address(page), addr); - last_object = page_address(page) + - page->objects * cache->size; + pr_err("The buggy address belongs to the object at %p\n" + " which belongs to the cache %s of size %d\n", + object, cache->name, cache->object_size); - if (unlikely(object > last_object)) - object = last_object; /* we hit into padding */ + if (!addr) + return; - object_err(cache, page, object, - "kasan: bad access detected"); - return; - } - dump_page(page, "kasan: bad access detected"); + if (access_addr < object_addr) { + rel_type = "to the left"; + rel_bytes = object_addr - access_addr; + } else if (access_addr >= object_addr + cache->object_size) { + rel_type = "to the right"; + rel_bytes = access_addr - (object_addr + cache->object_size); + } else { + rel_type = "inside"; + rel_bytes = access_addr - object_addr; } - if (kernel_or_module_addr(addr)) { - if (!init_task_stack_addr(addr)) - pr_err("Address belongs to variable %pS\n", addr); + pr_err("The buggy address is located %d bytes %s of\n" + " %d-byte region [%p, %p)\n", + rel_bytes, rel_type, cache->object_size, (void *)object_addr, + (void *)(object_addr + cache->object_size)); +} + +static void describe_object(struct kmem_cache *cache, void *object, + const void *addr) +{ + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + + if (cache->flags & SLAB_KASAN) { + print_track(&alloc_info->alloc_track, "Allocated"); + pr_err("\n"); + print_track(&alloc_info->free_track, "Freed"); + pr_err("\n"); } + describe_object_addr(cache, object, addr); +} + +static void print_address_description(void *addr) +{ + struct page *page = addr_to_page(addr); + dump_stack(); + pr_err("\n"); + + if (page && PageSlab(page)) { + struct kmem_cache *cache = page->slab_cache; + void *object = nearest_obj(cache, page, addr); + + describe_object(cache, object, addr); + } + + if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { + pr_err("The buggy address belongs to the variable:\n"); + pr_err(" %pS\n", addr); + } + + if (page) { + pr_err("The buggy address belongs to the page:\n"); + dump_page(page, "kasan: bad access detected"); + } } static bool row_is_guilty(const void *row, const void *guilty) @@ -203,45 +322,72 @@ static void print_shadow_for_address(const void *addr) } } -static DEFINE_SPINLOCK(report_lock); +void kasan_report_double_free(struct kmem_cache *cache, void *object, + void *ip) +{ + unsigned long flags; + + kasan_start_report(&flags); + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); + pr_err("\n"); + print_address_description(object); + pr_err("\n"); + print_shadow_for_address(object); + kasan_end_report(&flags); +} static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; - const char *bug_type; - /* - * Make sure we don't end up in loop. - */ - kasan_disable_current(); - spin_lock_irqsave(&report_lock, flags); - pr_err("=================================" - "=================================\n"); - if (info->access_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { - if ((unsigned long)info->access_addr < PAGE_SIZE) - bug_type = "null-ptr-deref"; - else if ((unsigned long)info->access_addr < TASK_SIZE) - bug_type = "user-memory-access"; - else - bug_type = "wild-memory-access"; - pr_err("BUG: KASAN: %s on address %p\n", - bug_type, info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, - task_pid_nr(current)); + kasan_start_report(&flags); + + print_error_description(info); + pr_err("\n"); + + if (!addr_has_shadow(info)) { dump_stack(); } else { - print_error_description(info); - print_address_description(info); + print_address_description((void *)info->access_addr); + pr_err("\n"); print_shadow_for_address(info->first_bad_addr); } - pr_err("=================================" - "=================================\n"); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - spin_unlock_irqrestore(&report_lock, flags); - kasan_enable_current(); + + kasan_end_report(&flags); +} + +static unsigned long kasan_flags; + +#define KASAN_BIT_REPORTED 0 +#define KASAN_BIT_MULTI_SHOT 1 + +bool kasan_save_enable_multi_shot(void) +{ + return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); + +void kasan_restore_multi_shot(bool enabled) +{ + if (!enabled) + clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); + +static int __init kasan_set_multi_shot(char *str) +{ + set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); + return 1; +} +__setup("kasan_multi_shot", kasan_set_multi_shot); + +static inline bool kasan_report_enabled(void) +{ + if (current->kasan_depth) + return false; + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } void kasan_report(unsigned long addr, size_t size, diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index cab58bb592d8..e6347772bbda 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -20,8 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); if (!shadow) { if (printk_ratelimit()) - printk(KERN_ERR "kmemcheck: failed to allocate " - "shadow bitmap\n"); + printk(KERN_ERR "kmemcheck: failed to allocate shadow bitmap\n"); return; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 84c93879aa5d..6e5996937712 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -597,8 +597,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, else if (parent->pointer + parent->size <= ptr) link = &parent->rb_node.rb_right; else { - kmemleak_stop("Cannot insert 0x%lx into the object " - "search tree (overlaps existing)\n", + kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", ptr); /* * No need for parent->lock here since "parent" cannot @@ -671,8 +670,8 @@ static void delete_object_part(unsigned long ptr, size_t size) object = find_and_remove_object(ptr, 1); if (!object) { #ifdef DEBUG - kmemleak_warn("Partially freeing unknown object at 0x%08lx " - "(size %zu)\n", ptr, size); + kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", + ptr, size); #endif return; } @@ -718,8 +717,8 @@ static void paint_ptr(unsigned long ptr, int color) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("Trying to color unknown object " - "at 0x%08lx as %s\n", ptr, + kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n", + ptr, (color == KMEMLEAK_GREY) ? "Grey" : (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); return; @@ -1466,8 +1465,8 @@ static void kmemleak_scan(void) if (new_leaks) { kmemleak_found_leaks = true; - pr_info("%d new suspected memory leaks (see " - "/sys/kernel/debug/kmemleak)\n", new_leaks); + pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n", + new_leaks); } } @@ -1800,8 +1799,7 @@ static void kmemleak_do_cleanup(struct work_struct *work) if (!kmemleak_found_leaks) __kmemleak_do_cleanup(); else - pr_info("Kmemleak disabled without freeing internal data. " - "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); + pr_info("Kmemleak disabled without freeing internal data. Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n"); } static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); diff --git a/mm/madvise.c b/mm/madvise.c index f548c66154ee..d1d09bdf9a72 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -104,7 +104,7 @@ static long madvise_behavior(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_get_anon_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/memblock.c b/mm/memblock.c index f8fab45bfdb7..99c7f493d45f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -241,8 +241,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * so we use WARN_ONCE() here to see the stack trace if * fail happens. */ - WARN_ONCE(1, "memblock: bottom-up allocation failed, " - "memory hotunplug may be affected\n"); + WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n"); } return __memblock_find_range_top_down(start, end, size, align, nid, @@ -822,6 +821,17 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); } +/** + * memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); +} /** * __next_reserved_mem_region - next function for for_each_reserved_region() @@ -913,6 +923,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1022,6 +1036,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1519,6 +1537,15 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +int __init_memblock memblock_is_map_memory(phys_addr_t addr) +{ + int i = memblock_search(&memblock.memory, addr); + + if (i == -1) + return false; + return !memblock_is_nomap(&memblock.memory.regions[i]); +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0addef5f8aa3..38c81212d9c3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1943,8 +1943,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; - pr_warn("removing memory fails, because memory " - "[%pa-%pa] is onlined\n", + pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 44134ba6fb53..63c19c24dfe1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -720,7 +720,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, + vma_get_anon_name(vma)); if (prev) { vma = prev; next = vma->vm_next; @@ -2565,9 +2566,7 @@ static void __init check_numabalancing_enable(void) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { - pr_info("%s automatic NUMA balancing. " - "Configure with numa_balancing= or the " - "kernel.numa_balancing sysctl", + pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } diff --git a/mm/mempool.c b/mm/mempool.c index 7924f4f58a6d..5ba6c8b3b814 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -104,20 +104,16 @@ static inline void poison_element(mempool_t *pool, void *element) static void kasan_poison_element(mempool_t *pool, void *element) { - if (pool->alloc == mempool_alloc_slab) - kasan_slab_free(pool->pool_data, element); - if (pool->alloc == mempool_kmalloc) - kasan_kfree(element); + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + kasan_poison_kfree(element); if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } -static void kasan_unpoison_element(mempool_t *pool, void *element) +static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) { - if (pool->alloc == mempool_alloc_slab) - kasan_slab_alloc(pool->pool_data, element); - if (pool->alloc == mempool_kmalloc) - kasan_krealloc(element, (size_t)pool->pool_data); + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + kasan_unpoison_slab(element); if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } @@ -130,12 +126,12 @@ static void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool) +static void *remove_element(mempool_t *pool, gfp_t flags) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - kasan_unpoison_element(pool, element); + kasan_unpoison_element(pool, element, flags); check_element(pool, element); return element; } @@ -154,7 +150,7 @@ void mempool_destroy(mempool_t *pool) return; while (pool->curr_nr) { - void *element = remove_element(pool); + void *element = remove_element(pool, GFP_KERNEL); pool->free(element, pool->pool_data); } kfree(pool->elements); @@ -250,7 +246,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { - element = remove_element(pool); + element = remove_element(pool, GFP_KERNEL); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); @@ -336,7 +332,7 @@ repeat_alloc: spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { - element = remove_element(pool); + element = remove_element(pool, gfp_temp); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); diff --git a/mm/mlock.c b/mm/mlock.c index b03f160c1bdd..966dbdcfc322 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -514,7 +514,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_get_anon_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index 3074dbcd9621..f325aa33f327 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -58,6 +58,18 @@ #define arch_rebalance_pgtables(addr, len) (addr) #endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS +const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; +const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; +int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS +const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; +const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; +int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; +#endif + + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -939,7 +951,8 @@ again: remove_next = 1 + (end > next->vm_end); */ static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char __user *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -957,6 +970,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; + if (vma_get_anon_name(vma) != anon_name) + return 0; return 1; } @@ -989,9 +1004,10 @@ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char __user *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -1010,9 +1026,10 @@ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char __user *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1023,9 +1040,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, } /* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), + * figure out whether that can be merged with its predecessor or its + * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when @@ -1056,7 +1073,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char __user *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1084,7 +1102,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, + anon_name)) { /* * OK, it can. Can we now merge in the successor as well? */ @@ -1093,7 +1112,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx) && + vm_userfaultfd_ctx, + anon_name) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1115,7 +1135,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, + anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); @@ -1631,7 +1652,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -2681,6 +2702,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return 0; } +EXPORT_SYMBOL(do_munmap); int vm_munmap(unsigned long start, size_t len) { @@ -2714,8 +2736,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", current->comm, current->pid); if (prot) @@ -2854,7 +2875,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -3019,7 +3040,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_get_anon_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index a277f3412a5d..fcd678c3bd24 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -343,7 +343,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_get_anon_name(vma)); if (*pprev) { vma = *pprev; goto success; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bc781cdc0d04..7e5d3467d6d2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -386,8 +386,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_score_adj=%hd\n", + pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, oc->order, current->signal->oom_score_adj); cpuset_print_current_mems_allowed(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd75e27c9b40..5b7402abc1d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -238,9 +238,21 @@ compound_page_dtor * const compound_page_dtors[] = { #endif }; +/* + * Try to keep at least this much lowmem free. Do not allow normal + * allocations below this point, only high priority ones. Automatically + * tuned according to the amount of memory in the system. + */ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +/* + * Extra memory for the system to try freeing. Used to temporarily + * free memory, to make space for new workloads. Anyone can allocate + * down to the min watermarks controlled by min_free_kbytes above. + */ +int extra_free_kbytes = 0; + static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; @@ -3983,8 +3995,7 @@ static int __parse_numa_zonelist_order(char *s) user_zonelist_order = ZONELIST_ORDER_ZONE; } else { printk(KERN_WARNING - "Ignoring invalid numa_zonelist_order value: " - "%s\n", s); + "Ignoring invalid numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4449,12 +4460,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. " - "Total pages: %ld\n", - nr_online_nodes, - zonelist_order_name[current_zonelist_order], - page_group_by_mobility_disabled ? "off" : "on", - vm_total_pages); + pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + nr_online_nodes, + zonelist_order_name[current_zonelist_order], + page_group_by_mobility_disabled ? "off" : "on", + vm_total_pages); #ifdef CONFIG_NUMA pr_info("Policy zone: %s\n", zone_names[policy_zone]); #endif @@ -5925,22 +5935,21 @@ void __init mem_init_print_info(const char *str) #undef adj_init_size - pr_info("Memory: %luK/%luK available " - "(%luK kernel code, %luK rwdata, %luK rodata, " - "%luK init, %luK bss, %luK reserved, %luK cma-reserved" + pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" #ifdef CONFIG_HIGHMEM - ", %luK highmem" + ", %luK highmem" #endif - "%s%s)\n", - nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), - totalcma_pages << (PAGE_SHIFT-10), + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT - 10), + physpages << (PAGE_SHIFT - 10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT-10), + totalhigh_pages << (PAGE_SHIFT - 10), #endif - str ? ", " : "", str ? str : ""); + str ? ", " : "", str ? str : ""); } /** @@ -6084,6 +6093,7 @@ static void setup_per_zone_lowmem_reserve(void) static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; struct zone *zone; unsigned long flags; @@ -6095,11 +6105,14 @@ static void __setup_per_zone_wmarks(void) } for_each_zone(zone) { - u64 tmp; + u64 min, low; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->managed_pages; - do_div(tmp, lowmem_pages); + min = (u64)pages_min * zone->managed_pages; + do_div(min, lowmem_pages); + low = (u64)pages_low * zone->managed_pages; + do_div(low, vm_total_pages); + if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -6120,11 +6133,13 @@ static void __setup_per_zone_wmarks(void) * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->watermark[WMARK_MIN] = tmp; + zone->watermark[WMARK_MIN] = min; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + + low + (min >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + + low + (min >> 1); __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6247,7 +6262,7 @@ core_initcall(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so * that we can call two helper functions whenever min_free_kbytes - * changes. + * or extra_free_kbytes changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) diff --git a/mm/percpu.c b/mm/percpu.c index 1c784df3bdfe..c5f2a724101a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -890,8 +890,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, size = ALIGN(size, 2); if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { - WARN(true, "illegal size (%zu) or align (%zu) for " - "percpu allocation\n", size, align); + WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n", + size, align); return NULL; } diff --git a/mm/shmem.c b/mm/shmem.c index d902b413941a..a06b31b8ff8d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3405,6 +3405,14 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags } EXPORT_SYMBOL_GPL(shmem_file_setup); +void shmem_set_file(struct vm_area_struct *vma, struct file *file) +{ + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = file; + vma->vm_ops = &shmem_vm_ops; +} + /** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap_pgoff @@ -3424,10 +3432,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (IS_ERR(file)) return PTR_ERR(file); - if (vma->vm_file) - fput(vma->vm_file); - vma->vm_file = file; - vma->vm_ops = &shmem_vm_ops; + shmem_set_file(vma, file); return 0; } diff --git a/mm/slab.c b/mm/slab.c index 92df044f5e00..6180cba0df78 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -390,36 +390,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#define OBJECT_FREE (0) -#define OBJECT_ACTIVE (1) - #ifdef CONFIG_DEBUG_SLAB_LEAK -static void set_obj_status(struct page *page, int idx, int val) +static inline bool is_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; - status[idx] = val; + return atomic_read(&cachep->store_user_clean) == 1; } -static inline unsigned int get_obj_status(struct page *page, int idx) +static inline void set_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; + atomic_set(&cachep->store_user_clean, 1); +} - return status[idx]; +static inline void set_store_user_dirty(struct kmem_cache *cachep) +{ + if (is_store_user_clean(cachep)) + atomic_set(&cachep->store_user_clean, 0); } #else -static inline void set_obj_status(struct page *page, int idx, int val) {} +static inline void set_store_user_dirty(struct kmem_cache *cachep) {} #endif @@ -480,9 +470,6 @@ static size_t calculate_freelist_size(int nr_objs, size_t align) size_t freelist_size; freelist_size = nr_objs * sizeof(freelist_idx_t); - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size += nr_objs * sizeof(char); - if (align) freelist_size = ALIGN(freelist_size, align); @@ -495,10 +482,7 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size, int nr_objs; size_t remained_size; size_t freelist_size; - int extra_space = 0; - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - extra_space = sizeof(char); /* * Ignore padding for the initial guess. The padding * is at most @align-1 bytes, and @buffer_size is at @@ -507,7 +491,7 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size, * into the memory allocation when taking the padding * into account. */ - nr_objs = slab_size / (buffer_size + idx_size + extra_space); + nr_objs = slab_size / (buffer_size + idx_size); /* * This calculated number will be either the right @@ -1672,6 +1656,14 @@ static void kmem_rcu_free(struct rcu_head *head) } #if DEBUG +static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) +{ + if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && + (cachep->size % PAGE_SIZE) == 0) + return true; + + return false; +} #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, @@ -1705,6 +1697,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, } *addr++ = 0x87654321; } + +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) +{ + if (!is_debug_pagealloc_cache(cachep)) + return; + + if (caller) + store_stackinfo(cachep, objp, caller); + + kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); +} + +#else +static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) {} + #endif static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) @@ -1735,11 +1744,9 @@ static void dump_line(char *data, int offset, int limit) if (bad_count == 1) { error ^= POISON_FREE; if (!(error & (error - 1))) { - printk(KERN_ERR "Single bit error detected. Probably " - "bad RAM.\n"); + printk(KERN_ERR "Single bit error detected. Probably bad RAM.\n"); #ifdef CONFIG_X86 - printk(KERN_ERR "Run memtest86+ or a similar memory " - "test tool.\n"); + printk(KERN_ERR "Run memtest86+ or a similar memory test tool.\n"); #else printk(KERN_ERR "Run a memory test tool.\n"); #endif @@ -1783,6 +1790,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) int size, i; int lines = 0; + if (is_debug_pagealloc_cache(cachep)) + return; + realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; @@ -1848,24 +1858,14 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (cachep->size % PAGE_SIZE == 0 && - OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "start of a freed object " - "was overwritten"); + slab_error(cachep, "start of a freed object was overwritten"); if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "end of a freed object " - "was overwritten"); + slab_error(cachep, "end of a freed object was overwritten"); } } } @@ -1947,16 +1947,13 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { - size_t freelist_size_per_obj = sizeof(freelist_idx_t); /* * Max number of objs-per-slab for caches which * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size_per_obj += sizeof(char); offslab_limit = size; - offslab_limit /= freelist_size_per_obj; + offslab_limit /= sizeof(freelist_idx_t); if (num > offslab_limit) break; @@ -2181,7 +2178,19 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else size += BYTES_PER_WORD; } -#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) +#endif + + kasan_cache_create(cachep, &size, &flags); + + size = ALIGN(size, cachep->align); + /* + * We should restrict the number of objects in a slab to implement + * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. + */ + if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) + size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); + +#if DEBUG /* * To activate debug pagealloc, off-slab management is necessary * requirement. In early phase of initialization, small sized slab @@ -2189,14 +2198,14 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * to check size >= 256. It guarantees that all necessary small * sized slab is initialized in current slab initialization sequence. */ - if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && + if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && + !slab_early_init && size >= kmalloc_size(INDEX_NODE) && size >= 256 && cachep->object_size > cache_line_size() && - ALIGN(size, cachep->align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); + size < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - size; size = PAGE_SIZE; } #endif -#endif /* * Determine if the slab management is 'on' or 'off' slab. @@ -2205,20 +2214,13 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && - !(flags & SLAB_NOLEAKTRACE)) + !(flags & SLAB_NOLEAKTRACE)) { /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). */ flags |= CFLGS_OFF_SLAB; - - size = ALIGN(size, cachep->align); - /* - * We should restrict the number of objects in a slab to implement - * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. - */ - if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) - size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); + } left_over = calculate_slab_order(cachep, size, cachep->align, flags); @@ -2239,15 +2241,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ freelist_size = calculate_freelist_size(cachep->num, 0); - -#ifdef CONFIG_PAGE_POISONING - /* If we're going to use the generic kernel_map_pages() - * poisoning, then it's going to smash the contents of - * the redzone and userword anyhow, so switch them off. - */ - if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); -#endif } cachep->colour_off = cache_line_size(); @@ -2263,7 +2256,19 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); - if (flags & CFLGS_OFF_SLAB) { +#if DEBUG + /* + * If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. + */ + if (IS_ENABLED(CONFIG_PAGE_POISONING) && + (cachep->flags & SLAB_POISON) && + is_debug_pagealloc_cache(cachep)) + cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif + + if (OFF_SLAB(cachep)) { cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); /* * This is a possibility for one of the kmalloc_{dma,}_caches. @@ -2482,17 +2487,14 @@ static inline void set_free_obj(struct page *page, ((freelist_idx_t *)(page->freelist))[idx] = val; } -static void cache_init_objs(struct kmem_cache *cachep, - struct page *page) +static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) { +#if DEBUG int i; for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); -#if DEBUG - /* need to poison the objs? */ - if (cachep->flags & SLAB_POISON) - poison_obj(cachep, objp, POISON_FREE); + kasan_init_slab_obj(cachep, objp); if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = NULL; @@ -2505,26 +2507,46 @@ static void cache_init_objs(struct kmem_cache *cachep, * cache which they are a constructor for. Otherwise, deadlock. * They must also be threaded. */ - if (cachep->ctor && !(cachep->flags & SLAB_POISON)) + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { + kasan_unpoison_object_data(cachep, + objp + obj_offset(cachep)); cachep->ctor(objp + obj_offset(cachep)); + kasan_poison_object_data( + cachep, objp + obj_offset(cachep)); + } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " end of an object"); + slab_error(cachep, "constructor overwrote the end of an object"); if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " start of an object"); + slab_error(cachep, "constructor overwrote the start of an object"); } - if ((cachep->size % PAGE_SIZE) == 0 && - OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); -#else - if (cachep->ctor) - cachep->ctor(objp); + /* need to poison the objs? */ + if (cachep->flags & SLAB_POISON) { + poison_obj(cachep, objp, POISON_FREE); + slab_kernel_map(cachep, objp, 0, 0); + } + } #endif - set_obj_status(page, i, OBJECT_FREE); +} + +static void cache_init_objs(struct kmem_cache *cachep, + struct page *page) +{ + int i; + void *objp; + + cache_init_objs_debug(cachep, page); + + for (i = 0; i < cachep->num; i++) { + /* constructor could break poison info */ + if (DEBUG == 0 && cachep->ctor) { + objp = index_to_obj(cachep, page, i); + kasan_unpoison_object_data(cachep, objp); + cachep->ctor(objp); + kasan_poison_object_data(cachep, objp); + } + set_free_obj(page, i, i); } } @@ -2550,6 +2572,11 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); #endif +#if DEBUG + if (cachep->flags & SLAB_STORE_USER) + set_store_user_dirty(cachep); +#endif + return objp; } @@ -2566,8 +2593,8 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + printk(KERN_ERR "slab: double free detected in cache '%s', objp %p\n", + cachep->name, objp); BUG(); } } @@ -2652,6 +2679,7 @@ static int cache_grow(struct kmem_cache *cachep, slab_map_pages(cachep, page, freelist); + kasan_poison_slab(page); cache_init_objs(cachep, page); if (gfpflags_allow_blocking(local_flags)) @@ -2728,27 +2756,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; } - if (cachep->flags & SLAB_STORE_USER) + if (cachep->flags & SLAB_STORE_USER) { + set_store_user_dirty(cachep); *dbg_userword(cachep, objp) = (void *)caller; + } objnr = obj_to_index(cachep, page, objp); BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); - set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); - } else { - poison_obj(cachep, objp, POISON_FREE); - } -#else poison_obj(cachep, objp, POISON_FREE); -#endif + slab_kernel_map(cachep, objp, 0, caller); } return objp; } @@ -2870,20 +2890,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { - struct page *page; - if (!objp) return objp; if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) @@ -2892,8 +2903,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was overwritten"); + slab_error(cachep, "double free, or memory outside object was overwritten"); printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), @@ -2903,8 +2913,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone2(cachep, objp) = RED_ACTIVE; } - page = virt_to_head_page(objp); - set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -3368,6 +3376,16 @@ free_done: static inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { + /* Put the object into the quarantine, don't touch it for now. */ + if (kasan_slab_free(cachep, objp)) + return; + + ___cache_free(cachep, objp, caller); +} + +void ___cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); @@ -3408,6 +3426,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3436,6 +3455,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); return ret; @@ -3459,6 +3479,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -3477,6 +3498,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3489,11 +3511,15 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; + void *ret; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(cachep, flags, node, size); + ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); + kasan_kmalloc(cachep, ret, size, flags); + + return ret; } void *__kmalloc_node(size_t size, gfp_t flags, int node) @@ -3527,6 +3553,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = slab_alloc(cachep, flags, caller); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, size, cachep->size, flags); @@ -4001,8 +4028,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) unsigned long node_frees = cachep->node_frees; unsigned long overflows = cachep->node_overflow; - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " - "%4lu %4lu %4lu %4lu %4lu", + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees, overflows); @@ -4107,15 +4133,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i; + int i, j; + unsigned long v; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - if (get_obj_status(page, i) != OBJECT_ACTIVE) + bool active = true; + + for (j = page->active; j < c->num; j++) { + if (get_free_obj(page, j) == i) { + active = false; + break; + } + } + + if (!active) + continue; + + /* + * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table + * mapping is established when actual object allocation and + * we could mistakenly access the unmapped object in the cpu + * cache. + */ + if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v))) continue; - if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + if (!add_caller(n, v)) return; } } @@ -4151,21 +4196,31 @@ static int leaks_show(struct seq_file *m, void *p) if (!(cachep->flags & SLAB_RED_ZONE)) return 0; - /* OK, we can do it */ + /* + * Set store_user_clean and start to grab stored user information + * for all objects on this cache. If some alloc/free requests comes + * during the processing, information would be wrong so restart + * whole processing. + */ + do { + set_store_user_clean(cachep); + drain_cpu_caches(cachep); - x[1] = 0; + x[1] = 0; - for_each_kmem_cache_node(cachep, node, n) { + for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); - spin_lock_irq(&n->list_lock); + check_irq_on(); + spin_lock_irq(&n->list_lock); + + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); + spin_unlock_irq(&n->list_lock); + } + } while (!is_store_user_clean(cachep)); - list_for_each_entry(page, &n->slabs_full, lru) - handle_slab(x, cachep, page); - list_for_each_entry(page, &n->slabs_partial, lru) - handle_slab(x, cachep, page); - spin_unlock_irq(&n->list_lock); - } name = cachep->name; if (x[0] == x[1]) { /* Increase the buffer size */ @@ -4231,6 +4286,36 @@ static int __init slab_proc_init(void) module_init(slab_proc_init); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *cachep; + unsigned int objnr; + unsigned long offset; + + /* Find and validate object. */ + cachep = page->slab_cache; + objnr = obj_to_index(cachep, page, (void *)ptr); + BUG_ON(objnr >= cachep->num); + + /* Find offset within object. */ + offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + + /* Allow address range falling entirely within object size. */ + if (offset <= cachep->object_size && n <= cachep->object_size - offset) + return NULL; + + return cachep->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + /** * ksize - get the actual amount of memory allocated for a given object * @objp: Pointer to the object @@ -4245,10 +4330,18 @@ module_init(slab_proc_init); */ size_t ksize(const void *objp) { + size_t size; + BUG_ON(!objp); if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - return virt_to_cache(objp)->object_size; + size = virt_to_cache(objp)->object_size; + /* We assume that ksize callers could use the whole allocated area, + * so we need to unpoison this area. + */ + kasan_krealloc(objp, size, GFP_NOWAIT); + + return size; } EXPORT_SYMBOL(ksize); diff --git a/mm/slab.h b/mm/slab.h index 7b6087197997..66118e967e04 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -371,4 +371,6 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 01e7246de8df..1577d113fac5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache; */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB) + SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK) @@ -453,6 +453,9 @@ EXPORT_SYMBOL(kmem_cache_create); static int shutdown_cache(struct kmem_cache *s, struct list_head *release, bool *need_rcu_barrier) { + /* free asan quarantined objects */ + kasan_cache_shutdown(s); + if (__kmem_cache_shutdown(s) != 0) return -EBUSY; @@ -723,8 +726,8 @@ void kmem_cache_destroy(struct kmem_cache *s) err = shutdown_cache(s, &release, &need_rcu_barrier); if (err) { - pr_err("kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); + pr_err("kmem_cache_destroy %s: Slab cache still has objects\n", + s->name); dump_stack(); } out_unlock: @@ -750,6 +753,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); + kasan_cache_shrink(cachep); ret = __kmem_cache_shrink(cachep, false); put_online_mems(); put_online_cpus(); @@ -1010,7 +1014,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_kmem_pages(flags, order); ret = page ? page_address(page) : NULL; kmemleak_alloc(ret, size, 1, flags); - kasan_kmalloc_large(ret, size); + kasan_kmalloc_large(ret, size, flags); return ret; } EXPORT_SYMBOL(kmalloc_order); @@ -1044,13 +1048,11 @@ static void print_slabinfo_header(struct seq_file *m) #else seq_puts(m, "slabinfo - version: 2.1\n"); #endif - seq_puts(m, "# name <active_objs> <num_objs> <objsize> " - "<objperslab> <pagesperslab>"); + seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); #ifdef CONFIG_DEBUG_SLAB - seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " - "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); + seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); #endif seq_putc(m, '\n'); @@ -1191,7 +1193,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, ks = ksize(p); if (ks >= new_size) { - kasan_krealloc((void *)p, new_size); + kasan_krealloc((void *)p, new_size, flags); return (void *)p; } diff --git a/mm/slub.c b/mm/slub.c index c33b0e13cca7..191bbc3378d3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -224,24 +232,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -271,12 +261,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -295,6 +287,9 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->object_size; #endif + if (s->flags & SLAB_KASAN) + return s->object_size; + /* * If we have the need to store the freelist pointer * back there or track user information then we can @@ -456,13 +451,27 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ #if defined(CONFIG_SLUB_DEBUG_ON) static int slub_debug = DEBUG_DEFAULT_FLAGS; -#elif defined(CONFIG_KASAN) -static int slub_debug = SLAB_STORE_USER; #else static int slub_debug; #endif @@ -489,6 +498,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -628,7 +657,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -645,9 +676,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + off += kasan_metadata_size(s); + + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -677,6 +710,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -769,11 +805,13 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + off += kasan_metadata_size(s); + + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -818,6 +856,10 @@ static int check_object(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; } else { @@ -928,14 +970,14 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) max_objects = MAX_OBJS_PER_PAGE; if (page->objects != max_objects) { - slab_err(s, page, "Wrong number of objects. Found %d but " - "should be %d", page->objects, max_objects); + slab_err(s, page, "Wrong number of objects. Found %d but should be %d", + page->objects, max_objects); page->objects = max_objects; slab_fix(s, "Number of objects adjusted."); } if (page->inuse != page->objects - nr) { - slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, page->objects - nr); + slab_err(s, page, "Wrong object count. Counter is %d but counted were %d", + page->inuse, page->objects - nr); page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } @@ -1099,8 +1141,8 @@ next_object: if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { - slab_err(s, page, "Attempt to free object(0x%p) " - "outside of slab", object); + slab_err(s, page, "Attempt to free object(0x%p) outside of slab", + object); } else if (!page->slab_cache) { pr_err("SLUB <none>: no slab for object 0x%p.\n", object); @@ -1270,7 +1312,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); - kasan_kmalloc_large(ptr, size); + kasan_kmalloc_large(ptr, size, flags); } static inline void kfree_hook(const void *x) @@ -1304,13 +1346,15 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); - kasan_slab_alloc(s, object); + kasan_slab_alloc(s, object, flags); } memcg_kmem_put_cache(s); } -static inline void slab_free_hook(struct kmem_cache *s, void *x) +static inline void *slab_free_hook(struct kmem_cache *s, void *x) { + void *freeptr; + kmemleak_free_recursive(x, s->flags); /* @@ -1331,7 +1375,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); + freeptr = get_freepointer(s, x); + /* + * kasan_slab_free() may put x into memory quarantine, delaying its + * reuse. In this case the object's freelist pointer is changed. + */ kasan_slab_free(s, x); + return freeptr; } static inline void slab_free_freelist_hook(struct kmem_cache *s, @@ -1349,11 +1399,11 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, void *object = head; void *tail_obj = tail ? : head; + void *freeptr; do { - slab_free_hook(s, object); - } while ((object != tail_obj) && - (object = get_freepointer(s, object))); + freeptr = slab_free_hook(s, object); + } while ((object != tail_obj) && (object = freeptr)); #endif } @@ -1361,6 +1411,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, void *object) { setup_object_debug(s, page, object); + kasan_init_slab_obj(s, object); if (unlikely(s->ctor)) { kasan_unpoison_object_data(s, object); s->ctor(object); @@ -1468,7 +1519,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -2588,7 +2639,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -2616,7 +2667,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -2761,16 +2812,13 @@ slab_empty: * same page) possible by specifying head and tail ptr, plus objects * count (cnt). Bulk free indicated by tail pointer being set. */ -static __always_inline void slab_free(struct kmem_cache *s, struct page *page, - void *head, void *tail, int cnt, - unsigned long addr) +static __always_inline void do_slab_free(struct kmem_cache *s, + struct page *page, void *head, void *tail, + int cnt, unsigned long addr) { void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; - - slab_free_freelist_hook(s, head, tail); - redo: /* * Determine the currently cpus per cpu slab. @@ -2804,6 +2852,27 @@ redo: } +static __always_inline void slab_free(struct kmem_cache *s, struct page *page, + void *head, void *tail, int cnt, + unsigned long addr) +{ + slab_free_freelist_hook(s, head, tail); + /* + * slab_free_freelist_hook() could have put the items into quarantine. + * If so, no need to free them. + */ + if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) + return; + do_slab_free(s, page, head, tail, cnt, addr); +} + +#ifdef CONFIG_KASAN +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) +{ + do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); +} +#endif + void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); @@ -3160,7 +3229,8 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); @@ -3223,7 +3293,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; - unsigned long size = s->object_size; + size_t size = s->object_size; int order; /* @@ -3282,8 +3352,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the object. */ size += 2 * sizeof(struct track); +#endif - if (flags & SLAB_RED_ZONE) + kasan_cache_create(s, &size, &s->flags); +#ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -3292,6 +3365,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. */ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* @@ -3406,10 +3484,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%lu realsize=%u " - "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)s->size, s->size, - oo_order(s->oo), s->offset, flags); + panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", + s->name, (unsigned long)s->size, s->size, + oo_order(s->oo), s->offset, flags); return -EINVAL; } @@ -3533,7 +3610,7 @@ void *__kmalloc(size_t size, gfp_t flags) trace_kmalloc(_RET_IP_, ret, size, s->size, flags); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3578,13 +3655,53 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, flags); return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *s; + unsigned long offset; + size_t object_size; + + /* Find object and usable object size. */ + s = page->slab_cache; + object_size = slab_ksize(s); + + /* Reject impossible pointers. */ + if (ptr < page_address(page)) + return s->name; + + /* Find offset within object. */ + offset = (ptr - page_address(page)) % s->size; + + /* Adjust for redzone and reject if within the redzone. */ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) { + if (offset < s->red_left_pad) + return s->name; + offset -= s->red_left_pad; + } + + /* Allow address range falling entirely within object size. */ + if (offset <= object_size && n <= object_size - offset) + return NULL; + + return s->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + static size_t __ksize(const void *object) { struct page *page; @@ -3607,7 +3724,7 @@ size_t ksize(const void *object) size_t size = __ksize(object); /* We assume that ksize callers could use whole allocated area, so we need unpoison this area. */ - kasan_krealloc(object, size); + kasan_krealloc(object, size, GFP_NOWAIT); return size; } EXPORT_SYMBOL(ksize); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 4cba9c2783a1..c22065e02084 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -94,8 +94,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node, int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) - printk(KERN_WARNING "[%lx-%lx] potential offnode " - "page_structs\n", start, end - 1); + printk(KERN_WARNING "[%lx-%lx] potential offnode page_structs\n", + start, end - 1); } pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) @@ -220,8 +220,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + __func__); ms->section_mem_map = 0; } diff --git a/mm/sparse.c b/mm/sparse.c index d1b48b691ac8..1b7543a775a4 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -428,8 +428,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + __func__); ms->section_mem_map = 0; } } @@ -456,8 +456,8 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) if (map) return map; - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + __func__); ms->section_mem_map = 0; return NULL; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 8e25ff2b693a..623c77c1327b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2560,8 +2560,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); - pr_info("Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", diff --git a/mm/usercopy.c b/mm/usercopy.c new file mode 100644 index 000000000000..b34996a3860b --- /dev/null +++ b/mm/usercopy.c @@ -0,0 +1,278 @@ +/* + * This implements the various checks for CONFIG_HARDENED_USERCOPY*, + * which are designed to protect kernel memory from needless exposure + * and overwrite under many unintended conditions. This code is based + * on PAX_USERCOPY, which is: + * + * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source + * Security Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <asm/sections.h> + +enum { + BAD_STACK = -1, + NOT_STACK = 0, + GOOD_FRAME, + GOOD_STACK, +}; + +/* + * Checks if a given pointer and length is contained by the current + * stack frame (if possible). + * + * Returns: + * NOT_STACK: not at all on the stack + * GOOD_FRAME: fully within a valid stack frame + * GOOD_STACK: fully on the stack (when can't do frame-checking) + * BAD_STACK: error condition (invalid stack position or bad stack frame) + */ +static noinline int check_stack_object(const void *obj, unsigned long len) +{ + const void * const stack = task_stack_page(current); + const void * const stackend = stack + THREAD_SIZE; + int ret; + + /* Object is not on the stack at all. */ + if (obj + len <= stack || stackend <= obj) + return NOT_STACK; + + /* + * Reject: object partially overlaps the stack (passing the + * the check above means at least one end is within the stack, + * so if this check fails, the other end is outside the stack). + */ + if (obj < stack || stackend < obj + len) + return BAD_STACK; + + /* Check if object is safely within a valid frame. */ + ret = arch_within_stack_frames(stack, stackend, obj, len); + if (ret) + return ret; + + return GOOD_STACK; +} + +static void report_usercopy(const void *ptr, unsigned long len, + bool to_user, const char *type) +{ + pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n", + to_user ? "exposure" : "overwrite", + to_user ? "from" : "to", ptr, type ? : "unknown", len); + /* + * For greater effect, it would be nice to do do_group_exit(), + * but BUG() actually hooks all the lock-breaking and per-arch + * Oops code, so that is used here instead. + */ + BUG(); +} + +/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */ +static bool overlaps(const void *ptr, unsigned long n, unsigned long low, + unsigned long high) +{ + unsigned long check_low = (uintptr_t)ptr; + unsigned long check_high = check_low + n; + + /* Does not overlap if entirely above or entirely below. */ + if (check_low >= high || check_high <= low) + return false; + + return true; +} + +/* Is this address range in the kernel text area? */ +static inline const char *check_kernel_text_object(const void *ptr, + unsigned long n) +{ + unsigned long textlow = (unsigned long)_stext; + unsigned long texthigh = (unsigned long)_etext; + unsigned long textlow_linear, texthigh_linear; + + if (overlaps(ptr, n, textlow, texthigh)) + return "<kernel text>"; + + /* + * Some architectures have virtual memory mappings with a secondary + * mapping of the kernel text, i.e. there is more than one virtual + * kernel address that points to the kernel image. It is usually + * when there is a separate linear physical memory mapping, in that + * __pa() is not just the reverse of __va(). This can be detected + * and checked: + */ + textlow_linear = (unsigned long)__va(__pa(textlow)); + /* No different mapping: we're done. */ + if (textlow_linear == textlow) + return NULL; + + /* Check the secondary mapping... */ + texthigh_linear = (unsigned long)__va(__pa(texthigh)); + if (overlaps(ptr, n, textlow_linear, texthigh_linear)) + return "<linear kernel text>"; + + return NULL; +} + +static inline const char *check_bogus_address(const void *ptr, unsigned long n) +{ + /* Reject if object wraps past end of memory. */ + if ((unsigned long)ptr + n < (unsigned long)ptr) + return "<wrapped address>"; + + /* Reject if NULL or ZERO-allocation. */ + if (ZERO_OR_NULL_PTR(ptr)) + return "<null>"; + + return NULL; +} + +/* Checks for allocs that are marked in some way as spanning multiple pages. */ +static inline const char *check_page_span(const void *ptr, unsigned long n, + struct page *page, bool to_user) +{ +#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN + const void *end = ptr + n - 1; + struct page *endpage; + bool is_reserved, is_cma; + + /* + * Sometimes the kernel data regions are not marked Reserved (see + * check below). And sometimes [_sdata,_edata) does not cover + * rodata and/or bss, so check each range explicitly. + */ + + /* Allow reads of kernel rodata region (if not marked as Reserved). */ + if (ptr >= (const void *)__start_rodata && + end <= (const void *)__end_rodata) { + if (!to_user) + return "<rodata>"; + return NULL; + } + + /* Allow kernel data region (if not marked as Reserved). */ + if (ptr >= (const void *)_sdata && end <= (const void *)_edata) + return NULL; + + /* Allow kernel bss region (if not marked as Reserved). */ + if (ptr >= (const void *)__bss_start && + end <= (const void *)__bss_stop) + return NULL; + + /* Is the object wholly within one base page? */ + if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) == + ((unsigned long)end & (unsigned long)PAGE_MASK))) + return NULL; + + /* Allow if fully inside the same compound (__GFP_COMP) page. */ + endpage = virt_to_head_page(end); + if (likely(endpage == page)) + return NULL; + + /* + * Reject if range is entirely either Reserved (i.e. special or + * device memory), or CMA. Otherwise, reject since the object spans + * several independently allocated pages. + */ + is_reserved = PageReserved(page); + is_cma = is_migrate_cma_page(page); + if (!is_reserved && !is_cma) + return "<spans multiple pages>"; + + for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { + page = virt_to_head_page(ptr); + if (is_reserved && !PageReserved(page)) + return "<spans Reserved and non-Reserved pages>"; + if (is_cma && !is_migrate_cma_page(page)) + return "<spans CMA and non-CMA pages>"; + } +#endif + + return NULL; +} + +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* Verify object does not incorrectly span multiple pages. */ + return check_page_span(ptr, n, page, to_user); +} + +/* + * Validates that the given object is: + * - not bogus address + * - known-safe heap or stack object + * - not in kernel text + */ +void __check_object_size(const void *ptr, unsigned long n, bool to_user) +{ + const char *err; + + /* Skip all tests if size is zero. */ + if (!n) + return; + + /* Check for invalid addresses. */ + err = check_bogus_address(ptr, n); + if (err) + goto report; + + /* Check for bad heap object. */ + err = check_heap_object(ptr, n, to_user); + if (err) + goto report; + + /* Check for bad stack object. */ + switch (check_stack_object(ptr, n)) { + case NOT_STACK: + /* Object is not touching the current process stack. */ + break; + case GOOD_FRAME: + case GOOD_STACK: + /* + * Object is either in the correct frame (when it + * is possible to check) or just generally on the + * process stack (when frame checking not available). + */ + return; + default: + err = "<process stack>"; + goto report; + } + + /* Check for object in kernel to avoid text exposure. */ + err = check_kernel_text_object(ptr, n); + if (!err) + return; + +report: + report_usercopy(ptr, n, to_user, err); +} +EXPORT_SYMBOL(__check_object_size); diff --git a/mm/util.c b/mm/util.c index db39235970c6..4d59b4ffe66b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -372,6 +372,7 @@ struct address_space *page_mapping(struct page *page) return NULL; return page->mapping; } +EXPORT_SYMBOL(page_mapping); int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index de8e372ece04..6e7d513467c6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -470,8 +470,8 @@ overflow: goto retry; } if (printk_ratelimit()) - pr_warn("vmap allocation for size %lu failed: " - "use vmalloc=<size> to increase size.\n", size); + pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", + size); kfree(va); return ERR_PTR(-EBUSY); } diff --git a/mm/vmstat.c b/mm/vmstat.c index a2d70ef74db7..6af9bbad94c7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -460,7 +460,7 @@ static int fold_diff(int *diff) * * The function returns the number of global counters updated. */ -static int refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(bool do_pagesets) { struct zone *zone; int i; @@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void) #endif } } - cond_resched(); #ifdef CONFIG_NUMA - /* - * Deal with draining the remote pageset of this - * processor - * - * Check if there are pages remaining in this pageset - * if not then there is nothing to expire. - */ - if (!__this_cpu_read(p->expire) || + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(p->expire) || !__this_cpu_read(p->pcp.count)) - continue; + continue; - /* - * We never drain zones local to this processor. - */ - if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); - continue; - } + /* + * We never drain zones local to this processor. + */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(p->expire, 0); + continue; + } - if (__this_cpu_dec_return(p->expire)) - continue; + if (__this_cpu_dec_return(p->expire)) + continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(p->pcp.count)) { + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } } #endif } @@ -1393,7 +1395,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) { + if (refresh_cpu_vm_stats(true)) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1425,6 +1427,23 @@ static void vmstat_update(struct work_struct *w) } /* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + do { + if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + cancel_delayed_work(this_cpu_ptr(&vmstat_work)); + + } while (refresh_cpu_vm_stats(false)); +} + +/* * Check if the diffs for a certain cpu indicate that * an update is needed. */ @@ -1456,7 +1475,7 @@ static bool need_update(int cpu) */ static void vmstat_shepherd(struct work_struct *w); -static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); static void vmstat_shepherd(struct work_struct *w) { |