diff options
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/gup.c | 5 | ||||
| -rw-r--r-- | mm/kasan/report.c | 3 | ||||
| -rw-r--r-- | mm/memblock.c | 24 | ||||
| -rw-r--r-- | mm/memory-failure.c | 13 | ||||
| -rw-r--r-- | mm/memory.c | 38 | ||||
| -rw-r--r-- | mm/mlock.c | 5 | ||||
| -rw-r--r-- | mm/mmap.c | 160 | ||||
| -rw-r--r-- | mm/page_alloc.c | 25 | ||||
| -rw-r--r-- | mm/slub.c | 6 | ||||
| -rw-r--r-- | mm/swap_cgroup.c | 3 | ||||
| -rw-r--r-- | mm/truncate.c | 2 | ||||
| -rw-r--r-- | mm/vmalloc.c | 196 | ||||
| -rw-r--r-- | mm/vmstat.c | 27 |
13 files changed, 286 insertions, 221 deletions
@@ -312,11 +312,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, /* mlock all present pages, but do not fault in new pages */ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) return -ENOENT; - /* For mm_populate(), just skip the stack guard page. */ - if ((*flags & FOLL_POPULATE) && - (stack_guard_page_start(vma, address) || - stack_guard_page_end(vma, address + PAGE_SIZE))) - return -ENOENT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (nonblocking) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 12f222d0224b..b4e31f78ae69 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -13,6 +13,7 @@ * */ +#include <linux/ftrace.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/printk.h> @@ -251,6 +252,8 @@ void kasan_report(unsigned long addr, size_t size, if (likely(!kasan_report_enabled())) return; + disable_trace_on_warning(); + info.access_addr = (void *)addr; info.access_size = size; info.is_write = is_write; diff --git a/mm/memblock.c b/mm/memblock.c index 89e74fa82290..351a4840a407 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1688,6 +1688,30 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name } } +extern unsigned long __init_memblock +memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) +{ + struct memblock_type *type = &memblock.reserved; + unsigned long size = 0; + int idx; + + for (idx = 0; idx < type->cnt; idx++) { + struct memblock_region *rgn = &type->regions[idx]; + phys_addr_t start, end; + + if (rgn->base + rgn->size < start_addr) + continue; + if (rgn->base > end_addr) + continue; + + start = rgn->base; + end = start + rgn->size; + size += end - start; + } + + return size; +} + void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b3becd9941e9..14c1c5cb3ccf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1208,7 +1208,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * page_remove_rmap() in try_to_unmap_one(). So to determine page status * correctly, we save a copy of the page flags at this time. */ - page_flags = p->flags; + if (PageHuge(p)) + page_flags = hpage->flags; + else + page_flags = p->flags; /* * unpoison always clear PG_hwpoison inside page lock @@ -1619,12 +1622,8 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); - /* - * We know that soft_offline_huge_page() tries to migrate - * only one hugepage pointed to by hpage, so we need not - * run through the pagelist here. - */ - putback_active_hugepage(hpage); + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); if (ret > 0) ret = -EIO; } else { diff --git a/mm/memory.c b/mm/memory.c index b536e3d60fc7..8689df9b09d5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2663,40 +2663,6 @@ out_release: } /* - * This is like a special single-page "expand_{down|up}wards()", - * except we must first make sure that 'address{-|+}PAGE_SIZE' - * doesn't hit another vma. - */ -static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) -{ - address &= PAGE_MASK; - if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { - struct vm_area_struct *prev = vma->vm_prev; - - /* - * Is there a mapping abutting this one below? - * - * That's only ok if it's the same stack mapping - * that has gotten split.. - */ - if (prev && prev->vm_end == address) - return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; - - return expand_downwards(vma, address - PAGE_SIZE); - } - if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { - struct vm_area_struct *next = vma->vm_next; - - /* As VM_GROWSDOWN but s/below/above/ */ - if (next && next->vm_start == address + PAGE_SIZE) - return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; - - return expand_upwards(vma, address + PAGE_SIZE); - } - return 0; -} - -/* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. @@ -2716,10 +2682,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; - /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, address) < 0) - return VM_FAULT_SIGSEGV; - /* Use the zero-page for reads */ if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), diff --git a/mm/mlock.c b/mm/mlock.c index d843bc9d32dd..206e86b98a03 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -277,7 +277,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) { int i; int nr = pagevec_count(pvec); - int delta_munlocked; + int delta_munlocked = -nr; struct pagevec pvec_putback; int pgrescued = 0; @@ -297,6 +297,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) continue; else __munlock_isolation_failed(page); + } else { + delta_munlocked++; } /* @@ -308,7 +310,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; } - delta_munlocked = -nr + pagevec_count(&pvec_putback); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/mmap.c b/mm/mmap.c index ebc9d1923c3d..092729c2cb72 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -304,6 +304,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) unsigned long retval; unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; + struct vm_area_struct *next; unsigned long min_brk; bool populate; @@ -348,7 +349,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) } /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + next = find_vma(mm, oldbrk); + if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; /* Ok, looks good - let it rip. */ @@ -371,10 +373,22 @@ out: static long vma_compute_subtree_gap(struct vm_area_struct *vma) { - unsigned long max, subtree_gap; - max = vma->vm_start; - if (vma->vm_prev) - max -= vma->vm_prev->vm_end; + unsigned long max, prev_end, subtree_gap; + + /* + * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we + * allow two stack_guard_gaps between them here, and when choosing + * an unmapped area; whereas when expanding we only require one. + * That's a little inconsistent, but keeps the code here simpler. + */ + max = vm_start_gap(vma); + if (vma->vm_prev) { + prev_end = vm_end_gap(vma->vm_prev); + if (max > prev_end) + max -= prev_end; + else + max = 0; + } if (vma->vm_rb.rb_left) { subtree_gap = rb_entry(vma->vm_rb.rb_left, struct vm_area_struct, vm_rb)->rb_subtree_gap; @@ -467,7 +481,7 @@ static void validate_mm(struct mm_struct *mm) anon_vma_unlock_read(anon_vma); } - highest_address = vma->vm_end; + highest_address = vm_end_gap(vma); vma = vma->vm_next; i++; } @@ -636,7 +650,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_next) vma_gap_update(vma->vm_next); else - mm->highest_vm_end = vma->vm_end; + mm->highest_vm_end = vm_end_gap(vma); /* * vma->vm_prev wasn't known when we followed the rbtree to find the @@ -882,7 +896,7 @@ again: remove_next = 1 + (end > next->vm_end); vma_gap_update(vma); if (end_changed) { if (!next) - mm->highest_vm_end = end; + mm->highest_vm_end = vm_end_gap(vma); else if (!adjust_next) vma_gap_update(next); } @@ -925,7 +939,7 @@ again: remove_next = 1 + (end > next->vm_end); else if (next) vma_gap_update(next); else - mm->highest_vm_end = end; + VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); } if (insert && file) uprobe_mmap(insert); @@ -1771,7 +1785,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info) while (true) { /* Visit left subtree if it looks promising */ - gap_end = vma->vm_start; + gap_end = vm_start_gap(vma); if (gap_end >= low_limit && vma->vm_rb.rb_left) { struct vm_area_struct *left = rb_entry(vma->vm_rb.rb_left, @@ -1782,12 +1796,13 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info) } } - gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; check_current: /* Check if current node has a suitable gap */ if (gap_start > high_limit) return -ENOMEM; - if (gap_end >= low_limit && gap_end - gap_start >= length) + if (gap_end >= low_limit && + gap_end > gap_start && gap_end - gap_start >= length) goto found; /* Visit right subtree if it looks promising */ @@ -1809,8 +1824,8 @@ check_current: vma = rb_entry(rb_parent(prev), struct vm_area_struct, vm_rb); if (prev == vma->vm_rb.rb_left) { - gap_start = vma->vm_prev->vm_end; - gap_end = vma->vm_start; + gap_start = vm_end_gap(vma->vm_prev); + gap_end = vm_start_gap(vma); goto check_current; } } @@ -1874,7 +1889,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) while (true) { /* Visit right subtree if it looks promising */ - gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; if (gap_start <= high_limit && vma->vm_rb.rb_right) { struct vm_area_struct *right = rb_entry(vma->vm_rb.rb_right, @@ -1887,10 +1902,11 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) check_current: /* Check if current node has a suitable gap */ - gap_end = vma->vm_start; + gap_end = vm_start_gap(vma); if (gap_end < low_limit) return -ENOMEM; - if (gap_start <= high_limit && gap_end - gap_start >= length) + if (gap_start <= high_limit && + gap_end > gap_start && gap_end - gap_start >= length) goto found; /* Visit left subtree if it looks promising */ @@ -1913,7 +1929,7 @@ check_current: struct vm_area_struct, vm_rb); if (prev == vma->vm_rb.rb_right) { gap_start = vma->vm_prev ? - vma->vm_prev->vm_end : 0; + vm_end_gap(vma->vm_prev) : 0; goto check_current; } } @@ -1951,7 +1967,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info; if (len > TASK_SIZE - mmap_min_addr) @@ -1962,9 +1978,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); + vma = find_vma_prev(mm, addr, &prev); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) return addr; } @@ -1987,7 +2004,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, const unsigned long flags) { - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; unsigned long addr = addr0; struct vm_unmapped_area_info info; @@ -2002,9 +2019,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); + vma = find_vma_prev(mm, addr, &prev); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) return addr; } @@ -2129,21 +2147,19 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, * update accounting. This is shared with both the * grow-up and grow-down cases. */ -static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) +static int acct_stack_growth(struct vm_area_struct *vma, + unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; struct rlimit *rlim = current->signal->rlim; - unsigned long new_start, actual_size; + unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, grow)) return -ENOMEM; /* Stack limit test */ - actual_size = size; - if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) - actual_size -= PAGE_SIZE; - if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) + if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) return -ENOMEM; /* mlock limit tests */ @@ -2181,16 +2197,32 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; + unsigned long gap_addr; int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* Guard against wrapping around to address 0. */ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else + /* Guard against exceeding limits of the address space. */ + address &= PAGE_MASK; + if (address >= TASK_SIZE) return -ENOMEM; + address += PAGE_SIZE; + + /* Enforce stack_guard_gap */ + gap_addr = address + stack_guard_gap; + + /* Guard against overflow */ + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + + next = vma->vm_next; + if (next && next->vm_start < gap_addr) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) @@ -2236,7 +2268,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_next) vma_gap_update(vma->vm_next); else - mm->highest_vm_end = address; + mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2257,6 +2289,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *prev; + unsigned long gap_addr; int error; address &= PAGE_MASK; @@ -2264,6 +2298,17 @@ int expand_downwards(struct vm_area_struct *vma, if (error) return error; + /* Enforce stack_guard_gap */ + gap_addr = address - stack_guard_gap; + if (gap_addr > address) + return -ENOMEM; + prev = vma->vm_prev; + if (prev && prev->vm_end > gap_addr) { + if (!(prev->vm_flags & VM_GROWSDOWN)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } + /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; @@ -2319,28 +2364,25 @@ int expand_downwards(struct vm_area_struct *vma, return error; } -/* - * Note how expand_stack() refuses to expand the stack all the way to - * abut the next virtual mapping, *unless* that mapping itself is also - * a stack mapping. We want to leave room for a guard page, after all - * (the guard page itself is not added here, that is done by the - * actual page faulting logic) - * - * This matches the behavior of the guard page logic (see mm/memory.c: - * check_stack_guard_page()), which only allows the guard page to be - * removed under these circumstances. - */ +/* enforced gap between the expanding stack and other mappings. */ +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; + +static int __init cmdline_parse_stack_guard_gap(char *p) +{ + unsigned long val; + char *endptr; + + val = simple_strtoul(p, &endptr, 10); + if (!*endptr) + stack_guard_gap = val << PAGE_SHIFT; + + return 0; +} +__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + #ifdef CONFIG_STACK_GROWSUP int expand_stack(struct vm_area_struct *vma, unsigned long address) { - struct vm_area_struct *next; - - address &= PAGE_MASK; - next = vma->vm_next; - if (next && next->vm_start == address + PAGE_SIZE) { - if (!(next->vm_flags & VM_GROWSUP)) - return -ENOMEM; - } return expand_upwards(vma, address); } @@ -2362,14 +2404,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) #else int expand_stack(struct vm_area_struct *vma, unsigned long address) { - struct vm_area_struct *prev; - - address &= PAGE_MASK; - prev = vma->vm_prev; - if (prev && prev->vm_end == address) { - if (!(prev->vm_flags & VM_GROWSDOWN)) - return -ENOMEM; - } return expand_downwards(vma, address); } @@ -2467,7 +2501,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_prev = prev; vma_gap_update(vma); } else - mm->highest_vm_end = prev ? prev->vm_end : 0; + mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; tail_vma->vm_next = NULL; /* Kill the cache */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 170c1486e5c9..4fbb23c1cba7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -274,6 +274,26 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void reset_deferred_meminit(pg_data_t *pgdat) { + unsigned long max_initialise; + unsigned long reserved_lowmem; + + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); + + /* + * Compensate the all the memblock reservations (e.g. crash kernel) + * from the initial estimation to make sure we will initialize enough + * memory to boot. + */ + reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, + pgdat->node_start_pfn + max_initialise); + max_initialise += reserved_lowmem; + + pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -307,10 +327,9 @@ static inline bool update_defer_init(pg_data_t *pgdat, /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* Initialise at least 2G of the highest zone */ (*nr_initialised)++; - if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + if ((*nr_initialised > pgdat->static_init_size) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -5456,7 +5475,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); - reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5475,6 +5493,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif + reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } diff --git a/mm/slub.c b/mm/slub.c index 64bc4e973789..a5f6c6d107e9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5357,6 +5357,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) char mbuf[64]; char *buf; struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + ssize_t len; if (!attr || !attr->store || !attr->show) continue; @@ -5381,8 +5382,9 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) buf = buffer; } - attr->show(root_cache, buf); - attr->store(s, buf, strlen(buf)); + len = attr->show(root_cache, buf); + if (len > 0) + attr->store(s, buf, len); } if (buffer) diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index b5f7f24b8dd1..40dd0f9b00d6 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -48,6 +48,9 @@ static int swap_cgroup_prepare(int type) if (!page) goto not_enough_page; ctrl->map[idx] = page; + + if (!(idx % SWAP_CLUSTER_MAX)) + cond_resched(); } return 0; not_enough_page: diff --git a/mm/truncate.c b/mm/truncate.c index 30c2f059a488..8ca20a98c327 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -799,7 +799,7 @@ EXPORT_SYMBOL(truncate_setsize); */ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { - int bsize = 1 << inode->i_blkbits; + int bsize = i_blocksize(inode); loff_t rounded_from; struct page *page; pgoff_t index; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b219ccc015d6..b24be2a7f456 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ -#define VM_LAZY_FREE 0x01 -#define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); +static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -414,6 +413,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); + might_sleep(); + va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) @@ -628,6 +629,13 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* + * Serialize vmap purging. There is no actual criticial section protected + * by this look, but we want to avoid concurrent calls for performance + * reasons and to make the pcpu_get_vm_areas more deterministic. + */ +static DEFINE_MUTEX(vmap_purge_lock); + /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); @@ -642,65 +650,40 @@ void set_iounmap_nonlazy(void) /* * Purges all lazily-freed vmap areas. - * - * If sync is 0 then don't purge if there is already a purge in progress. - * If force_flush is 1, then flush kernel TLBs between *start and *end even - * if we found no lazy vmap areas to unmap (callers can use this to optimise - * their own TLB flushing). - * Returns with *start = min(*start, lowest purged address) - * *end = max(*end, highest purged address) */ -static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, - int sync, int force_flush) +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { - static DEFINE_SPINLOCK(purge_lock); - LIST_HEAD(valist); + struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - int nr = 0; - - /* - * If sync is 0 but force_flush is 1, we'll go sync anyway but callers - * should not expect such behaviour. This just simplifies locking for - * the case that isn't actually used at the moment anyway. - */ - if (!sync && !force_flush) { - if (!spin_trylock(&purge_lock)) - return; - } else - spin_lock(&purge_lock); + bool do_free = false; - if (sync) - purge_fragmented_blocks_allcpus(); + lockdep_assert_held(&vmap_purge_lock); - rcu_read_lock(); - list_for_each_entry_rcu(va, &vmap_area_list, list) { - if (va->flags & VM_LAZY_FREE) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - list_add_tail(&va->purge_list, &valist); - va->flags |= VM_LAZY_FREEING; - va->flags &= ~VM_LAZY_FREE; - } + valist = llist_del_all(&vmap_purge_list); + llist_for_each_entry(va, valist, purge_list) { + if (va->va_start < start) + start = va->va_start; + if (va->va_end > end) + end = va->va_end; + do_free = true; } - rcu_read_unlock(); - if (nr) - atomic_sub(nr, &vmap_lazy_nr); + if (!do_free) + return false; - if (nr || force_flush) - flush_tlb_kernel_range(*start, *end); + flush_tlb_kernel_range(start, end); - if (nr) { - spin_lock(&vmap_area_lock); - list_for_each_entry_safe(va, n_va, &valist, purge_list) - __free_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_lock(&vmap_area_lock); + llist_for_each_entry_safe(va, n_va, valist, purge_list) { + int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + + __free_vmap_area(va); + atomic_sub(nr, &vmap_lazy_nr); + cond_resched_lock(&vmap_area_lock); } - spin_unlock(&purge_lock); + spin_unlock(&vmap_area_lock); + return true; } /* @@ -709,9 +692,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, */ static void try_purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 0, 0); + if (mutex_trylock(&vmap_purge_lock)) { + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); + } } /* @@ -719,9 +703,10 @@ static void try_purge_vmap_area_lazy(void) */ static void purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 1, 0); + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); } /* @@ -731,20 +716,16 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - va->flags |= VM_LAZY_FREE; - atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); - if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) - try_purge_vmap_area_lazy(); -} + int nr_lazy; -/* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. - */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) -{ - unmap_vmap_area(va); - free_vmap_area_noflush(va); + nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, + &vmap_lazy_nr); + + /* After this point, we may free va at any time */ + llist_add(&va->purge_list, &vmap_purge_list); + + if (unlikely(nr_lazy > lazy_max_pages())) + try_purge_vmap_area_lazy(); } /* @@ -753,7 +734,8 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - free_unmap_vmap_area_noflush(va); + unmap_vmap_area(va); + free_vmap_area_noflush(va); } static struct vmap_area *find_vmap_area(unsigned long addr) @@ -767,16 +749,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) return va; } -static void free_unmap_vmap_area_addr(unsigned long addr) -{ - struct vmap_area *va; - - va = find_vmap_area(addr); - BUG_ON(!va); - free_unmap_vmap_area(va); -} - - /*** Per cpu kva allocator ***/ /* @@ -1097,6 +1069,8 @@ void vm_unmap_aliases(void) if (unlikely(!vmap_initialized)) return; + might_sleep(); + for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; @@ -1121,7 +1095,11 @@ void vm_unmap_aliases(void) rcu_read_unlock(); } - __purge_vmap_area_lazy(&start, &end, 1, flush); + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + if (!__purge_vmap_area_lazy(start, end) && flush) + flush_tlb_kernel_range(start, end); + mutex_unlock(&vmap_purge_lock); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); @@ -1134,7 +1112,9 @@ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = count << PAGE_SHIFT; unsigned long addr = (unsigned long)mem; + struct vmap_area *va; + might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); @@ -1143,10 +1123,14 @@ void vm_unmap_ram(const void *mem, unsigned int count) debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) + if (likely(count <= VMAP_MAX_ALLOC)) { vb_free(mem, size); - else - free_unmap_vmap_area_addr(addr); + return; + } + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1520,6 +1504,8 @@ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; + might_sleep(); + va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; @@ -1578,7 +1564,39 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); return; } - + +static inline void __vfree_deferred(const void *addr) +{ + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * nother cpu's list. schedule_work() should be fine with this too. + */ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} + +/** + * vfree_atomic - release memory allocated by vmalloc() + * @addr: memory base address + * + * This one is just like vfree() but can be called in any atomic context + * except NMIs. + */ +void vfree_atomic(const void *addr) +{ + BUG_ON(in_nmi()); + + kmemleak_free(addr); + + if (!addr) + return; + __vfree_deferred(addr); +} + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1601,11 +1619,9 @@ void vfree(const void *addr) if (!addr) return; - if (unlikely(in_interrupt())) { - struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); - } else + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); diff --git a/mm/vmstat.c b/mm/vmstat.c index b8f2eda3381d..d6b4817c4416 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -906,6 +906,7 @@ static void frag_stop(struct seq_file *m, void *arg) /* Walk all the zones in a node and print using a callback */ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + bool nolock, void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { struct zone *zone; @@ -916,9 +917,11 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, if (!populated_zone(zone)) continue; - spin_lock_irqsave(&zone->lock, flags); + if (!nolock) + spin_lock_irqsave(&zone->lock, flags); print(m, pgdat, zone); - spin_unlock_irqrestore(&zone->lock, flags); + if (!nolock) + spin_unlock_irqrestore(&zone->lock, flags); } } #endif @@ -954,7 +957,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, static int frag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, frag_show_print); + walk_zones_in_node(m, pgdat, false, frag_show_print); return 0; } @@ -995,7 +998,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); + walk_zones_in_node(m, pgdat, false, pagetypeinfo_showfree_print); return 0; } @@ -1044,7 +1047,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); + walk_zones_in_node(m, pgdat, false, pagetypeinfo_showblockcount_print); return 0; } @@ -1088,7 +1091,11 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, page = pfn_to_page(pfn); if (PageBuddy(page)) { - pfn += (1UL << page_order(page)) - 1; + unsigned long freepage_order; + + freepage_order = page_order_unsafe(page); + if (freepage_order < MAX_ORDER) + pfn += (1UL << freepage_order) - 1; continue; } @@ -1143,7 +1150,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); #endif /* CONFIG_PAGE_OWNER */ } @@ -1276,7 +1283,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, static int zoneinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, zoneinfo_show_print); + walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); return 0; } @@ -1635,7 +1642,7 @@ static int unusable_show(struct seq_file *m, void *arg) if (!node_state(pgdat->node_id, N_MEMORY)) return 0; - walk_zones_in_node(m, pgdat, unusable_show_print); + walk_zones_in_node(m, pgdat, false, unusable_show_print); return 0; } @@ -1687,7 +1694,7 @@ static int extfrag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, extfrag_show_print); + walk_zones_in_node(m, pgdat, false, extfrag_show_print); return 0; } |
