summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/gup.c5
-rw-r--r--mm/kasan/report.c3
-rw-r--r--mm/memblock.c24
-rw-r--r--mm/memory-failure.c13
-rw-r--r--mm/memory.c38
-rw-r--r--mm/mlock.c5
-rw-r--r--mm/mmap.c160
-rw-r--r--mm/page_alloc.c25
-rw-r--r--mm/slub.c6
-rw-r--r--mm/swap_cgroup.c3
-rw-r--r--mm/truncate.c2
-rw-r--r--mm/vmalloc.c196
-rw-r--r--mm/vmstat.c27
13 files changed, 286 insertions, 221 deletions
diff --git a/mm/gup.c b/mm/gup.c
index 4b0b7e7d1136..b599526db9f7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -312,11 +312,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
return -ENOENT;
- /* For mm_populate(), just skip the stack guard page. */
- if ((*flags & FOLL_POPULATE) &&
- (stack_guard_page_start(vma, address) ||
- stack_guard_page_end(vma, address + PAGE_SIZE)))
- return -ENOENT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (nonblocking)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 12f222d0224b..b4e31f78ae69 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -13,6 +13,7 @@
*
*/
+#include <linux/ftrace.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/printk.h>
@@ -251,6 +252,8 @@ void kasan_report(unsigned long addr, size_t size,
if (likely(!kasan_report_enabled()))
return;
+ disable_trace_on_warning();
+
info.access_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
diff --git a/mm/memblock.c b/mm/memblock.c
index 89e74fa82290..351a4840a407 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1688,6 +1688,30 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
}
}
+extern unsigned long __init_memblock
+memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
+{
+ struct memblock_type *type = &memblock.reserved;
+ unsigned long size = 0;
+ int idx;
+
+ for (idx = 0; idx < type->cnt; idx++) {
+ struct memblock_region *rgn = &type->regions[idx];
+ phys_addr_t start, end;
+
+ if (rgn->base + rgn->size < start_addr)
+ continue;
+ if (rgn->base > end_addr)
+ continue;
+
+ start = rgn->base;
+ end = start + rgn->size;
+ size += end - start;
+ }
+
+ return size;
+}
+
void __init_memblock __memblock_dump_all(void)
{
pr_info("MEMBLOCK configuration:\n");
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b3becd9941e9..14c1c5cb3ccf 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1208,7 +1208,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
* correctly, we save a copy of the page flags at this time.
*/
- page_flags = p->flags;
+ if (PageHuge(p))
+ page_flags = hpage->flags;
+ else
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1619,12 +1622,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (ret) {
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
- /*
- * We know that soft_offline_huge_page() tries to migrate
- * only one hugepage pointed to by hpage, so we need not
- * run through the pagelist here.
- */
- putback_active_hugepage(hpage);
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
if (ret > 0)
ret = -EIO;
} else {
diff --git a/mm/memory.c b/mm/memory.c
index b536e3d60fc7..8689df9b09d5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2663,40 +2663,6 @@ out_release:
}
/*
- * This is like a special single-page "expand_{down|up}wards()",
- * except we must first make sure that 'address{-|+}PAGE_SIZE'
- * doesn't hit another vma.
- */
-static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
-{
- address &= PAGE_MASK;
- if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
- struct vm_area_struct *prev = vma->vm_prev;
-
- /*
- * Is there a mapping abutting this one below?
- *
- * That's only ok if it's the same stack mapping
- * that has gotten split..
- */
- if (prev && prev->vm_end == address)
- return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
-
- return expand_downwards(vma, address - PAGE_SIZE);
- }
- if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
- struct vm_area_struct *next = vma->vm_next;
-
- /* As VM_GROWSDOWN but s/below/above/ */
- if (next && next->vm_start == address + PAGE_SIZE)
- return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
-
- return expand_upwards(vma, address + PAGE_SIZE);
- }
- return 0;
-}
-
-/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2716,10 +2682,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
- /* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, address) < 0)
- return VM_FAULT_SIGSEGV;
-
/* Use the zero-page for reads */
if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
diff --git a/mm/mlock.c b/mm/mlock.c
index d843bc9d32dd..206e86b98a03 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -277,7 +277,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
{
int i;
int nr = pagevec_count(pvec);
- int delta_munlocked;
+ int delta_munlocked = -nr;
struct pagevec pvec_putback;
int pgrescued = 0;
@@ -297,6 +297,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
continue;
else
__munlock_isolation_failed(page);
+ } else {
+ delta_munlocked++;
}
/*
@@ -308,7 +310,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pagevec_add(&pvec_putback, pvec->pages[i]);
pvec->pages[i] = NULL;
}
- delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
spin_unlock_irq(&zone->lru_lock);
diff --git a/mm/mmap.c b/mm/mmap.c
index ebc9d1923c3d..092729c2cb72 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -304,6 +304,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
unsigned long retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *next;
unsigned long min_brk;
bool populate;
@@ -348,7 +349,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
}
/* Check against existing mmap mappings. */
- if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
+ next = find_vma(mm, oldbrk);
+ if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
/* Ok, looks good - let it rip. */
@@ -371,10 +373,22 @@ out:
static long vma_compute_subtree_gap(struct vm_area_struct *vma)
{
- unsigned long max, subtree_gap;
- max = vma->vm_start;
- if (vma->vm_prev)
- max -= vma->vm_prev->vm_end;
+ unsigned long max, prev_end, subtree_gap;
+
+ /*
+ * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
+ * allow two stack_guard_gaps between them here, and when choosing
+ * an unmapped area; whereas when expanding we only require one.
+ * That's a little inconsistent, but keeps the code here simpler.
+ */
+ max = vm_start_gap(vma);
+ if (vma->vm_prev) {
+ prev_end = vm_end_gap(vma->vm_prev);
+ if (max > prev_end)
+ max -= prev_end;
+ else
+ max = 0;
+ }
if (vma->vm_rb.rb_left) {
subtree_gap = rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -467,7 +481,7 @@ static void validate_mm(struct mm_struct *mm)
anon_vma_unlock_read(anon_vma);
}
- highest_address = vma->vm_end;
+ highest_address = vm_end_gap(vma);
vma = vma->vm_next;
i++;
}
@@ -636,7 +650,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- mm->highest_vm_end = vma->vm_end;
+ mm->highest_vm_end = vm_end_gap(vma);
/*
* vma->vm_prev wasn't known when we followed the rbtree to find the
@@ -882,7 +896,7 @@ again: remove_next = 1 + (end > next->vm_end);
vma_gap_update(vma);
if (end_changed) {
if (!next)
- mm->highest_vm_end = end;
+ mm->highest_vm_end = vm_end_gap(vma);
else if (!adjust_next)
vma_gap_update(next);
}
@@ -925,7 +939,7 @@ again: remove_next = 1 + (end > next->vm_end);
else if (next)
vma_gap_update(next);
else
- mm->highest_vm_end = end;
+ VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
}
if (insert && file)
uprobe_mmap(insert);
@@ -1771,7 +1785,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
while (true) {
/* Visit left subtree if it looks promising */
- gap_end = vma->vm_start;
+ gap_end = vm_start_gap(vma);
if (gap_end >= low_limit && vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,
@@ -1782,12 +1796,13 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
}
}
- gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
return -ENOMEM;
- if (gap_end >= low_limit && gap_end - gap_start >= length)
+ if (gap_end >= low_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit right subtree if it looks promising */
@@ -1809,8 +1824,8 @@ check_current:
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
- gap_start = vma->vm_prev->vm_end;
- gap_end = vma->vm_start;
+ gap_start = vm_end_gap(vma->vm_prev);
+ gap_end = vm_start_gap(vma);
goto check_current;
}
}
@@ -1874,7 +1889,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
while (true) {
/* Visit right subtree if it looks promising */
- gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
if (gap_start <= high_limit && vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
@@ -1887,10 +1902,11 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
check_current:
/* Check if current node has a suitable gap */
- gap_end = vma->vm_start;
+ gap_end = vm_start_gap(vma);
if (gap_end < low_limit)
return -ENOMEM;
- if (gap_start <= high_limit && gap_end - gap_start >= length)
+ if (gap_start <= high_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit left subtree if it looks promising */
@@ -1913,7 +1929,7 @@ check_current:
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_right) {
gap_start = vma->vm_prev ?
- vma->vm_prev->vm_end : 0;
+ vm_end_gap(vma->vm_prev) : 0;
goto check_current;
}
}
@@ -1951,7 +1967,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
struct vm_unmapped_area_info info;
if (len > TASK_SIZE - mmap_min_addr)
@@ -1962,9 +1978,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma_prev(mm, addr, &prev);
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
return addr;
}
@@ -1987,7 +2004,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
const unsigned long len, const unsigned long pgoff,
const unsigned long flags)
{
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
@@ -2002,9 +2019,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
/* requesting a specific address */
if (addr) {
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma_prev(mm, addr, &prev);
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
return addr;
}
@@ -2129,21 +2147,19 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
-static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma,
+ unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
- unsigned long new_start, actual_size;
+ unsigned long new_start;
/* address space limit tests */
if (!may_expand_vm(mm, grow))
return -ENOMEM;
/* Stack limit test */
- actual_size = size;
- if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
- actual_size -= PAGE_SIZE;
- if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -2181,16 +2197,32 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+ unsigned long gap_addr;
int error = 0;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
- /* Guard against wrapping around to address 0. */
- if (address < PAGE_ALIGN(address+4))
- address = PAGE_ALIGN(address+4);
- else
+ /* Guard against exceeding limits of the address space. */
+ address &= PAGE_MASK;
+ if (address >= TASK_SIZE)
return -ENOMEM;
+ address += PAGE_SIZE;
+
+ /* Enforce stack_guard_gap */
+ gap_addr = address + stack_guard_gap;
+
+ /* Guard against overflow */
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+ next = vma->vm_next;
+ if (next && next->vm_start < gap_addr) {
+ if (!(next->vm_flags & VM_GROWSUP))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
@@ -2236,7 +2268,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- mm->highest_vm_end = address;
+ mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2257,6 +2289,8 @@ int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *prev;
+ unsigned long gap_addr;
int error;
address &= PAGE_MASK;
@@ -2264,6 +2298,17 @@ int expand_downwards(struct vm_area_struct *vma,
if (error)
return error;
+ /* Enforce stack_guard_gap */
+ gap_addr = address - stack_guard_gap;
+ if (gap_addr > address)
+ return -ENOMEM;
+ prev = vma->vm_prev;
+ if (prev && prev->vm_end > gap_addr) {
+ if (!(prev->vm_flags & VM_GROWSDOWN))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
+
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
@@ -2319,28 +2364,25 @@ int expand_downwards(struct vm_area_struct *vma,
return error;
}
-/*
- * Note how expand_stack() refuses to expand the stack all the way to
- * abut the next virtual mapping, *unless* that mapping itself is also
- * a stack mapping. We want to leave room for a guard page, after all
- * (the guard page itself is not added here, that is done by the
- * actual page faulting logic)
- *
- * This matches the behavior of the guard page logic (see mm/memory.c:
- * check_stack_guard_page()), which only allows the guard page to be
- * removed under these circumstances.
- */
+/* enforced gap between the expanding stack and other mappings. */
+unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
+
+static int __init cmdline_parse_stack_guard_gap(char *p)
+{
+ unsigned long val;
+ char *endptr;
+
+ val = simple_strtoul(p, &endptr, 10);
+ if (!*endptr)
+ stack_guard_gap = val << PAGE_SHIFT;
+
+ return 0;
+}
+__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
#ifdef CONFIG_STACK_GROWSUP
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct *next;
-
- address &= PAGE_MASK;
- next = vma->vm_next;
- if (next && next->vm_start == address + PAGE_SIZE) {
- if (!(next->vm_flags & VM_GROWSUP))
- return -ENOMEM;
- }
return expand_upwards(vma, address);
}
@@ -2362,14 +2404,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
#else
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct *prev;
-
- address &= PAGE_MASK;
- prev = vma->vm_prev;
- if (prev && prev->vm_end == address) {
- if (!(prev->vm_flags & VM_GROWSDOWN))
- return -ENOMEM;
- }
return expand_downwards(vma, address);
}
@@ -2467,7 +2501,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma->vm_prev = prev;
vma_gap_update(vma);
} else
- mm->highest_vm_end = prev ? prev->vm_end : 0;
+ mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
tail_vma->vm_next = NULL;
/* Kill the cache */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 170c1486e5c9..4fbb23c1cba7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -274,6 +274,26 @@ int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
+ unsigned long max_initialise;
+ unsigned long reserved_lowmem;
+
+ /*
+ * Initialise at least 2G of a node but also take into account that
+ * two large system hashes that can take up 1GB for 0.25TB/node.
+ */
+ max_initialise = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
+
+ /*
+ * Compensate the all the memblock reservations (e.g. crash kernel)
+ * from the initial estimation to make sure we will initialize enough
+ * memory to boot.
+ */
+ reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
+ pgdat->node_start_pfn + max_initialise);
+ max_initialise += reserved_lowmem;
+
+ pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
pgdat->first_deferred_pfn = ULONG_MAX;
}
@@ -307,10 +327,9 @@ static inline bool update_defer_init(pg_data_t *pgdat,
/* Always populate low zones for address-contrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
-
/* Initialise at least 2G of the highest zone */
(*nr_initialised)++;
- if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+ if ((*nr_initialised > pgdat->static_init_size) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -5456,7 +5475,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
- reset_deferred_meminit(pgdat);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5475,6 +5493,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
+ reset_deferred_meminit(pgdat);
free_area_init_core(pgdat);
}
diff --git a/mm/slub.c b/mm/slub.c
index 64bc4e973789..a5f6c6d107e9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5357,6 +5357,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
char mbuf[64];
char *buf;
struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
+ ssize_t len;
if (!attr || !attr->store || !attr->show)
continue;
@@ -5381,8 +5382,9 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
buf = buffer;
}
- attr->show(root_cache, buf);
- attr->store(s, buf, strlen(buf));
+ len = attr->show(root_cache, buf);
+ if (len > 0)
+ attr->store(s, buf, len);
}
if (buffer)
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index b5f7f24b8dd1..40dd0f9b00d6 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -48,6 +48,9 @@ static int swap_cgroup_prepare(int type)
if (!page)
goto not_enough_page;
ctrl->map[idx] = page;
+
+ if (!(idx % SWAP_CLUSTER_MAX))
+ cond_resched();
}
return 0;
not_enough_page:
diff --git a/mm/truncate.c b/mm/truncate.c
index 30c2f059a488..8ca20a98c327 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -799,7 +799,7 @@ EXPORT_SYMBOL(truncate_setsize);
*/
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
- int bsize = 1 << inode->i_blkbits;
+ int bsize = i_blocksize(inode);
loff_t rounded_from;
struct page *page;
pgoff_t index;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b219ccc015d6..b24be2a7f456 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
-#define VM_LAZY_FREE 0x01
-#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
+static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
/* The vmap cache globals are protected by vmap_area_lock */
@@ -414,6 +413,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
+ might_sleep();
+
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!va))
@@ -628,6 +629,13 @@ static unsigned long lazy_max_pages(void)
static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/*
+ * Serialize vmap purging. There is no actual criticial section protected
+ * by this look, but we want to avoid concurrent calls for performance
+ * reasons and to make the pcpu_get_vm_areas more deterministic.
+ */
+static DEFINE_MUTEX(vmap_purge_lock);
+
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
@@ -642,65 +650,40 @@ void set_iounmap_nonlazy(void)
/*
* Purges all lazily-freed vmap areas.
- *
- * If sync is 0 then don't purge if there is already a purge in progress.
- * If force_flush is 1, then flush kernel TLBs between *start and *end even
- * if we found no lazy vmap areas to unmap (callers can use this to optimise
- * their own TLB flushing).
- * Returns with *start = min(*start, lowest purged address)
- * *end = max(*end, highest purged address)
*/
-static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
- int sync, int force_flush)
+static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
- static DEFINE_SPINLOCK(purge_lock);
- LIST_HEAD(valist);
+ struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
- int nr = 0;
-
- /*
- * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
- * should not expect such behaviour. This just simplifies locking for
- * the case that isn't actually used at the moment anyway.
- */
- if (!sync && !force_flush) {
- if (!spin_trylock(&purge_lock))
- return;
- } else
- spin_lock(&purge_lock);
+ bool do_free = false;
- if (sync)
- purge_fragmented_blocks_allcpus();
+ lockdep_assert_held(&vmap_purge_lock);
- rcu_read_lock();
- list_for_each_entry_rcu(va, &vmap_area_list, list) {
- if (va->flags & VM_LAZY_FREE) {
- if (va->va_start < *start)
- *start = va->va_start;
- if (va->va_end > *end)
- *end = va->va_end;
- nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
- list_add_tail(&va->purge_list, &valist);
- va->flags |= VM_LAZY_FREEING;
- va->flags &= ~VM_LAZY_FREE;
- }
+ valist = llist_del_all(&vmap_purge_list);
+ llist_for_each_entry(va, valist, purge_list) {
+ if (va->va_start < start)
+ start = va->va_start;
+ if (va->va_end > end)
+ end = va->va_end;
+ do_free = true;
}
- rcu_read_unlock();
- if (nr)
- atomic_sub(nr, &vmap_lazy_nr);
+ if (!do_free)
+ return false;
- if (nr || force_flush)
- flush_tlb_kernel_range(*start, *end);
+ flush_tlb_kernel_range(start, end);
- if (nr) {
- spin_lock(&vmap_area_lock);
- list_for_each_entry_safe(va, n_va, &valist, purge_list)
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ spin_lock(&vmap_area_lock);
+ llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+ int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+
+ __free_vmap_area(va);
+ atomic_sub(nr, &vmap_lazy_nr);
+ cond_resched_lock(&vmap_area_lock);
}
- spin_unlock(&purge_lock);
+ spin_unlock(&vmap_area_lock);
+ return true;
}
/*
@@ -709,9 +692,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
*/
static void try_purge_vmap_area_lazy(void)
{
- unsigned long start = ULONG_MAX, end = 0;
-
- __purge_vmap_area_lazy(&start, &end, 0, 0);
+ if (mutex_trylock(&vmap_purge_lock)) {
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
+ }
}
/*
@@ -719,9 +703,10 @@ static void try_purge_vmap_area_lazy(void)
*/
static void purge_vmap_area_lazy(void)
{
- unsigned long start = ULONG_MAX, end = 0;
-
- __purge_vmap_area_lazy(&start, &end, 1, 0);
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
}
/*
@@ -731,20 +716,16 @@ static void purge_vmap_area_lazy(void)
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
- va->flags |= VM_LAZY_FREE;
- atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
- if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
- try_purge_vmap_area_lazy();
-}
+ int nr_lazy;
-/*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
- */
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
-{
- unmap_vmap_area(va);
- free_vmap_area_noflush(va);
+ nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
+ &vmap_lazy_nr);
+
+ /* After this point, we may free va at any time */
+ llist_add(&va->purge_list, &vmap_purge_list);
+
+ if (unlikely(nr_lazy > lazy_max_pages()))
+ try_purge_vmap_area_lazy();
}
/*
@@ -753,7 +734,8 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- free_unmap_vmap_area_noflush(va);
+ unmap_vmap_area(va);
+ free_vmap_area_noflush(va);
}
static struct vmap_area *find_vmap_area(unsigned long addr)
@@ -767,16 +749,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
return va;
}
-static void free_unmap_vmap_area_addr(unsigned long addr)
-{
- struct vmap_area *va;
-
- va = find_vmap_area(addr);
- BUG_ON(!va);
- free_unmap_vmap_area(va);
-}
-
-
/*** Per cpu kva allocator ***/
/*
@@ -1097,6 +1069,8 @@ void vm_unmap_aliases(void)
if (unlikely(!vmap_initialized))
return;
+ might_sleep();
+
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
@@ -1121,7 +1095,11 @@ void vm_unmap_aliases(void)
rcu_read_unlock();
}
- __purge_vmap_area_lazy(&start, &end, 1, flush);
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ if (!__purge_vmap_area_lazy(start, end) && flush)
+ flush_tlb_kernel_range(start, end);
+ mutex_unlock(&vmap_purge_lock);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -1134,7 +1112,9 @@ void vm_unmap_ram(const void *mem, unsigned int count)
{
unsigned long size = count << PAGE_SHIFT;
unsigned long addr = (unsigned long)mem;
+ struct vmap_area *va;
+ might_sleep();
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
@@ -1143,10 +1123,14 @@ void vm_unmap_ram(const void *mem, unsigned int count)
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
- if (likely(count <= VMAP_MAX_ALLOC))
+ if (likely(count <= VMAP_MAX_ALLOC)) {
vb_free(mem, size);
- else
- free_unmap_vmap_area_addr(addr);
+ return;
+ }
+
+ va = find_vmap_area(addr);
+ BUG_ON(!va);
+ free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
@@ -1520,6 +1504,8 @@ struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
+ might_sleep();
+
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;
@@ -1578,7 +1564,39 @@ static void __vunmap(const void *addr, int deallocate_pages)
kfree(area);
return;
}
-
+
+static inline void __vfree_deferred(const void *addr)
+{
+ /*
+ * Use raw_cpu_ptr() because this can be called from preemptible
+ * context. Preemption is absolutely fine here, because the llist_add()
+ * implementation is lockless, so it works even if we are adding to
+ * nother cpu's list. schedule_work() should be fine with this too.
+ */
+ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
+
+ if (llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
+}
+
+/**
+ * vfree_atomic - release memory allocated by vmalloc()
+ * @addr: memory base address
+ *
+ * This one is just like vfree() but can be called in any atomic context
+ * except NMIs.
+ */
+void vfree_atomic(const void *addr)
+{
+ BUG_ON(in_nmi());
+
+ kmemleak_free(addr);
+
+ if (!addr)
+ return;
+ __vfree_deferred(addr);
+}
+
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
@@ -1601,11 +1619,9 @@ void vfree(const void *addr)
if (!addr)
return;
- if (unlikely(in_interrupt())) {
- struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
- if (llist_add((struct llist_node *)addr, &p->list))
- schedule_work(&p->wq);
- } else
+ if (unlikely(in_interrupt()))
+ __vfree_deferred(addr);
+ else
__vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b8f2eda3381d..d6b4817c4416 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -906,6 +906,7 @@ static void frag_stop(struct seq_file *m, void *arg)
/* Walk all the zones in a node and print using a callback */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+ bool nolock,
void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
struct zone *zone;
@@ -916,9 +917,11 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
if (!populated_zone(zone))
continue;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!nolock)
+ spin_lock_irqsave(&zone->lock, flags);
print(m, pgdat, zone);
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (!nolock)
+ spin_unlock_irqrestore(&zone->lock, flags);
}
}
#endif
@@ -954,7 +957,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
static int frag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, frag_show_print);
+ walk_zones_in_node(m, pgdat, false, frag_show_print);
return 0;
}
@@ -995,7 +998,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
seq_printf(m, "%6d ", order);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
+ walk_zones_in_node(m, pgdat, false, pagetypeinfo_showfree_print);
return 0;
}
@@ -1044,7 +1047,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
+ walk_zones_in_node(m, pgdat, false, pagetypeinfo_showblockcount_print);
return 0;
}
@@ -1088,7 +1091,11 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
page = pfn_to_page(pfn);
if (PageBuddy(page)) {
- pfn += (1UL << page_order(page)) - 1;
+ unsigned long freepage_order;
+
+ freepage_order = page_order_unsafe(page);
+ if (freepage_order < MAX_ORDER)
+ pfn += (1UL << freepage_order) - 1;
continue;
}
@@ -1143,7 +1150,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}
@@ -1276,7 +1283,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
static int zoneinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, zoneinfo_show_print);
+ walk_zones_in_node(m, pgdat, false, zoneinfo_show_print);
return 0;
}
@@ -1635,7 +1642,7 @@ static int unusable_show(struct seq_file *m, void *arg)
if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
- walk_zones_in_node(m, pgdat, unusable_show_print);
+ walk_zones_in_node(m, pgdat, false, unusable_show_print);
return 0;
}
@@ -1687,7 +1694,7 @@ static int extfrag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, extfrag_show_print);
+ walk_zones_in_node(m, pgdat, false, extfrag_show_print);
return 0;
}