From a4e25ff31103e7c9084904418cb95596e3e9d9cf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:32 -0700 Subject: slub: clean up code for kmem cgroup support to kmem_cache_free_bulk commit 376bf125ac781d32e202760ed7deb1ae4ed35d31 upstream. This change is primarily an attempt to make it easier to realize the optimizations the compiler performs in-case CONFIG_MEMCG_KMEM is not enabled. Performance wise, even when CONFIG_MEMCG_KMEM is compiled in, the overhead is zero. This is because, as long as no process have enabled kmem cgroups accounting, the assignment is replaced by asm-NOP operations. This is possible because memcg_kmem_enabled() uses a static_key_false() construct. It also helps readability as it avoid accessing the p[] array like: p[size - 1] which "expose" that the array is processed backwards inside helper function build_detached_freelist(). Lastly this also makes the code more robust, in error case like passing NULL pointers in the array. Which were previously handled before commit 033745189b1b ("slub: add missing kmem cgroup support to kmem_cache_free_bulk"). Fixes: 033745189b1b ("slub: add missing kmem cgroup support to kmem_cache_free_bulk") Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/slub.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 46997517406e..65d5f92d51d2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2819,6 +2819,7 @@ struct detached_freelist { void *tail; void *freelist; int cnt; + struct kmem_cache *s; }; /* @@ -2833,8 +2834,9 @@ struct detached_freelist { * synchronization primitive. Look ahead in the array is limited due * to performance reasons. */ -static int build_detached_freelist(struct kmem_cache *s, size_t size, - void **p, struct detached_freelist *df) +static inline +int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) { size_t first_skipped_index = 0; int lookahead = 3; @@ -2850,8 +2852,11 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, if (!object) return 0; + /* Support for memcg, compiler can optimize this out */ + df->s = cache_from_obj(s, object); + /* Start new detached freelist */ - set_freepointer(s, object, NULL); + set_freepointer(df->s, object, NULL); df->page = virt_to_head_page(object); df->tail = object; df->freelist = object; @@ -2866,7 +2871,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, /* df->page is always set at this point */ if (df->page == virt_to_head_page(object)) { /* Opportunity build freelist */ - set_freepointer(s, object, df->freelist); + set_freepointer(df->s, object, df->freelist); df->freelist = object; df->cnt++; p[size] = NULL; /* mark object processed */ @@ -2885,25 +2890,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, return first_skipped_index; } - /* Note that interrupts must be enabled when calling this function. */ -void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { if (WARN_ON(!size)) return; do { struct detached_freelist df; - struct kmem_cache *s; - - /* Support for memcg */ - s = cache_from_obj(orig_s, p[size - 1]); size = build_detached_freelist(s, size, p, &df); if (unlikely(!df.page)) continue; - slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); + slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); -- cgit v1.2.3 From 52526076a5a686906a0acc22d27530ecb9364d84 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2016 19:09:02 -0400 Subject: memcg: relocate charge moving from ->attach to ->post_attach commit 264a0ae164bc0e9144bebcd25ff030d067b1a878 upstream. Hello, So, this ended up a lot simpler than I originally expected. I tested it lightly and it seems to work fine. Petr, can you please test these two patches w/o the lru drain drop patch and see whether the problem is gone? Thanks. ------ 8< ------ If charge moving is used, memcg performs relabeling of the affected pages from its ->attach callback which is called under both cgroup_threadgroup_rwsem and thus can't create new kthreads. This is fragile as various operations may depend on workqueues making forward progress which relies on the ability to create new kthreads. There's no reason to perform charge moving from ->attach which is deep in the task migration path. Move it to ->post_attach which is called after the actual migration is finished and cgroup_threadgroup_rwsem is dropped. * move_charge_struct->mm is added and ->can_attach is now responsible for pinning and recording the target mm. mem_cgroup_clear_mc() is updated accordingly. This also simplifies mem_cgroup_move_task(). * mem_cgroup_move_task() is now called from ->post_attach instead of ->attach. Signed-off-by: Tejun Heo Cc: Johannes Weiner Acked-by: Michal Hocko Debugged-and-tested-by: Petr Mladek Reported-by: Cyril Hrubis Reported-by: Johannes Weiner Fixes: 1ed1328792ff ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem") Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc0bcc41d57f..6ba4dd988e2e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -196,6 +196,7 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { spinlock_t lock; /* for from, to */ + struct mm_struct *mm; struct mem_cgroup *from; struct mem_cgroup *to; unsigned long flags; @@ -4800,6 +4801,8 @@ static void __mem_cgroup_clear_mc(void) static void mem_cgroup_clear_mc(void) { + struct mm_struct *mm = mc.mm; + /* * we must clear moving_task before waking up waiters at the end of * task migration. @@ -4809,7 +4812,10 @@ static void mem_cgroup_clear_mc(void) spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; + mc.mm = NULL; spin_unlock(&mc.lock); + + mmput(mm); } static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4866,6 +4872,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) VM_BUG_ON(mc.moved_swap); spin_lock(&mc.lock); + mc.mm = mm; mc.from = from; mc.to = memcg; mc.flags = move_flags; @@ -4875,8 +4882,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); + } else { + mmput(mm); } - mmput(mm); return ret; } @@ -4985,11 +4993,11 @@ put: /* get_mctgt_type() gets the page */ return ret; } -static void mem_cgroup_move_charge(struct mm_struct *mm) +static void mem_cgroup_move_charge(void) { struct mm_walk mem_cgroup_move_charge_walk = { .pmd_entry = mem_cgroup_move_charge_pte_range, - .mm = mm, + .mm = mc.mm, }; lru_add_drain_all(); @@ -5001,7 +5009,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) atomic_inc(&mc.from->moving_account); synchronize_rcu(); retry: - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { /* * Someone who are holding the mmap_sem might be waiting in * waitq. So we cancel all extra charges, wake up all waiters, @@ -5018,23 +5026,16 @@ retry: * additional charge, the page walk just aborts. */ walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); - up_read(&mm->mmap_sem); + up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { - struct cgroup_subsys_state *css; - struct task_struct *p = cgroup_taskset_first(tset, &css); - struct mm_struct *mm = get_task_mm(p); - - if (mm) { - if (mc.to) - mem_cgroup_move_charge(mm); - mmput(mm); - } - if (mc.to) + if (mc.to) { + mem_cgroup_move_charge(); mem_cgroup_clear_mc(); + } } #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -5044,7 +5045,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { } #endif @@ -5258,7 +5259,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, - .attach = mem_cgroup_move_task, + .post_attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, -- cgit v1.2.3 From be591a683e3b4cc58466e08cd6b5e4a71c02b19a Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 28 Apr 2016 16:18:32 -0700 Subject: mm/huge_memory: replace VM_NO_THP VM_BUG_ON with actual VMA check commit 3486b85a29c1741db99d0c522211c82d2b7a56d0 upstream. Khugepaged detects own VMAs by checking vm_file and vm_ops but this way it cannot distinguish private /dev/zero mappings from other special mappings like /dev/hpet which has no vm_ops and popultes PTEs in mmap. This fixes false-positive VM_BUG_ON and prevents installing THP where they are not expected. Link: http://lkml.kernel.org/r/CACT4Y+ZmuZMV5CjSFOeXviwQdABAgT7T+StKfTqan9YDtgEi5g@mail.gmail.com Fixes: 78f11a255749 ("mm: thp: fix /dev/zero MAP_PRIVATE and vm_flags cleanups") Signed-off-by: Konstantin Khlebnikov Reported-by: Dmitry Vyukov Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Dmitry Vyukov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/huge_memory.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 62fe06bb7d04..530e6427f823 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2134,10 +2134,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, * page fault if needed. */ return 0; - if (vma->vm_ops) + if (vma->vm_ops || (vm_flags & VM_NO_THP)) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -2498,8 +2497,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) return false; if (is_vma_temporary_stack(vma)) return false; - VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); - return true; + return !(vma->vm_flags & VM_NO_THP); } static void collapse_huge_page(struct mm_struct *mm, -- cgit v1.2.3 From e513b90a9aef91e6399decb8e9592f2d75f7ebad Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 28 Apr 2016 16:18:35 -0700 Subject: numa: fix /proc//numa_maps for THP commit 28093f9f34cedeaea0f481c58446d9dac6dd620f upstream. In gather_pte_stats() a THP pmd is cast into a pte, which is wrong because the layouts may differ depending on the architecture. On s390 this will lead to inaccurate numa_maps accounting in /proc because of misguided pte_present() and pte_dirty() checks on the fake pte. On other architectures pte_present() and pte_dirty() may work by chance, but there may be an issue with direct-access (dax) mappings w/o underlying struct pages when HAVE_PTE_SPECIAL is set and THP is available. In vm_normal_page() the fake pte will be checked with pte_special() and because there is no "special" bit in a pmd, this will always return false and the VM_PFNMAP | VM_MIXEDMAP checking will be skipped. On dax mappings w/o struct pages, an invalid struct page pointer would then be returned that can crash the kernel. This patch fixes the numa_maps THP handling by introducing new "_pmd" variants of the can_gather_numa_stats() and vm_normal_page() functions. Signed-off-by: Gerald Schaefer Cc: Naoya Horiguchi Cc: "Kirill A . Shutemov" Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Jerome Marchand Cc: Johannes Weiner Cc: Dave Hansen Cc: Mel Gorman Cc: Dan Williams Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memory.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index b80bf4746b67..76dcee317714 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -797,6 +797,46 @@ out: return pfn_to_page(pfn); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + /* + * There is no pmd_special() but there may be special pmds, e.g. + * in a direct-access (dax) mapping, so let's just replicate the + * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + */ + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn)) + return NULL; + if (unlikely(pfn > highest_memmap_pfn)) + return NULL; + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} +#endif + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range -- cgit v1.2.3 From 87c855f150be9317b9b6ad82c1611ed8d577d986 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 28 Apr 2016 16:18:38 -0700 Subject: mm: vmscan: reclaim highmem zone if buffer_heads is over limit commit 7bf52fb891b64b8d61caf0b82060adb9db761aec upstream. We have been reclaimed highmem zone if buffer_heads is over limit but commit 6b4f7799c6a5 ("mm: vmscan: invoke slab shrinkers from shrink_zone()") changed the behavior so it doesn't reclaim highmem zone although buffer_heads is over the limit. This patch restores the logic. Fixes: 6b4f7799c6a5 ("mm: vmscan: invoke slab shrinkers from shrink_zone()") Signed-off-by: Minchan Kim Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2aec4241b42a..0c114e2b01d3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2534,7 +2534,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask |= __GFP_HIGHMEM; for_each_zone_zonelist_nodemask(zone, z, zonelist, - requested_highidx, sc->nodemask) { + gfp_zone(sc->gfp_mask), sc->nodemask) { enum zone_type classzone_idx; if (!populated_zone(zone)) -- cgit v1.2.3 From 36abe7272a248a7e47a4cec8d8ec9c76ef387bac Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 28 Apr 2016 16:18:44 -0700 Subject: mm/hwpoison: fix wrong num_poisoned_pages accounting commit d7e69488bd04de165667f6bc741c1c0ec6042ab9 upstream. Currently, migration code increses num_poisoned_pages on *failed* migration page as well as successfully migrated one at the trial of memory-failure. It will make the stat wrong. As well, it marks the page as PG_HWPoison even if the migration trial failed. It would mean we cannot recover the corrupted page using memory-failure facility. This patches fixes it. Signed-off-by: Minchan Kim Reported-by: Vlastimil Babka Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/migrate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 6d17e0ab42d4..bbeb0b71fcf4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -963,7 +963,13 @@ out: dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); /* Soft-offlined page shouldn't go through lru cache list */ - if (reason == MR_MEMORY_FAILURE) { + if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) { + /* + * With this release, we free successfully migrated + * page and set PG_HWPoison on just freed page + * intentionally. Although it's rather weird, it's how + * HWPoison flag works at the moment. + */ put_page(page); if (!test_set_page_hwpoison(page)) num_poisoned_pages_inc(); -- cgit v1.2.3