From d26914d11751b23ca2e8747725f2cae10c2f2c1b Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Thu, 3 Apr 2014 14:47:24 -0700
Subject: mm: optimize put_mems_allowed() usage

Since put_mems_allowed() is strictly optional (it is a seqcount retry), we
don't need to evaluate the function if the allocation was in fact
successful, saving an smp_rmb and some loads and comparisons on some
relatively fast paths.

Since the naming of get/put_mems_allowed() does suggest a mandatory
pairing, rename the interface, as suggested by Mel, to resemble the
seqcount interface.

This gives us: read_mems_allowed_begin() and read_mems_allowed_retry(),
where it is important to note that the return value of the latter call is
inverted from its previous incarnation.

Signed-off-by: Peter Zijlstra
Signed-off-by: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3bac76ae4b30..979378deccbf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2739,7 +2739,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		return NULL;
 retry_cpuset:
-	cpuset_mems_cookie = get_mems_allowed();
+	cpuset_mems_cookie = read_mems_allowed_begin();
 	/* The preferred zone is used for statistics later */
 	first_zones_zonelist(zonelist, high_zoneidx,
@@ -2777,7 +2777,7 @@ out:
 	 * the mask is being updated. If a page allocation is about to fail,
 	 * check if the cpuset changed during allocation and if so, retry.
 	 */
-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
 		goto retry_cpuset;
 	memcg_kmem_commit_charge(page, memcg, order);
@@ -3045,9 +3045,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
 		goto out;
 	do {
-		cpuset_mems_cookie = get_mems_allowed();
+		cpuset_mems_cookie = read_mems_allowed_begin();
 		ret = !node_isset(nid, cpuset_current_mems_allowed);
-	} while (!put_mems_allowed(cpuset_mems_cookie));
+	} while (read_mems_allowed_retry(cpuset_mems_cookie));
 out:
 	return ret;
 }
-- cgit v1.2.3
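The snippet below is a minimal, standalone sketch of the renamed interface from the patch above. It is a toy userspace model, not the kernel implementation (the real interface is a seqcount with memory barriers), and the sequence counter and allocation stub are invented for illustration. It shows the inverted return value: read_mems_allowed_retry() returns true only when the cookie is stale, so on a successful allocation the check can be skipped entirely.

/*
 * Toy, single-file model of the renamed interface; NOT the kernel
 * implementation.  read_mems_allowed_begin() snapshots a sequence
 * counter, and read_mems_allowed_retry() returns true only when that
 * snapshot is stale, i.e. the inverse of the old put_mems_allowed().
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned int mems_allowed_seq;	/* bumped by a (hypothetical) writer */

static unsigned int read_mems_allowed_begin(void)
{
	return mems_allowed_seq;
}

static bool read_mems_allowed_retry(unsigned int cookie)
{
	return mems_allowed_seq != cookie;	/* true: cookie is stale, retry */
}

static void *try_alloc(void)
{
	return NULL;	/* pretend the allocation failed */
}

int main(void)
{
	unsigned int cookie;
	void *page;

	do {
		cookie = read_mems_allowed_begin();
		page = try_alloc();
		/* The retry check is only evaluated when the allocation failed. */
	} while (!page && read_mems_allowed_retry(cookie));

	printf("page=%p, no retry needed\n", page);
	return 0;
}

The do/while shape mirrors the converted call sites in the diff above: take the cookie before the read side, and run the retry test only on failure.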
From 70ef57e6c22c3323dce179b7d0d433c479266612 Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Mon, 7 Apr 2014 15:37:01 -0700
Subject: mm: exclude memoryless nodes from zone_reclaim

We had a report about strange OOM killer strikes on a PPC machine,
although there was a lot of free swap and tons of anonymous memory which
could have been swapped out.  In the end it turned out that the OOM was a
side effect of zone reclaim, which wasn't unmapping and swapping out, and
so the system was pushed to OOM.

Although this sounds like a bug somewhere in the kswapd vs. zone reclaim
vs. direct reclaim interaction, numactl on the hardware in question
suggests that zone_reclaim_mode should not have been set in the first
place:

  node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  node 0 size: 0 MB
  node 0 free: 0 MB
  node 2 cpus:
  node 2 size: 7168 MB
  node 2 free: 6019 MB
  node distances:
  node   0   2
    0:  10  40
    2:  40  10

So all the CPUs are associated with Node0, which doesn't have any memory,
while Node2 contains all the available memory.  The node distances cause
zone_reclaim_mode to be enabled automatically.  Zone reclaim is intended
to keep allocations local, but this doesn't make any sense on memoryless
nodes.  So let's exclude such nodes from init_zone_allows_reclaim(), which
evaluates the zone reclaim behavior and the suitable reclaim_nodes.

Signed-off-by: Michal Hocko
Acked-by: David Rientjes
Acked-by: Nishanth Aravamudan
Tested-by: Nishanth Aravamudan
Acked-by: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 979378deccbf..336ee925f756 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1870,7 +1870,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
 {
 	int i;
-	for_each_online_node(i)
+	for_each_node_state(i, N_MEMORY)
 		if (node_distance(nid, i) <= RECLAIM_DISTANCE)
 			node_set(i, NODE_DATA(nid)->reclaim_nodes);
 		else
@@ -4919,7 +4919,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
-	init_zone_allows_reclaim(nid);
+	if (node_state(nid, N_MEMORY))
+		init_zone_allows_reclaim(nid);
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 #endif
-- cgit v1.2.3
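For the topology quoted in the commit message above, a small standalone sketch (a toy model, not the kernel code; RECLAIM_DISTANCE uses the common default of 30, and Node2 is remapped to index 1) illustrates why the memoryless CPU node would otherwise switch zone reclaim on, and why filtering on "has memory" avoids that:

/*
 * Toy model of the zone_reclaim_mode decision; NOT the kernel
 * implementation.  Distances come from the numactl output above.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 2
#define RECLAIM_DISTANCE 30	/* assumed default */

static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 40 },	/* node 0 -> {node 0, node 2} */
	{ 40, 10 },	/* node 2 -> {node 0, node 2} */
};
static const bool node_has_memory[NR_NODES] = { false, true };	/* 0 MB vs. 7168 MB */

static int compute_zone_reclaim_mode(bool skip_memoryless)
{
	int mode = 0;

	for (int nid = 0; nid < NR_NODES; nid++) {
		if (skip_memoryless && !node_has_memory[nid])
			continue;	/* the fix: don't evaluate memoryless nodes */
		for (int i = 0; i < NR_NODES; i++) {
			if (skip_memoryless && !node_has_memory[i])
				continue;	/* ...and only look at nodes with memory */
			if (node_distance[nid][i] > RECLAIM_DISTANCE)
				mode = 1;	/* distant node found: enable zone reclaim */
		}
	}
	return mode;
}

int main(void)
{
	printf("before the fix: zone_reclaim_mode=%d\n", compute_zone_reclaim_mode(false));
	printf("after the fix:  zone_reclaim_mode=%d\n", compute_zone_reclaim_mode(true));
	return 0;
}

With these numbers the first call reports 1 (the memoryless node sees Node2 at distance 40) and the second reports 0, which is the behavior the patch restores.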
From d230dec18dc9a581f153c14cb5371640cd62543b Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Mon, 7 Apr 2014 15:37:38 -0700
Subject: mm: use 'const char *' instead of 'char *' for reason in dump_page()

I tried to use 'dump_page(page, __func__)' for debugging, but it triggers
a warning:

  warning: passing argument 2 of `dump_page' discards `const' qualifier
  from pointer target type [enabled by default]

Let's convert 'reason' to 'const char *' in dump_page() and friends: we
shouldn't modify it anyway.

Signed-off-by: Kirill A. Shutemov
Cc: Dave Hansen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 336ee925f756..73c25912c7c4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -295,7 +295,8 @@ static inline int bad_range(struct zone *zone, struct page *page)
 }
 #endif
-static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
+static void bad_page(struct page *page, const char *reason,
+		unsigned long bad_flags)
 {
 	static unsigned long resume;
 	static unsigned long nr_shown;
@@ -623,7 +624,7 @@ out:
 static inline int free_pages_check(struct page *page)
 {
-	char *bad_reason = NULL;
+	const char *bad_reason = NULL;
 	unsigned long bad_flags = 0;
 	if (unlikely(page_mapcount(page)))
@@ -859,7 +860,7 @@ static inline void expand(struct zone *zone, struct page *page,
  */
 static inline int check_new_page(struct page *page)
 {
-	char *bad_reason = NULL;
+	const char *bad_reason = NULL;
 	unsigned long bad_flags = 0;
 	if (unlikely(page_mapcount(page)))
@@ -6545,7 +6546,8 @@ static void dump_page_flags(unsigned long flags)
 	printk(")\n");
 }
-void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
+void dump_page_badflags(struct page *page, const char *reason,
+		unsigned long badflags)
 {
 	printk(KERN_ALERT
 	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
@@ -6561,7 +6563,7 @@ void dump_page_badflags(struct page *page, const char *reason, unsigned long badflags)
 	mem_cgroup_print_bad_page(page);
 }
-void dump_page(struct page *page, char *reason)
+void dump_page(struct page *page, const char *reason)
 {
 	dump_page_badflags(page, reason, 0);
 }
-- cgit v1.2.3

From 3a025760fc158b3726eac89ee95d7f29599e9dfa Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Mon, 7 Apr 2014 15:37:48 -0700
Subject: mm: page_alloc: spill to remote nodes before waking kswapd

On NUMA systems, a node may start thrashing cache or even swap anonymous
pages while there are still free pages on remote nodes.  This is a result
of commits 81c0a2bb515f ("mm: page_alloc: fair zone allocator policy") and
fff4068cba48 ("mm: page_alloc: revert NUMA aspect of fair allocation
policy").

Before those changes, the allocator would first try all allowed zones,
including those on remote nodes, before waking any kswapds.  But now, the
allocator fastpath doubles as the fairness pass, which in turn can only
consider the local node to prevent remote spilling based on exhausted
fairness batches alone.  Remote nodes are only considered in the slowpath,
after the kswapds are woken up.  But if remote nodes still have free
memory, kswapd should not be woken to rebalance the local node or it may
thrash cache or swap prematurely.

Fix this by adding one more unfair pass over the zonelist that is allowed
to spill to remote nodes after the local fairness pass fails but before
entering the slowpath and waking the kswapds.

This also gets rid of the GFP_THISNODE exemption from the fairness
protocol because the unfair pass is no longer tied to kswapd, which
GFP_THISNODE is not allowed to wake up.

However, because remote spills can be more frequent now (we prefer them
over local kswapd reclaim), the allocation batches on remote nodes could
underflow more heavily.  When resetting the batches, use
atomic_long_read() directly instead of zone_page_state() to calculate the
delta, as the latter filters negative counter values.

Signed-off-by: Johannes Weiner
Acked-by: Rik van Riel
Acked-by: Mel Gorman
Cc: [3.12+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 89 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 45 insertions(+), 44 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 73c25912c7c4..15d140755e71 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1239,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 	}
 	local_irq_restore(flags);
 }
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-	return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-	return false;
-}
 #endif
 /*
@@ -1584,12 +1575,7 @@ again:
 					  get_pageblock_migratetype(page));
 	}
-	/*
-	 * NOTE: GFP_THISNODE allocations do not partake in the kswapd
-	 * aging protocol, so they can't be fair.
-	 */
-	if (!gfp_thisnode_allocation(gfp_flags))
-		__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1955,23 +1941,12 @@ zonelist_scan:
 		 * zone size to ensure fair page aging.  The zone a
 		 * page was allocated in should have no effect on the
 		 * time the page has in memory before being reclaimed.
-		 *
-		 * Try to stay in local zones in the fastpath.  If
-		 * that fails, the slowpath is entered, which will do
-		 * another pass starting with the local zones, but
-		 * ultimately fall back to remote zones that do not
-		 * partake in the fairness round-robin cycle of this
-		 * zonelist.
-		 *
-		 * NOTE: GFP_THISNODE allocations do not partake in
-		 * the kswapd aging protocol, so they can't be fair.
 		 */
-		if ((alloc_flags & ALLOC_WMARK_LOW) &&
-		    !gfp_thisnode_allocation(gfp_mask)) {
-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-				continue;
+		if (alloc_flags & ALLOC_FAIR) {
 			if (!zone_local(preferred_zone, zone))
 				continue;
+			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+				continue;
 		}
 		/*
 		 * When allocating a page cache page for writing, we
@@ -2409,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
-			     struct zonelist *zonelist,
-			     enum zone_type high_zoneidx,
-			     struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+				enum zone_type high_zoneidx,
+				struct zone *preferred_zone)
 {
 	struct zoneref *z;
 	struct zone *zone;
 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		if (!(gfp_mask & __GFP_NO_KSWAPD))
-			wakeup_kswapd(zone, order, zone_idx(preferred_zone));
 		/*
 		 * Only reset the batches of zones that were actually
-		 * considered in the fast path, we don't want to
-		 * thrash fairness information for zones that are not
+		 * considered in the fairness pass, we don't want to
+		 * trash fairness information for zones that are not
 		 * actually part of this zonelist's round-robin cycle.
 		 */
 		if (!zone_local(preferred_zone, zone))
 			continue;
 		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-				    high_wmark_pages(zone) -
-				    low_wmark_pages(zone) -
-				    zone_page_state(zone, NR_ALLOC_BATCH));
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 	}
 }
+static void wake_all_kswapds(unsigned int order,
+			     struct zonelist *zonelist,
+			     enum zone_type high_zoneidx,
+			     struct zone *preferred_zone)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
@@ -2523,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * allowed per node queues are empty and that nodes are
 	 * over allocated.
 	 */
-	if (gfp_thisnode_allocation(gfp_mask))
+	if (IS_ENABLED(CONFIG_NUMA) &&
+	    (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 restart:
-	prepare_slowpath(gfp_mask, order, zonelist,
-			 high_zoneidx, preferred_zone);
+	if (!(gfp_mask & __GFP_NO_KSWAPD))
+		wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2712,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
-	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
 	struct mem_cgroup *memcg = NULL;
 	gfp_mask &= gfp_allowed_mask;
@@ -2753,11 +2737,28 @@ retry_cpuset:
 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
+retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, migratetype);
 	if (unlikely(!page)) {
+		/*
+		 * The first pass makes sure allocations are spread
+		 * fairly within the local node.  However, the local
+		 * node might have free pages left after the fairness
+		 * batches are exhausted, and remote zones haven't
+		 * even been considered yet.  Try once more without
+		 * fairness, and include remote zones now, before
+		 * entering the slowpath and waking kswapd: prefer
+		 * spilling to a remote zone over swapping locally.
+		 */
+		if (alloc_flags & ALLOC_FAIR) {
+			reset_alloc_batches(zonelist, high_zoneidx,
+					    preferred_zone);
+			alloc_flags &= ~ALLOC_FAIR;
+			goto retry;
+		}
 		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not
-- cgit v1.2.3
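To make the new control flow in the patch above easier to follow, here is a small standalone sketch of the fair-then-unfair retry. It is a toy userspace model, not the kernel code, and the zone names, batch values and page counts are invented: the ALLOC_FAIR pass only takes local zones with a positive batch, and when it fails the batches are reset and a second pass may spill to the remote zone before anything resembling the slowpath or a kswapd wakeup would run.

/*
 * Toy userspace model of the fair/unfair two-pass flow; NOT the kernel
 * code.  Zone names, batch values and page counts are invented.
 */
#include <stdbool.h>
#include <stdio.h>

struct zone {
	const char *name;
	bool local;
	long alloc_batch;	/* stand-in for NR_ALLOC_BATCH */
	long free_pages;
};

static struct zone zones[] = {
	{ "Node0/Normal", true,  0,   0 },	/* local: batch exhausted, no free pages */
	{ "Node1/Normal", false, 64, 512 },	/* remote: plenty of free memory */
};

#define NR_ZONES (sizeof(zones) / sizeof(zones[0]))

static struct zone *get_page_from_freelist(bool fair)
{
	for (unsigned int i = 0; i < NR_ZONES; i++) {
		struct zone *z = &zones[i];

		if (fair) {
			if (!z->local)
				continue;	/* fairness pass is local-only */
			if (z->alloc_batch <= 0)
				continue;	/* fairness batch exhausted */
		}
		if (z->free_pages > 0) {
			z->free_pages--;
			z->alloc_batch--;
			return z;
		}
	}
	return NULL;
}

static void reset_alloc_batches(void)
{
	/* The kernel refills local batches from the watermark delta. */
	for (unsigned int i = 0; i < NR_ZONES; i++)
		if (zones[i].local)
			zones[i].alloc_batch = 64;
}

int main(void)
{
	/* First pass: fair, local zones only. */
	struct zone *z = get_page_from_freelist(true);

	if (!z) {
		/* Second pass: reset batches, drop fairness, allow remote spill. */
		reset_alloc_batches();
		z = get_page_from_freelist(false);
	}
	if (z)
		printf("allocated from %s\n", z->name);
	else
		printf("both passes failed: enter slowpath, wake kswapds\n");
	return 0;
}

With the invented numbers above, the program prints "allocated from Node1/Normal": the allocation spills to the remote zone instead of waking kswapd for the exhausted local node.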
From 136199f0a67cd6bb3f0e8de0ad50f52879f82077 Mon Sep 17 00:00:00 2001
From: Emil Medve
Date: Mon, 7 Apr 2014 15:37:52 -0700
Subject: memblock: use for_each_memblock()

This is a small cleanup.

Signed-off-by: Emil Medve
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 15d140755e71..48427a7cfb45 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5073,7 +5073,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	nodemask_t saved_node_state = node_states[N_MEMORY];
 	unsigned long totalpages = early_calculate_totalpages();
 	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
-	struct memblock_type *type = &memblock.memory;
+	struct memblock_region *r;
 	/* Need to find movable_zone earlier when movable_node is specified. */
 	find_usable_zone_for_movable();
@@ -5083,13 +5083,13 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	 * options.
 	 */
 	if (movable_node_is_enabled()) {
-		for (i = 0; i < type->cnt; i++) {
-			if (!memblock_is_hotpluggable(&type->regions[i]))
+		for_each_memblock(memory, r) {
+			if (!memblock_is_hotpluggable(r))
 				continue;
-			nid = type->regions[i].nid;
+			nid = r->nid;
-			usable_startpfn = PFN_DOWN(type->regions[i].base);
+			usable_startpfn = PFN_DOWN(r->base);
 			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
 				min(usable_startpfn, zone_movable_pfn[nid]) :
 				usable_startpfn;
-- cgit v1.2.3
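The cleanup above swaps open-coded indexing of the memblock region array for the for_each_memblock() iterator. The following standalone sketch shows the general iteration-macro pattern (a simplified stand-in; the structure layout, macro name and sample regions are made up for the example, not the memblock API):

/*
 * Simplified illustration of the iteration-macro pattern: callers get a
 * region cursor instead of indexing a counted array by hand.
 */
#include <stdio.h>

struct memblock_region {
	unsigned long base;
	unsigned long size;
	int nid;
};

static struct memblock_region memory_regions[] = {
	{ 0x00000000UL, 0x10000000UL, 0 },
	{ 0x80000000UL, 0x10000000UL, 2 },
};

#define NR_REGIONS (sizeof(memory_regions) / sizeof(memory_regions[0]))

/* Simplified stand-in for for_each_memblock(memory, r). */
#define for_each_region(r)						\
	for ((r) = memory_regions; (r) < memory_regions + NR_REGIONS; (r)++)

int main(void)
{
	struct memblock_region *r;

	for_each_region(r)
		printf("nid %d: base %#lx size %#lx\n", r->nid, r->base, r->size);
	return 0;
}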
From ed12d845b5f528cc0846023862b9c448a36122ec Mon Sep 17 00:00:00 2001
From: John Hubbard
Date: Mon, 7 Apr 2014 15:37:59 -0700
Subject: mm/page_alloc.c: change mm debug routines back to EXPORT_SYMBOL

A new dump_page() routine was recently added, and marked
EXPORT_SYMBOL_GPL.  dump_page() was also added to the VM_BUG_ON_PAGE()
macro, and so the end result is that non-GPL code can no longer call
get_page() and a few other routines.  This only happens if the kernel was
compiled with CONFIG_DEBUG_VM.

Change dump_page() to be EXPORT_SYMBOL.

Longer explanation:

Prior to commit 309381feaee5 ("mm: dump page when hitting a VM_BUG_ON
using VM_BUG_ON_PAGE"), it was possible to build MIT-licensed (non-GPL)
drivers on Fedora.  Fedora is semi-unique, in that it sets
CONFIG_DEBUG_VM.

Because Fedora sets CONFIG_DEBUG_VM, they end up pulling in dump_page(),
via VM_BUG_ON_PAGE, via get_page().

As one of the authors of NVIDIA's new, open source, "UVM-Lite" kernel
module, I originally chose to use the kernel's get_page() routine from
within nvidia_uvm_page_cache.c, because get_page() has always seemed to be
very clearly intended for use by non-GPL, driver code.

So I'm hoping that making get_page() widely accessible again will not be
too controversial.  We did check with Fedora first, and they responded
(https://bugzilla.redhat.com/show_bug.cgi?id=1074710#c3) that we should
try to get upstream changed, before asking Fedora to change.  Their
reasoning seems beneficial to Linux: leaving CONFIG_DEBUG_VM set allows
Fedora to help catch mm bugs.

Signed-off-by: John Hubbard
Cc: Sasha Levin
Cc: Josh Boyer
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 48427a7cfb45..5dba2933c9c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6568,4 +6568,4 @@ void dump_page(struct page *page, const char *reason)
 {
 	dump_page_badflags(page, reason, 0);
 }
-EXPORT_SYMBOL_GPL(dump_page);
+EXPORT_SYMBOL(dump_page);
-- cgit v1.2.3
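As an illustration of the call chain described in the commit message, here is a minimal out-of-tree module sketch. It is hypothetical: the module itself, its license string and the assumption of 3.14-era headers built with CONFIG_DEBUG_VM=y are mine, not part of the patch. With a license string the kernel does not treat as GPL-compatible, the inline get_page() expands VM_BUG_ON_PAGE(), which references dump_page(), so such a module only resolves that symbol once dump_page() is exported with plain EXPORT_SYMBOL.

/*
 * Minimal out-of-tree module sketch (hypothetical; assumes 3.14-era
 * kernel headers with CONFIG_DEBUG_VM=y).  get_page() below expands
 * VM_BUG_ON_PAGE(), which references dump_page(); while dump_page() was
 * EXPORT_SYMBOL_GPL, a module with a non-GPL license string like this
 * one could not use that symbol.
 */
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *page;

static int __init demo_init(void)
{
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	get_page(page);	/* with CONFIG_DEBUG_VM, pulls in dump_page() */
	return 0;
}

static void __exit demo_exit(void)
{
	put_page(page);	/* drop the reference taken by get_page() */
	put_page(page);	/* drop the allocation reference, freeing the page */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("MIT");	/* not on the kernel's GPL-compatible list */

With dump_page() back to EXPORT_SYMBOL, a module like this builds and loads regardless of its license string, although loading non-GPL code still taints the kernel as before.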