From 6131728914810a6c02e08750e13e45870101e862 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Wed, 21 Mar 2012 16:33:48 -0700 Subject: mm/vmscan.c: cleanup with s/reclaim_mode/isolate_mode/ With tons of reclaim_mode (defined as one field of struct scan_control) already in the file, it is clearer to rename the local reclaim_mode when setting up the isolation mode. Signed-off-by: Hillf Danton Acked-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Reviewed-by: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index c52b23552659..61a66881235d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1509,7 +1509,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, unsigned long nr_file; unsigned long nr_dirty = 0; unsigned long nr_writeback = 0; - isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; + isolate_mode_t isolate_mode = ISOLATE_INACTIVE; struct zone *zone = mz->zone; while (unlikely(too_many_isolated(zone, file, sc))) { @@ -1522,20 +1522,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, set_reclaim_mode(priority, sc, false); if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) - reclaim_mode |= ISOLATE_ACTIVE; + isolate_mode |= ISOLATE_ACTIVE; lru_add_drain(); if (!sc->may_unmap) - reclaim_mode |= ISOLATE_UNMAPPED; + isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) - reclaim_mode |= ISOLATE_CLEAN; + isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, sc->order, - reclaim_mode, 0, file); + isolate_mode, 0, file); if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1699,21 +1699,21 @@ static void shrink_active_list(unsigned long nr_to_scan, struct page *page; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); unsigned long nr_rotated = 0; - isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; + isolate_mode_t isolate_mode = ISOLATE_ACTIVE; struct zone *zone = mz->zone; lru_add_drain(); if (!sc->may_unmap) - reclaim_mode |= ISOLATE_UNMAPPED; + isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) - reclaim_mode |= ISOLATE_CLEAN; + isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc->order, - reclaim_mode, 1, file); + isolate_mode, 1, file); if (global_reclaim(sc)) zone->pages_scanned += nr_scanned; -- cgit v1.2.3 From c38446cc65e1f2b3eb8630c53943b94c4f65f670 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Wed, 21 Mar 2012 16:33:50 -0700 Subject: mm: vmscan: fix misused nr_reclaimed in shrink_mem_cgroup_zone() The value of nr_reclaimed is the number of pages reclaimed in the current round of the loop, whereas nr_to_reclaim should be compared with the number of pages reclaimed in all rounds. In each round of the loop, reclaimed pages are cut off from the reclaim goal, and the loop stops once the goal achieved. 
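To show the accounting change in isolation, here is a minimal sketch of the corrected exit test; it is not the mainline code, just the arithmetic the patch below introduces, packaged as a helper that would be called once per trip around the restart loop with that round's nr_reclaimed:

/*
 * Sketch of the corrected goal accounting.  Before the patch the loop
 * compared nr_to_reclaim against a single round's nr_reclaimed; after
 * it, each round's progress is deducted so the test is effectively
 * against the cumulative total reclaimed so far.
 */
static bool reclaim_goal_met(unsigned long *nr_to_reclaim,
                             unsigned long nr_reclaimed, int priority)
{
        if (nr_reclaimed >= *nr_to_reclaim)
                *nr_to_reclaim = 0;
        else
                *nr_to_reclaim -= nr_reclaimed;

        /* Only break out early once past the initial DEF_PRIORITY pass. */
        return !*nr_to_reclaim && priority < DEF_PRIORITY;
}
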
Signed-off-by: Hillf Danton Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Cc: Hugh Dickins Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 61a66881235d..8dfa59866af2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2112,7 +2112,12 @@ restart: * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) + if (nr_reclaimed >= nr_to_reclaim) + nr_to_reclaim = 0; + else + nr_to_reclaim -= nr_reclaimed; + + if (!nr_to_reclaim && priority < DEF_PRIORITY) break; } blk_finish_plug(&plug); -- cgit v1.2.3 From fe2c2a106663130a5ab45cb0e3414b52df2fff0c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 21 Mar 2012 16:33:51 -0700 Subject: vmscan: reclaim at order 0 when compaction is enabled When built with CONFIG_COMPACTION, kswapd should not try to free contiguous pages, because it is not trying hard enough to have a real chance at being successful, but still disrupts the LRU enough to break other things. Do not do higher order page isolation unless we really are in lumpy reclaim mode. Stop reclaiming pages once we have enough free pages that compaction can deal with things, and we hit the normal order 0 watermarks used by kswapd. Also remove a line of code that increments balanced right before exiting the function. Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8dfa59866af2..d7dad2a4e69c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) * @mz: The mem_cgroup_zone to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. 
- * @order: The caller's attempted allocation order + * @sc: The scan_control struct for this reclaim session * @mode: One of the LRU isolation modes * @active: True [1] if isolating active pages * @file: True [1] if isolating file [!anon] pages @@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, struct list_head *dst, - unsigned long *nr_scanned, int order, isolate_mode_t mode, - int active, int file) + unsigned long *nr_scanned, struct scan_control *sc, + isolate_mode_t mode, int active, int file) { struct lruvec *lruvec; struct list_head *src; @@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, BUG(); } - if (!order) + if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) continue; /* @@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, */ zone_id = page_zone_id(page); page_pfn = page_to_pfn(page); - pfn = page_pfn & ~((1 << order) - 1); - end_pfn = pfn + (1 << order); + pfn = page_pfn & ~((1 << sc->order) - 1); + end_pfn = pfn + (1 << sc->order); for (; pfn < end_pfn; pfn++) { struct page *cursor_page; @@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(order, + trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, @@ -1533,9 +1533,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, - &nr_scanned, sc->order, - isolate_mode, 0, file); + nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, + sc, isolate_mode, 0, file); if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1711,8 +1710,7 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, - &nr_scanned, sc->order, + nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, isolate_mode, 1, file); if (global_reclaim(sc)) zone->pages_scanned += nr_scanned; @@ -2758,7 +2756,7 @@ loop_again: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int nr_slab; + int nr_slab, testorder; unsigned long balance_gap; if (!populated_zone(zone)) @@ -2791,7 +2789,20 @@ loop_again: (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / KSWAPD_ZONE_BALANCE_GAP_RATIO); - if (!zone_watermark_ok_safe(zone, order, + /* + * Kswapd reclaims only single pages with compaction + * enabled. Trying too hard to reclaim until contiguous + * free pages have become available can hurt performance + * by evicting too much useful data from memory. + * Do not reclaim more than needed for compaction. + */ + testorder = order; + if (COMPACTION_BUILD && order && + compaction_suitable(zone, order) != + COMPACT_SKIPPED) + testorder = 0; + + if (!zone_watermark_ok_safe(zone, testorder, high_wmark_pages(zone) + balance_gap, end_zone, 0)) { shrink_zone(priority, zone, &sc); @@ -2820,7 +2831,7 @@ loop_again: continue; } - if (!zone_watermark_ok_safe(zone, order, + if (!zone_watermark_ok_safe(zone, testorder, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2917,6 +2928,10 @@ out: if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; + /* Would compaction fail due to lack of free memory? 
*/ + if (compaction_suitable(zone, order) == COMPACT_SKIPPED) + goto loop_again; + /* Confirm the zone is balanced for order-0 */ if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone), 0, 0)) { @@ -2926,8 +2941,6 @@ out: /* If balanced, clear the congested flag */ zone_clear_flag(zone, ZONE_CONGESTED); - if (i <= *classzone_idx) - balanced += zone->present_pages; } } -- cgit v1.2.3 From 7be62de99adcab4449d416977b4274985c5fe023 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 21 Mar 2012 16:33:52 -0700 Subject: vmscan: kswapd carefully call compaction With CONFIG_COMPACTION enabled, kswapd does not try to free contiguous free pages, even when it is woken for a higher order request. This could be bad for eg. jumbo frame network allocations, which are done from interrupt context and cannot compact memory themselves. Higher than before allocation failure rates in the network receive path have been observed in kernels with compaction enabled. Teach kswapd to defragment the memory zones in a node, but only if required and compaction is not deferred in a zone. [akpm@linux-foundation.org: reduce scope of zones_need_compaction] Signed-off-by: Rik van Riel Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index d7dad2a4e69c..b2b4c4a0ada2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2919,6 +2919,8 @@ out: * and it is potentially going to sleep here. */ if (order) { + int zones_need_compaction = 1; + for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; @@ -2939,9 +2941,17 @@ out: goto loop_again; } + /* Check if the memory needs to be defragmented. */ + if (zone_watermark_ok(zone, order, + low_wmark_pages(zone), *classzone_idx, 0)) + zones_need_compaction = 0; + /* If balanced, clear the congested flag */ zone_clear_flag(zone, ZONE_CONGESTED); } + + if (zones_need_compaction) + compact_pgdat(pgdat, order); } /* -- cgit v1.2.3 From aff622495c9a0b56148192e53bdec539f5e147f2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 21 Mar 2012 16:33:52 -0700 Subject: vmscan: only defer compaction for failed order and higher Currently a failed order-9 (transparent hugepage) compaction can lead to memory compaction being temporarily disabled for a memory zone. Even if we only need compaction for an order 2 allocation, eg. for jumbo frames networking. The fix is relatively straightforward: keep track of the highest order at which compaction is succeeding, and only defer compaction for orders at which compaction is failing. 
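The vmscan side of this change is the one-line compaction_deferred(zone, sc->order) call in the hunk below; the bookkeeping it relies on lives in the compaction code, which is not part of this hunk. A rough sketch of what per-order deferral amounts to follows — the field names (compact_order_failed, compact_considered, compact_defer_shift) match the zone fields of that era, but the bodies are simplified and should not be read as the exact mm/compaction.c implementation:

/* Defer future compaction attempts at 'order' and above. */
static void defer_compaction_sketch(struct zone *zone, int order)
{
        zone->compact_considered = 0;
        zone->compact_defer_shift++;
        if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
                zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

        /* Remember the lowest order that has failed so far. */
        if (order < zone->compact_order_failed)
                zone->compact_order_failed = order;
}

/* Should compaction at 'order' be skipped for now? */
static bool compaction_deferred_sketch(struct zone *zone, int order)
{
        unsigned long defer_limit = 1UL << zone->compact_defer_shift;

        /* Orders below the lowest failed order are never deferred. */
        if (order < zone->compact_order_failed)
                return false;

        if (++zone->compact_considered > defer_limit)
                zone->compact_considered = defer_limit;

        return zone->compact_considered < defer_limit;
}

With bookkeeping of this shape, a failed order-9 THP compaction raises compact_order_failed to 9 and leaves an order-2 jumbo-frame compaction attempt unaffected.
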
Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index b2b4c4a0ada2..87e4d6a6dc11 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2198,7 +2198,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone)) + if (compaction_deferred(zone, sc->order)) return watermark_ok; /* If compaction is not ready to start, keep reclaiming */ -- cgit v1.2.3 From cc715d99e529d470dde2f33a6614f255adea71f3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:00 -0700 Subject: mm: vmscan: forcibly scan highmem if there are too many buffer_heads pinning highmem Stuart Foster reported on bugzilla that copying large amounts of data from NTFS caused an OOM kill on 32-bit X86 with 16G of memory. Andrew Morton correctly identified that the problem was NTFS was using 512 blocks meaning each page had 8 buffer_heads in low memory pinning it. In the past, direct reclaim used to scan highmem even if the allocating process did not specify __GFP_HIGHMEM but not any more. kswapd no longer will reclaim from zones that are above the high watermark. The intention in both cases was to minimise unnecessary reclaim. The downside is on machines with large amounts of highmem that lowmem can be fully consumed by buffer_heads with nothing trying to free them. The following patch is based on a suggestion by Andrew Morton to extend the buffer_heads_over_limit case to force kswapd and direct reclaim to scan the highmem zone regardless of the allocation request or watermarks. 
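To see why lowmem fills up, a back-of-the-envelope calculation helps. The snippet below is a stand-alone illustration only; the 56-byte buffer_head size, ~896MB of lowmem and ~15GB of highmem page cache are assumed ballpark figures for a 32-bit x86 box with 16G of RAM, not numbers taken from the report:

#include <stdio.h>

int main(void)
{
        /* Assumed ballpark figures; illustrative only. */
        const unsigned long long highmem_cache = 15ULL << 30;  /* ~15G of page cache */
        const unsigned long page_size  = 4096;
        const unsigned long block_size = 512;   /* NTFS block size in the report */
        const unsigned long bh_size    = 56;    /* approx. sizeof(struct buffer_head) */
        const unsigned long lowmem     = 896UL << 20;

        unsigned long bh_per_page = page_size / block_size;
        unsigned long long pinned =
                (highmem_cache / page_size) * bh_per_page * bh_size;

        printf("buffer_heads per page : %lu\n", bh_per_page);
        printf("lowmem pinned by them : %llu MB (of %lu MB lowmem)\n",
               pinned >> 20, lowmem >> 20);
        return 0;
}

Under those assumptions the buffer_heads alone would need roughly 1.7GB of lowmem, nearly twice what exists, and nothing in the old code would scan the highmem pages that own them — exactly the situation the patch addresses.
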
Addresses https://bugzilla.kernel.org/show_bug.cgi?id=42578 [hughd@google.com: move buffer_heads_over_limit check up] [akpm@linux-foundation.org: buffer_heads_over_limit is unlikely] Reported-by: Stuart Foster Tested-by: Stuart Foster Signed-off-by: Mel Gorman Signed-off-by: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Christoph Lameter Cc: stable Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 87e4d6a6dc11..ae3bf0a09cdd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1642,18 +1642,6 @@ static void move_active_pages_to_lru(struct zone *zone, unsigned long pgmoved = 0; struct page *page; - if (buffer_heads_over_limit) { - spin_unlock_irq(&zone->lru_lock); - list_for_each_entry(page, list, lru) { - if (page_has_private(page) && trylock_page(page)) { - if (page_has_private(page)) - try_to_release_page(page, 0); - unlock_page(page); - } - } - spin_lock_irq(&zone->lru_lock); - } - while (!list_empty(list)) { struct lruvec *lruvec; @@ -1735,6 +1723,14 @@ static void shrink_active_list(unsigned long nr_to_scan, continue; } + if (unlikely(buffer_heads_over_limit)) { + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } + if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* @@ -2238,6 +2234,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, unsigned long nr_soft_scanned; bool aborted_reclaim = false; + /* + * If the number of buffer_heads in the machine exceeds the maximum + * allowed level, force direct reclaim to scan the highmem zone as + * highmem pages could be pinning lowmem pages storing buffer_heads + */ + if (buffer_heads_over_limit) + sc->gfp_mask |= __GFP_HIGHMEM; + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) @@ -2727,6 +2731,17 @@ loop_again: */ age_active_anon(zone, &sc, priority); + /* + * If the number of buffer_heads in the machine + * exceeds the maximum allowed level and this node + * has a highmem zone, force kswapd to reclaim from + * it to relieve lowmem pressure. + */ + if (buffer_heads_over_limit && is_highmem_idx(i)) { + end_zone = i; + break; + } + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; @@ -2802,7 +2817,8 @@ loop_again: COMPACT_SKIPPED) testorder = 0; - if (!zone_watermark_ok_safe(zone, testorder, + if ((buffer_heads_over_limit && is_highmem_idx(i)) || + !zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + balance_gap, end_zone, 0)) { shrink_zone(priority, zone, &sc); -- cgit v1.2.3 From d563c0501bf8702b9b683206c09b9defb37d8a8a Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Wed, 21 Mar 2012 16:34:02 -0700 Subject: vmscan: handle isolated pages with lru lock released When shrinking inactive lru list, isolated pages are queued on locally private list, so the lock-hold time could be reduced if pages are counted without lock protection. To achieve that, firstly updating reclaim stat is delayed until the putback stage, after reacquiring the lru lock. Secondly, operations related to vm and zone stats are now proteced with preemption disabled as they are per-cpu operations. 
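The rule being relied on here: the double-underscored statistics helpers update per-CPU counters non-atomically, so they need preemption (or interrupts) disabled — something zone->lru_lock's spin_lock_irq() used to provide for free. A minimal sketch of the resulting pattern, reduced to two representative counters:

/*
 * Account pages that have already been moved to a private list,
 * without holding zone->lru_lock.  The __ variants of the stat
 * helpers are cheap but non-atomic, hence the preempt_disable().
 */
static void count_isolated_sketch(struct zone *zone,
                                  unsigned long nr_anon,
                                  unsigned long nr_file)
{
        preempt_disable();
        __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
        __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
        preempt_enable();
}

The reclaim_stat->recent_scanned[] updates, by contrast, are not per-CPU counters, so the patch simply defers them to the point where the lru lock has been retaken for putback, as the hunks below show.
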
Signed-off-by: Hillf Danton Acked-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index ae3bf0a09cdd..57d8ef6ee4dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz, unsigned long *nr_anon, unsigned long *nr_file) { - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); struct zone *zone = mz->zone; unsigned int count[NR_LRU_LISTS] = { 0, }; unsigned long nr_active = 0; @@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz, count[lru] += numpages; } + preempt_disable(); __count_vm_events(PGDEACTIVATE, nr_active); __mod_zone_page_state(zone, NR_ACTIVE_FILE, @@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz, *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - reclaim_stat->recent_scanned[0] += *nr_anon; - reclaim_stat->recent_scanned[1] += *nr_file; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); + preempt_enable(); } /* @@ -1511,6 +1512,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, unsigned long nr_writeback = 0; isolate_mode_t isolate_mode = ISOLATE_INACTIVE; struct zone *zone = mz->zone; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1544,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); } + spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) { - spin_unlock_irq(&zone->lru_lock); + if (nr_taken == 0) return 0; - } update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); - __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); - - spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); @@ -1569,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, spin_lock_irq(&zone->lru_lock); + reclaim_stat->recent_scanned[0] += nr_anon; + reclaim_stat->recent_scanned[1] += nr_file; + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); -- cgit v1.2.3 From c7cfa37b7324a190fc36ff116d79d0f899e8d273 Mon Sep 17 00:00:00 2001 From: Copot Alexandru Date: Wed, 21 Mar 2012 16:34:10 -0700 Subject: mm/vmscan.c: fix spelling error s/noticable/noticeable/ Signed-off-by: Copot Alexandru Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 57d8ef6ee4dd..440af1d899b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2261,8 +2261,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * Even though compaction is invoked for any * non-zero order, only frequent costly order * reclamation is disruptive enough to become a - * noticable problem, like transparent huge page - * allocations. + * noticeable problem, like transparent huge + * page allocations. 
*/ if (compaction_ready(zone, sc)) { aborted_reclaim = true; -- cgit v1.2.3 From cc9a6c8776615f9c194ccf0b63a0aa5628235545 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:11 -0700 Subject: cpuset: mm: reduce large amounts of memory barrier related damage v3 Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating at large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86. The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were 3.3.0-rc3 3.3.0-rc3 rc3-vanilla nobarrier-v2r1 Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%) Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%) Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%) Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%) Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%) Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%) Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%) Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%) Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%) Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%) Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%) Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%) Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%) Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%) Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%) MMTests Statistics: duration Sys Time Running Test (seconds) 135.68 132.17 User+Sys Time Running Test (seconds) 164.2 160.13 Total Elapsed Time (seconds) 123.46 120.87 The overall improvement is small but the System CPU time is much improved and roughly in correlation to what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. 
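The vmscan.c hunk below merely drops the old get_mems_allowed()/put_mems_allowed() pair; the retry logic lives in the allocator paths. A sketch of the read-side shape described above, written directly against the generic seqcount primitives — the mems_allowed_seq field and alloc_with_mask() helper are illustrative stand-ins (the actual patch funnels this through cpuset helpers), and the write side, which bumps the sequence count around the nodemask rewrite, is omitted:

/*
 * Allocator slow-path sketch: sample the nodemask sequence count,
 * try the allocation, and retry if a failure raced with a cpuset
 * nodemask update, which could otherwise surface as a spurious
 * allocation failure against a half-updated mask.
 */
static struct page *alloc_pages_retry_sketch(gfp_t gfp, unsigned int order)
{
        struct page *page;
        unsigned int cookie;

retry_cpuset:
        cookie = read_seqcount_begin(&current->mems_allowed_seq);

        page = alloc_with_mask(gfp, order, &current->mems_allowed);

        if (unlikely(!page &&
                     read_seqcount_retry(&current->mems_allowed_seq, cookie)))
                goto retry_cpuset;

        return page;
}

On the fast path this costs a read barrier and a compare instead of the paired full barriers, which is where the reported System CPU improvement comes from.
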
Signed-off-by: Mel Gorman Cc: Miao Xie Cc: David Rientjes Cc: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 440af1d899b9..55d86c9506f3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2343,7 +2343,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long writeback_threshold; bool aborted_reclaim; - get_mems_allowed(); delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2407,7 +2406,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, out: delayacct_freepages_end(); - put_mems_allowed(); if (sc->nr_reclaimed) return sc->nr_reclaimed; -- cgit v1.2.3 From 1480de0340a8d5f094b74d7c4b902456c9a06903 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:34:17 -0700 Subject: mm: forbid lumpy-reclaim in shrink_active_list() Reset the reclaim mode in shrink_active_list() to RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC. (sync/async sign is used only in shrink_page_list and does not affect shrink_active_list) Currenly shrink_active_list() sometimes works in lumpy-reclaim mode, if RECLAIM_MODE_LUMPYRECLAIM is left over from an earlier shrink_inactive_list(). Meanwhile, in age_active_anon() sc->reclaim_mode is totally zero. So the current behavior is too complex and confusing, and this looks like bug. In general, shrink_active_list() populates the inactive list for the next shrink_inactive_list(). Lumpy shring_inactive_list() isolates pages around the chosen one from both the active and inactive lists. So, there is no reason for lumpy isolation in shrink_active_list(). See also: https://lkml.org/lkml/2012/3/15/583 Signed-off-by: Konstantin Khlebnikov Proposed-by: Hugh Dickins Acked-by: Johannes Weiner Cc: Rik van Riel Cc: Minchan Kim Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 55d86c9506f3..49f15ef0a99a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1690,6 +1690,8 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); + reset_reclaim_mode(sc); + if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) -- cgit v1.2.3 From 643ac9fc5429e85b8b7f534544b80bcc4f34c367 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 23 Mar 2012 02:57:31 -0700 Subject: mm: fix testorder interaction between two kswapd patches Adjusting cc715d99e529 "mm: vmscan: forcibly scan highmem if there are too many buffer_heads pinning highmem" for -stable reveals that it was slightly wrong once on top of fe2c2a106663 "vmscan: reclaim at order 0 when compaction is enabled", which specifically adds testorder for the zone_watermark_ok_safe() test. 
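Condensed, the zone-scan decision in balance_pgdat() after both patches reads as below. This is a trimmed fragment shown in the context of the per-zone loop (i, sc, priority, balance_gap and end_zone come from the surrounding function), not stand-alone code:

        int testorder = order;

        /*
         * With compaction available, reclaim at order 0 once enough
         * free memory exists for compaction to take over.
         */
        if (COMPACTION_BUILD && order &&
            compaction_suitable(zone, order) != COMPACT_SKIPPED)
                testorder = 0;

        /*
         * Scan the zone when buffer_heads force highmem reclaim, or
         * when it fails the gap-adjusted watermark for testorder --
         * testing the raw order here was the mismatch this patch fixes.
         */
        if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
            !zone_watermark_ok_safe(zone, testorder,
                                    high_wmark_pages(zone) + balance_gap,
                                    end_zone, 0))
                shrink_zone(priority, zone, &sc);
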
Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Acked-by: Rik van Riel Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 49f15ef0a99a..7658fd6536dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2817,7 +2817,7 @@ loop_again: testorder = 0; if ((buffer_heads_over_limit && is_highmem_idx(i)) || - !zone_watermark_ok_safe(zone, order, + !zone_watermark_ok_safe(zone, testorder, high_wmark_pages(zone) + balance_gap, end_zone, 0)) { shrink_zone(priority, zone, &sc); -- cgit v1.2.3 From 496b919b3bdd957d4b1727df79bfa3751bced1c1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 24 Mar 2012 10:26:21 -0400 Subject: Fix potential endless loop in kswapd when compaction is not enabled We should only test compaction_suitable if the kernel is built with CONFIG_COMPACTION, otherwise the stub compaction_suitable function will always return COMPACT_SKIPPED and send kswapd into an infinite loop. Reported-by: Anton Blanchard Signed-off-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7658fd6536dd..33c332bbab73 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2946,7 +2946,8 @@ out: continue; /* Would compaction fail due to lack of free memory? */ - if (compaction_suitable(zone, order) == COMPACT_SKIPPED) + if (COMPACTION_BUILD && + compaction_suitable(zone, order) == COMPACT_SKIPPED) goto loop_again; /* Confirm the zone is balanced for order-0 */ -- cgit v1.2.3
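The failure mode is easiest to see from the !CONFIG_COMPACTION stub. The sketch below pairs a simplified version of that stub with the guarded check from the patch; the stub's exact signature in include/linux/compaction.h may differ slightly from what is shown here:

#ifndef CONFIG_COMPACTION
/* With compaction not built in, "suitable" can never be reported. */
static inline unsigned long compaction_suitable(struct zone *zone, int order)
{
        return COMPACT_SKIPPED;
}
#endif

        /*
         * balance_pgdat(), order > 0 exit path: without the
         * COMPACTION_BUILD test, the stub above makes this branch fire
         * on every pass, so kswapd keeps jumping back to loop_again.
         */
        if (COMPACTION_BUILD &&
            compaction_suitable(zone, order) == COMPACT_SKIPPED)
                goto loop_again;

With CONFIG_COMPACTION=y the check keeps its intended meaning: stay in the loop only while compaction genuinely lacks the free memory it needs.
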