From 6131728914810a6c02e08750e13e45870101e862 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Wed, 21 Mar 2012 16:33:48 -0700 Subject: mm/vmscan.c: cleanup with s/reclaim_mode/isolate_mode/ With tons of reclaim_mode (defined as one field of struct scan_control) already in the file, it is clearer to rename the local reclaim_mode when setting up the isolation mode. Signed-off-by: Hillf Danton Acked-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Reviewed-by: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index c52b23552659..61a66881235d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1509,7 +1509,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, unsigned long nr_file; unsigned long nr_dirty = 0; unsigned long nr_writeback = 0; - isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; + isolate_mode_t isolate_mode = ISOLATE_INACTIVE; struct zone *zone = mz->zone; while (unlikely(too_many_isolated(zone, file, sc))) { @@ -1522,20 +1522,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, set_reclaim_mode(priority, sc, false); if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) - reclaim_mode |= ISOLATE_ACTIVE; + isolate_mode |= ISOLATE_ACTIVE; lru_add_drain(); if (!sc->may_unmap) - reclaim_mode |= ISOLATE_UNMAPPED; + isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) - reclaim_mode |= ISOLATE_CLEAN; + isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, sc->order, - reclaim_mode, 0, file); + isolate_mode, 0, file); if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1699,21 +1699,21 @@ static void shrink_active_list(unsigned long nr_to_scan, struct page *page; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); unsigned long nr_rotated = 0; - isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; + isolate_mode_t isolate_mode = ISOLATE_ACTIVE; struct zone *zone = mz->zone; lru_add_drain(); if (!sc->may_unmap) - reclaim_mode |= ISOLATE_UNMAPPED; + isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) - reclaim_mode |= ISOLATE_CLEAN; + isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc->order, - reclaim_mode, 1, file); + isolate_mode, 1, file); if (global_reclaim(sc)) zone->pages_scanned += nr_scanned; -- cgit v1.2.3 From c38446cc65e1f2b3eb8630c53943b94c4f65f670 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Wed, 21 Mar 2012 16:33:50 -0700 Subject: mm: vmscan: fix misused nr_reclaimed in shrink_mem_cgroup_zone() The value of nr_reclaimed is the number of pages reclaimed in the current round of the loop, whereas nr_to_reclaim should be compared with the number of pages reclaimed in all rounds. In each round of the loop, reclaimed pages are cut off from the reclaim goal, and the loop stops once the goal achieved. 
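To show the accounting change in isolation, here is a minimal sketch of the corrected exit test; it is not the mainline code, just the arithmetic the patch below introduces, packaged as a helper that would be called once per trip around the restart loop with that round's nr_reclaimed:

/*
 * Sketch of the corrected goal accounting.  Before the patch the loop
 * compared nr_to_reclaim against a single round's nr_reclaimed; after
 * it, each round's progress is deducted so the test is effectively
 * against the cumulative total reclaimed so far.
 */
static bool reclaim_goal_met(unsigned long *nr_to_reclaim,
                             unsigned long nr_reclaimed, int priority)
{
        if (nr_reclaimed >= *nr_to_reclaim)
                *nr_to_reclaim = 0;
        else
                *nr_to_reclaim -= nr_reclaimed;

        /* Only break out early once past the initial DEF_PRIORITY pass. */
        return !*nr_to_reclaim && priority < DEF_PRIORITY;
}
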
Signed-off-by: Hillf Danton Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Cc: Hugh Dickins Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 61a66881235d..8dfa59866af2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2112,7 +2112,12 @@ restart: * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) + if (nr_reclaimed >= nr_to_reclaim) + nr_to_reclaim = 0; + else + nr_to_reclaim -= nr_reclaimed; + + if (!nr_to_reclaim && priority < DEF_PRIORITY) break; } blk_finish_plug(&plug); -- cgit v1.2.3 From fe2c2a106663130a5ab45cb0e3414b52df2fff0c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 21 Mar 2012 16:33:51 -0700 Subject: vmscan: reclaim at order 0 when compaction is enabled When built with CONFIG_COMPACTION, kswapd should not try to free contiguous pages, because it is not trying hard enough to have a real chance at being successful, but still disrupts the LRU enough to break other things. Do not do higher order page isolation unless we really are in lumpy reclaim mode. Stop reclaiming pages once we have enough free pages that compaction can deal with things, and we hit the normal order 0 watermarks used by kswapd. Also remove a line of code that increments balanced right before exiting the function. Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8dfa59866af2..d7dad2a4e69c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) * @mz: The mem_cgroup_zone to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. 
- * @order: The caller's attempted allocation order + * @sc: The scan_control struct for this reclaim session * @mode: One of the LRU isolation modes * @active: True [1] if isolating active pages * @file: True [1] if isolating file [!anon] pages @@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, struct list_head *dst, - unsigned long *nr_scanned, int order, isolate_mode_t mode, - int active, int file) + unsigned long *nr_scanned, struct scan_control *sc, + isolate_mode_t mode, int active, int file) { struct lruvec *lruvec; struct list_head *src; @@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, BUG(); } - if (!order) + if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) continue; /* @@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, */ zone_id = page_zone_id(page); page_pfn = page_to_pfn(page); - pfn = page_pfn & ~((1 << order) - 1); - end_pfn = pfn + (1 << order); + pfn = page_pfn & ~((1 << sc->order) - 1); + end_pfn = pfn + (1 << sc->order); for (; pfn < end_pfn; pfn++) { struct page *cursor_page; @@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(order, + trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, @@ -1533,9 +1533,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, - &nr_scanned, sc->order, - isolate_mode, 0, file); + nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, + sc, isolate_mode, 0, file); if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1711,8 +1710,7 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, - &nr_scanned, sc->order, + nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, isolate_mode, 1, file); if (global_reclaim(sc)) zone->pages_scanned += nr_scanned; @@ -2758,7 +2756,7 @@ loop_again: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int nr_slab; + int nr_slab, testorder; unsigned long balance_gap; if (!populated_zone(zone)) @@ -2791,7 +2789,20 @@ loop_again: (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / KSWAPD_ZONE_BALANCE_GAP_RATIO); - if (!zone_watermark_ok_safe(zone, order, + /* + * Kswapd reclaims only single pages with compaction + * enabled. Trying too hard to reclaim until contiguous + * free pages have become available can hurt performance + * by evicting too much useful data from memory. + * Do not reclaim more than needed for compaction. + */ + testorder = order; + if (COMPACTION_BUILD && order && + compaction_suitable(zone, order) != + COMPACT_SKIPPED) + testorder = 0; + + if (!zone_watermark_ok_safe(zone, testorder, high_wmark_pages(zone) + balance_gap, end_zone, 0)) { shrink_zone(priority, zone, &sc); @@ -2820,7 +2831,7 @@ loop_again: continue; } - if (!zone_watermark_ok_safe(zone, order, + if (!zone_watermark_ok_safe(zone, testorder, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2917,6 +2928,10 @@ out: if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; + /* Would compaction fail due to lack of free memory? 
*/ + if (compaction_suitable(zone, order) == COMPACT_SKIPPED) + goto loop_again; + /* Confirm the zone is balanced for order-0 */ if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone), 0, 0)) { @@ -2926,8 +2941,6 @@ out: /* If balanced, clear the congested flag */ zone_clear_flag(zone, ZONE_CONGESTED); - if (i <= *classzone_idx) - balanced += zone->present_pages; } } -- cgit v1.2.3 From 7be62de99adcab4449d416977b4274985c5fe023 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 21 Mar 2012 16:33:52 -0700 Subject: vmscan: kswapd carefully call compaction With CONFIG_COMPACTION enabled, kswapd does not try to free contiguous free pages, even when it is woken for a higher order request. This could be bad for eg. jumbo frame network allocations, which are done from interrupt context and cannot compact memory themselves. Higher than before allocation failure rates in the network receive path have been observed in kernels with compaction enabled. Teach kswapd to defragment the memory zones in a node, but only if required and compaction is not deferred in a zone. [akpm@linux-foundation.org: reduce scope of zones_need_compaction] Signed-off-by: Rik van Riel Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index d7dad2a4e69c..b2b4c4a0ada2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2919,6 +2919,8 @@ out: * and it is potentially going to sleep here. */ if (order) { + int zones_need_compaction = 1; + for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; @@ -2939,9 +2941,17 @@ out: goto loop_again; } + /* Check if the memory needs to be defragmented. */ + if (zone_watermark_ok(zone, order, + low_wmark_pages(zone), *classzone_idx, 0)) + zones_need_compaction = 0; + /* If balanced, clear the congested flag */ zone_clear_flag(zone, ZONE_CONGESTED); } + + if (zones_need_compaction) + compact_pgdat(pgdat, order); } /* -- cgit v1.2.3 From aff622495c9a0b56148192e53bdec539f5e147f2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 21 Mar 2012 16:33:52 -0700 Subject: vmscan: only defer compaction for failed order and higher Currently a failed order-9 (transparent hugepage) compaction can lead to memory compaction being temporarily disabled for a memory zone. Even if we only need compaction for an order 2 allocation, eg. for jumbo frames networking. The fix is relatively straightforward: keep track of the highest order at which compaction is succeeding, and only defer compaction for orders at which compaction is failing. 
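The vmscan side of this change is the one-line compaction_deferred(zone, sc->order) call in the hunk below; the bookkeeping it relies on lives in the compaction code, which is not part of this hunk. A rough sketch of what per-order deferral amounts to follows — the field names (compact_order_failed, compact_considered, compact_defer_shift) match the zone fields of that era, but the bodies are simplified and should not be read as the exact mm/compaction.c implementation:

/* Defer future compaction attempts at 'order' and above. */
static void defer_compaction_sketch(struct zone *zone, int order)
{
        zone->compact_considered = 0;
        zone->compact_defer_shift++;
        if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
                zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

        /* Remember the lowest order that has failed so far. */
        if (order < zone->compact_order_failed)
                zone->compact_order_failed = order;
}

/* Should compaction at 'order' be skipped for now? */
static bool compaction_deferred_sketch(struct zone *zone, int order)
{
        unsigned long defer_limit = 1UL << zone->compact_defer_shift;

        /* Orders below the lowest failed order are never deferred. */
        if (order < zone->compact_order_failed)
                return false;

        if (++zone->compact_considered > defer_limit)
                zone->compact_considered = defer_limit;

        return zone->compact_considered < defer_limit;
}

With bookkeeping of this shape, a failed order-9 THP compaction raises compact_order_failed to 9 and leaves an order-2 jumbo-frame compaction attempt unaffected.
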
Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index b2b4c4a0ada2..87e4d6a6dc11 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2198,7 +2198,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone)) + if (compaction_deferred(zone, sc->order)) return watermark_ok; /* If compaction is not ready to start, keep reclaiming */ -- cgit v1.2.3 From cc715d99e529d470dde2f33a6614f255adea71f3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:00 -0700 Subject: mm: vmscan: forcibly scan highmem if there are too many buffer_heads pinning highmem Stuart Foster reported on bugzilla that copying large amounts of data from NTFS caused an OOM kill on 32-bit X86 with 16G of memory. Andrew Morton correctly identified that the problem was NTFS was using 512 blocks meaning each page had 8 buffer_heads in low memory pinning it. In the past, direct reclaim used to scan highmem even if the allocating process did not specify __GFP_HIGHMEM but not any more. kswapd no longer will reclaim from zones that are above the high watermark. The intention in both cases was to minimise unnecessary reclaim. The downside is on machines with large amounts of highmem that lowmem can be fully consumed by buffer_heads with nothing trying to free them. The following patch is based on a suggestion by Andrew Morton to extend the buffer_heads_over_limit case to force kswapd and direct reclaim to scan the highmem zone regardless of the allocation request or watermarks. 
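To see why lowmem fills up, a back-of-the-envelope calculation helps. The snippet below is a stand-alone illustration only; the 56-byte buffer_head size, ~896MB of lowmem and ~15GB of highmem page cache are assumed ballpark figures for a 32-bit x86 box with 16G of RAM, not numbers taken from the report:

#include <stdio.h>

int main(void)
{
        /* Assumed ballpark figures; illustrative only. */
        const unsigned long long highmem_cache = 15ULL << 30;  /* ~15G of page cache */
        const unsigned long page_size  = 4096;
        const unsigned long block_size = 512;   /* NTFS block size in the report */
        const unsigned long bh_size    = 56;    /* approx. sizeof(struct buffer_head) */
        const unsigned long lowmem     = 896UL << 20;

        unsigned long bh_per_page = page_size / block_size;
        unsigned long long pinned =
                (highmem_cache / page_size) * bh_per_page * bh_size;

        printf("buffer_heads per page : %lu\n", bh_per_page);
        printf("lowmem pinned by them : %llu MB (of %lu MB lowmem)\n",
               pinned >> 20, lowmem >> 20);
        return 0;
}

Under those assumptions the buffer_heads alone would need roughly 1.7GB of lowmem, nearly twice what exists, and nothing in the old code would scan the highmem pages that own them — exactly the situation the patch addresses.
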
Addresses https://bugzilla.kernel.org/show_bug.cgi?id=42578 [hughd@google.com: move buffer_heads_over_limit check up] [akpm@linux-foundation.org: buffer_heads_over_limit is unlikely] Reported-by: Stuart Foster Tested-by: Stuart Foster Signed-off-by: Mel Gorman Signed-off-by: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Christoph Lameter Cc: stable Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 87e4d6a6dc11..ae3bf0a09cdd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1642,18 +1642,6 @@ static void move_active_pages_to_lru(struct zone *zone, unsigned long pgmoved = 0; struct page *page; - if (buffer_heads_over_limit) { - spin_unlock_irq(&zone->lru_lock); - list_for_each_entry(page, list, lru) { - if (page_has_private(page) && trylock_page(page)) { - if (page_has_private(page)) - try_to_release_page(page, 0); - unlock_page(page); - } - } - spin_lock_irq(&zone->lru_lock); - } - while (!list_empty(list)) { struct lruvec *lruvec; @@ -1735,6 +1723,14 @@ static void shrink_active_list(unsigned long nr_to_scan, continue; } + if (unlikely(buffer_heads_over_limit)) { + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } + if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* @@ -2238,6 +2234,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, unsigned long nr_soft_scanned; bool aborted_reclaim = false; + /* + * If the number of buffer_heads in the machine exceeds the maximum + * allowed level, force direct reclaim to scan the highmem zone as + * highmem pages could be pinning lowmem pages storing buffer_heads + */ + if (buffer_heads_over_limit) + sc->gfp_mask |= __GFP_HIGHMEM; + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) @@ -2727,6 +2731,17 @@ loop_again: */ age_active_anon(zone, &sc, priority); + /* + * If the number of buffer_heads in the machine + * exceeds the maximum allowed level and this node + * has a highmem zone, force kswapd to reclaim from + * it to relieve lowmem pressure. + */ + if (buffer_heads_over_limit && is_highmem_idx(i)) { + end_zone = i; + break; + } + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; @@ -2802,7 +2817,8 @@ loop_again: COMPACT_SKIPPED) testorder = 0; - if (!zone_watermark_ok_safe(zone, testorder, + if ((buffer_heads_over_limit && is_highmem_idx(i)) || + !zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + balance_gap, end_zone, 0)) { shrink_zone(priority, zone, &sc); -- cgit v1.2.3 From d563c0501bf8702b9b683206c09b9defb37d8a8a Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Wed, 21 Mar 2012 16:34:02 -0700 Subject: vmscan: handle isolated pages with lru lock released When shrinking inactive lru list, isolated pages are queued on locally private list, so the lock-hold time could be reduced if pages are counted without lock protection. To achieve that, firstly updating reclaim stat is delayed until the putback stage, after reacquiring the lru lock. Secondly, operations related to vm and zone stats are now proteced with preemption disabled as they are per-cpu operations. 
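The rule being relied on here: the double-underscored statistics helpers update per-CPU counters non-atomically, so they need preemption (or interrupts) disabled — something zone->lru_lock's spin_lock_irq() used to provide for free. A minimal sketch of the resulting pattern, reduced to two representative counters:

/*
 * Account pages that have already been moved to a private list,
 * without holding zone->lru_lock.  The __ variants of the stat
 * helpers are cheap but non-atomic, hence the preempt_disable().
 */
static void count_isolated_sketch(struct zone *zone,
                                  unsigned long nr_anon,
                                  unsigned long nr_file)
{
        preempt_disable();
        __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
        __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
        preempt_enable();
}

The reclaim_stat->recent_scanned[] updates, by contrast, are not per-CPU counters, so the patch simply defers them to the point where the lru lock has been retaken for putback, as the hunks below show.
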
Signed-off-by: Hillf Danton Acked-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index ae3bf0a09cdd..57d8ef6ee4dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz, unsigned long *nr_anon, unsigned long *nr_file) { - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); struct zone *zone = mz->zone; unsigned int count[NR_LRU_LISTS] = { 0, }; unsigned long nr_active = 0; @@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz, count[lru] += numpages; } + preempt_disable(); __count_vm_events(PGDEACTIVATE, nr_active); __mod_zone_page_state(zone, NR_ACTIVE_FILE, @@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz, *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - reclaim_stat->recent_scanned[0] += *nr_anon; - reclaim_stat->recent_scanned[1] += *nr_file; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); + preempt_enable(); } /* @@ -1511,6 +1512,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, unsigned long nr_writeback = 0; isolate_mode_t isolate_mode = ISOLATE_INACTIVE; struct zone *zone = mz->zone; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1544,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); } + spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) { - spin_unlock_irq(&zone->lru_lock); + if (nr_taken == 0) return 0; - } update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); - __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); - - spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); @@ -1569,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, spin_lock_irq(&zone->lru_lock); + reclaim_stat->recent_scanned[0] += nr_anon; + reclaim_stat->recent_scanned[1] += nr_file; + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); -- cgit v1.2.3 From c7cfa37b7324a190fc36ff116d79d0f899e8d273 Mon Sep 17 00:00:00 2001 From: Copot Alexandru Date: Wed, 21 Mar 2012 16:34:10 -0700 Subject: mm/vmscan.c: fix spelling error s/noticable/noticeable/ Signed-off-by: Copot Alexandru Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 57d8ef6ee4dd..440af1d899b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2261,8 +2261,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * Even though compaction is invoked for any * non-zero order, only frequent costly order * reclamation is disruptive enough to become a - * noticable problem, like transparent huge page - * allocations. + * noticeable problem, like transparent huge + * page allocations. 
*/ if (compaction_ready(zone, sc)) { aborted_reclaim = true; -- cgit v1.2.3 From cc9a6c8776615f9c194ccf0b63a0aa5628235545 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:11 -0700 Subject: cpuset: mm: reduce large amounts of memory barrier related damage v3 Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating at large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86. The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were 3.3.0-rc3 3.3.0-rc3 rc3-vanilla nobarrier-v2r1 Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%) Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%) Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%) Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%) Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%) Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%) Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%) Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%) Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%) Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%) Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%) Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%) Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%) Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%) Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%) MMTests Statistics: duration Sys Time Running Test (seconds) 135.68 132.17 User+Sys Time Running Test (seconds) 164.2 160.13 Total Elapsed Time (seconds) 123.46 120.87 The overall improvement is small but the System CPU time is much improved and roughly in correlation to what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. 
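The vmscan.c hunk below merely drops the old get_mems_allowed()/put_mems_allowed() pair; the retry logic lives in the allocator paths. A sketch of the read-side shape described above, written directly against the generic seqcount primitives — the mems_allowed_seq field and alloc_with_mask() helper are illustrative stand-ins (the actual patch funnels this through cpuset helpers), and the write side, which bumps the sequence count around the nodemask rewrite, is omitted:

/*
 * Allocator slow-path sketch: sample the nodemask sequence count,
 * try the allocation, and retry if a failure raced with a cpuset
 * nodemask update, which could otherwise surface as a spurious
 * allocation failure against a half-updated mask.
 */
static struct page *alloc_pages_retry_sketch(gfp_t gfp, unsigned int order)
{
        struct page *page;
        unsigned int cookie;

retry_cpuset:
        cookie = read_seqcount_begin(&current->mems_allowed_seq);

        page = alloc_with_mask(gfp, order, &current->mems_allowed);

        if (unlikely(!page &&
                     read_seqcount_retry(&current->mems_allowed_seq, cookie)))
                goto retry_cpuset;

        return page;
}

On the fast path this costs a read barrier and a compare instead of the paired full barriers, which is where the reported System CPU improvement comes from.
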
Signed-off-by: Mel Gorman Cc: Miao Xie Cc: David Rientjes Cc: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 440af1d899b9..55d86c9506f3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2343,7 +2343,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long writeback_threshold; bool aborted_reclaim; - get_mems_allowed(); delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2407,7 +2406,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, out: delayacct_freepages_end(); - put_mems_allowed(); if (sc->nr_reclaimed) return sc->nr_reclaimed; -- cgit v1.2.3 From 1480de0340a8d5f094b74d7c4b902456c9a06903 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:34:17 -0700 Subject: mm: forbid lumpy-reclaim in shrink_active_list() Reset the reclaim mode in shrink_active_list() to RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC. (sync/async sign is used only in shrink_page_list and does not affect shrink_active_list) Currenly shrink_active_list() sometimes works in lumpy-reclaim mode, if RECLAIM_MODE_LUMPYRECLAIM is left over from an earlier shrink_inactive_list(). Meanwhile, in age_active_anon() sc->reclaim_mode is totally zero. So the current behavior is too complex and confusing, and this looks like bug. In general, shrink_active_list() populates the inactive list for the next shrink_inactive_list(). Lumpy shring_inactive_list() isolates pages around the chosen one from both the active and inactive lists. So, there is no reason for lumpy isolation in shrink_active_list(). See also: https://lkml.org/lkml/2012/3/15/583 Signed-off-by: Konstantin Khlebnikov Proposed-by: Hugh Dickins Acked-by: Johannes Weiner Cc: Rik van Riel Cc: Minchan Kim Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 55d86c9506f3..49f15ef0a99a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1690,6 +1690,8 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); + reset_reclaim_mode(sc); + if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) -- cgit v1.2.3 From 643ac9fc5429e85b8b7f534544b80bcc4f34c367 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 23 Mar 2012 02:57:31 -0700 Subject: mm: fix testorder interaction between two kswapd patches Adjusting cc715d99e529 "mm: vmscan: forcibly scan highmem if there are too many buffer_heads pinning highmem" for -stable reveals that it was slightly wrong once on top of fe2c2a106663 "vmscan: reclaim at order 0 when compaction is enabled", which specifically adds testorder for the zone_watermark_ok_safe() test. 
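Condensed, the zone-scan decision in balance_pgdat() after both patches reads as below. This is a trimmed fragment shown in the context of the per-zone loop (i, sc, priority, balance_gap and end_zone come from the surrounding function), not stand-alone code:

        int testorder = order;

        /*
         * With compaction available, reclaim at order 0 once enough
         * free memory exists for compaction to take over.
         */
        if (COMPACTION_BUILD && order &&
            compaction_suitable(zone, order) != COMPACT_SKIPPED)
                testorder = 0;

        /*
         * Scan the zone when buffer_heads force highmem reclaim, or
         * when it fails the gap-adjusted watermark for testorder --
         * testing the raw order here was the mismatch this patch fixes.
         */
        if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
            !zone_watermark_ok_safe(zone, testorder,
                                    high_wmark_pages(zone) + balance_gap,
                                    end_zone, 0))
                shrink_zone(priority, zone, &sc);
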
Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Acked-by: Rik van Riel Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 49f15ef0a99a..7658fd6536dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2817,7 +2817,7 @@ loop_again: testorder = 0; if ((buffer_heads_over_limit && is_highmem_idx(i)) || - !zone_watermark_ok_safe(zone, order, + !zone_watermark_ok_safe(zone, testorder, high_wmark_pages(zone) + balance_gap, end_zone, 0)) { shrink_zone(priority, zone, &sc); -- cgit v1.2.3 From 496b919b3bdd957d4b1727df79bfa3751bced1c1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 24 Mar 2012 10:26:21 -0400 Subject: Fix potential endless loop in kswapd when compaction is not enabled We should only test compaction_suitable if the kernel is built with CONFIG_COMPACTION, otherwise the stub compaction_suitable function will always return COMPACT_SKIPPED and send kswapd into an infinite loop. Reported-by: Anton Blanchard Signed-off-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7658fd6536dd..33c332bbab73 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2946,7 +2946,8 @@ out: continue; /* Would compaction fail due to lack of free memory? */ - if (compaction_suitable(zone, order) == COMPACT_SKIPPED) + if (COMPACTION_BUILD && + compaction_suitable(zone, order) == COMPACT_SKIPPED) goto loop_again; /* Confirm the zone is balanced for order-0 */ -- cgit v1.2.3
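The failure mode is easiest to see from the !CONFIG_COMPACTION stub. The sketch below pairs a simplified version of that stub with the guarded check from the patch; the stub's exact signature in include/linux/compaction.h may differ slightly from what is shown here:

#ifndef CONFIG_COMPACTION
/* With compaction not built in, "suitable" can never be reported. */
static inline unsigned long compaction_suitable(struct zone *zone, int order)
{
        return COMPACT_SKIPPED;
}
#endif

        /*
         * balance_pgdat(), order > 0 exit path: without the
         * COMPACTION_BUILD test, the stub above makes this branch fire
         * on every pass, so kswapd keeps jumping back to loop_again.
         */
        if (COMPACTION_BUILD &&
            compaction_suitable(zone, order) == COMPACT_SKIPPED)
                goto loop_again;

With CONFIG_COMPACTION=y the check keeps its intended meaning: stay in the loop only while compaction genuinely lacks the free memory it needs.
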