mm: vmscan: fix the page state calculation in too_many_isolated

It is observed that sometimes multiple tasks get blocked in the congestion_wait loop below, in shrink_inactive_list.

(__schedule) from [<c0a03328>]
(schedule_timeout) from [<c0a04940>]
(io_schedule_timeout) from [<c01d585c>]
(congestion_wait) from [<c01cc9d8>]
(shrink_inactive_list) from [<c01cd034>]
(shrink_zone) from [<c01cdd08>]
(try_to_free_pages) from [<c01c442c>]
(__alloc_pages_nodemask) from [<c01f1884>]
(new_slab) from [<c09fcf60>]
(__slab_alloc) from [<c01f1a6c>]

In one such instance, zone_page_state(zone, NR_ISOLATED_FILE) had returned 14, zone_page_state(zone, NR_INACTIVE_FILE) returned 92, and the gfp_flag was GFP_KERNEL, which resulted in too_many_isolated returning true. But one of the CPU pageset vmstat diffs had NR_ISOLATED_FILE as -14. As there weren't any more updates to the per-cpu pageset, the threshold wasn't met, and the tasks were blocked in the congestion wait.

This patch uses zone_page_state_snapshot instead, but restricts its usage to avoid a performance penalty.

Change-Id: Iec767a548e524729c7ed79a92fe4718cdd08ce69
Signed-off-by: Vinayak Menon <vinmenon@codeaurora.org>
author: Vinayak Menon <vinmenon@codeaurora.org> 2014-12-26 19:29:41 +0530
committer: Kyle Yan <kyan@codeaurora.org> 2016-05-31 15:23:28 -0700
commit: 44bd107fc9cdb2d3b195e01611b074f609b97f1a (patch)
tree: dee71c4f53a91e03a5cc8e847f36899ecab1d0e6 /mm/vmscan.c
parent: eaee620aa28e718a9eabe642b2249764f2d86614 (diff)
1 files changed, 49 insertions, 19 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c2bd5b3293bf..3f702c2f9d58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1498,6 +1498,44 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
+static int __too_many_isolated(struct zone *zone, int file,
+	struct scan_control *sc, int safe)
+{
+	unsigned long inactive, isolated;
+
+	if (file) {
+		if (safe) {
+			inactive = zone_page_state_snapshot(zone,
+					NR_INACTIVE_FILE);
+			isolated = zone_page_state_snapshot(zone,
+					NR_ISOLATED_FILE);
+		} else {
+			inactive = zone_page_state(zone, NR_INACTIVE_FILE);
+			isolated = zone_page_state(zone, NR_ISOLATED_FILE);
+		}
+	} else {
+		if (safe) {
+			inactive = zone_page_state_snapshot(zone,
+					NR_INACTIVE_ANON);
+			isolated = zone_page_state_snapshot(zone,
+					NR_ISOLATED_ANON);
+		} else {
+			inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+			isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+		}
+	}
+
+	/*
+	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+	 * won't get blocked by normal direct-reclaimers, forming a circular
+	 * deadlock.
+	 */
+	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
+		inactive >>= 3;
+
+	return isolated > inactive;
+}
+
 /*
  * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
  * then get resheduled. When there are massive number of tasks doing page
@@ -1506,33 +1544,22 @@ int isolate_lru_page(struct page *page)
  * unnecessary swapping, thrashing and OOM.
  */
 static int too_many_isolated(struct zone *zone, int file,
-		struct scan_control *sc)
+		struct scan_control *sc, int safe)
 {
-	unsigned long inactive, isolated;
-
 	if (current_is_kswapd())
 		return 0;
 
 	if (!sane_reclaim(sc))
 		return 0;
 
-	if (file) {
-		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
-	} else {
-		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+	if (unlikely(__too_many_isolated(zone, file, sc, 0))) {
+		if (safe)
+			return __too_many_isolated(zone, file, sc, safe);
+		else
+			return 1;
 	}
 
-	/*
-	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
-	 * won't get blocked by normal direct-reclaimers, forming a circular
-	 * deadlock.
-	 */
-	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
-		inactive >>= 3;
-
-	return isolated > inactive;
+	return 0;
 }
 
 static noinline_for_stack void
@@ -1622,15 +1649,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	unsigned long nr_immediate = 0;
 	isolate_mode_t isolate_mode = 0;
 	int file = is_file_lru(lru);
+	int safe = 0;
 	struct zone *zone = lruvec_zone(lruvec);
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 
-	while (unlikely(too_many_isolated(zone, file, sc))) {
+	while (unlikely(too_many_isolated(zone, file, sc, safe))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		/* We are about to die and free our memory. Return now. */
 		if (fatal_signal_pending(current))
 			return SWAP_CLUSTER_MAX;
+
+		safe = 1;
 	}
 
 	lru_add_drain();
author	Vinayak Menon <vinmenon@codeaurora.org>	2014-12-26 19:29:41 +0530
committer	Kyle Yan <kyan@codeaurora.org>	2016-05-31 15:23:28 -0700
commit	44bd107fc9cdb2d3b195e01611b074f609b97f1a (patch)
tree	dee71c4f53a91e03a5cc8e847f36899ecab1d0e6 /mm/vmscan.c
parent	eaee620aa28e718a9eabe642b2249764f2d86614 (diff)