From a2c8990aed5ab000491732b07c8c4465d1b389b8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 24 May 2011 17:12:50 -0700 Subject: memsw: remove noswapaccount kernel parameter The noswapaccount parameter has been deprecated since 2.6.38 without any complaints from users, so we can remove it. swapaccount=0|1 can be used instead. As we are removing the parameter, we can also clean up swapaccount: it no longer has to accept an empty string (to match noswapaccount), so we can push the '=' into the __setup macro rather than checking for the "=1" and "=0" strings. Signed-off-by: Michal Hocko Cc: Hiroyuki Kamezawa Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 010f9166fa6e..d5fd3dcd3f2e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5169,19 +5169,12 @@ struct cgroup_subsys mem_cgroup_subsys = { static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ - if (!(*s) || !strcmp(s, "=1")) + if (!strcmp(s, "1")) really_do_swap_account = 1; - else if (!strcmp(s, "=0")) + else if (!strcmp(s, "0")) really_do_swap_account = 0; return 1; } -__setup("swapaccount", enable_swap_account); +__setup("swapaccount=", enable_swap_account); -static int __init disable_swap_account(char *s) -{ - printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); - enable_swap_account("=0"); - return 1; -} -__setup("noswapaccount", disable_swap_account); #endif -- cgit v1.2.3 From f780bdb7c1c73009cb57adcf99ef50027d80bf3c Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:19 -0700 Subject: cgroups: add per-thread subsystem callbacks Add cgroup subsystem callbacks for per-thread attachment in atomic contexts. Add can_attach_task(), pre_attach(), and attach_task() as new callbacks for the cgroup subsystem interface. Unlike can_attach and attach, these are for per-thread operations, to be called potentially many times when attaching an entire threadgroup. Also, the old "bool threadgroup" interface is removed, replaced by these callbacks. All subsystems are modified for the new interface - of note is cpuset, which requires from/to nodemasks for attach to be globally scoped (though per-cpuset would work too) to persist from its pre_attach to attach_task and attach. This is a pre-patch for cgroup-procs-writable.patch. Signed-off-by: Ben Blum
Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5fd3dcd3f2e..fc259926c170 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4953,8 +4953,7 @@ static void mem_cgroup_clear_mc(void) static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { int ret = 0; struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); @@ -4993,8 +4992,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { mem_cgroup_clear_mc(); } @@ -5112,8 +5110,7 @@ retry: static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { struct mm_struct *mm; @@ -5131,22 +5128,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { return 0; } static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } #endif -- cgit v1.2.3 From 0ae5e89c60c9eb87da36a2614836bc434b0ec2ad Mon Sep 17 00:00:00 2001 From: Ying Han Date: Thu, 26 May 2011 16:25:25 -0700 Subject: memcg: count the soft_limit reclaim in global background reclaim The global kswapd scans per-zone LRU and reclaims pages regardless of the cgroup. It breaks memory isolation since one cgroup can end up reclaiming pages from another cgroup. Instead we should rely on memcg-aware target reclaim including per-memcg kswapd and soft_limit hierarchical reclaim under memory pressure. In the global background reclaim, we do soft reclaim before scanning the per-zone LRU. However, the return value is ignored. This patch is the first step to skip shrink_zone() if soft_limit reclaim does enough work. This is part of the effort which tries to reduce reclaiming pages in global LRU in memcg. The per-memcg background reclaim patchset further enhances the per-cgroup targetting reclaim, which I should have V4 posted shortly. Try running multiple memory intensive workloads within seperate memcgs. Watch the counters of soft_steal in memory.stat. $ cat /dev/cgroup/A/memory.stat | grep 'soft' soft_steal 240000 soft_scan 240000 total_soft_steal 240000 total_soft_scan 240000 This patch: In the global background reclaim, we do soft reclaim before scanning the per-zone LRU. However, the return value is ignored. We would like to skip shrink_zone() if soft_limit reclaim does enough work. Also, we need to make the memory pressure balanced across per-memcg zones, like the logic vm-core. This patch is the first step where we start with counting the nr_scanned and nr_reclaimed from soft_limit reclaim into the global scan_control. 
Signed-off-by: Ying Han Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Rik van Riel Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Acked-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc259926c170..e41a6c26f1e7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1433,7 +1433,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, struct zone *zone, gfp_t gfp_mask, - unsigned long reclaim_options) + unsigned long reclaim_options, + unsigned long *total_scanned) { struct mem_cgroup *victim; int ret, total = 0; @@ -1442,6 +1443,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; unsigned long excess; + unsigned long nr_scanned; excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; @@ -1484,10 +1486,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, continue; } /* we use swappiness of local cgroup */ - if (check_soft) + if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, get_swappiness(victim), zone); - else + noswap, get_swappiness(victim), zone, + &nr_scanned); + *total_scanned += nr_scanned; + } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, get_swappiness(victim)); css_put(&victim->css); @@ -1928,7 +1932,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags, NULL); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* @@ -3211,7 +3215,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK); + MEM_CGROUP_RECLAIM_SHRINK, + NULL); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3271,7 +3276,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK); + MEM_CGROUP_RECLAIM_SHRINK, + NULL); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? 
*/ if (curusage >= oldusage) @@ -3285,7 +3291,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, } unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, + unsigned long *total_scanned) { unsigned long nr_reclaimed = 0; struct mem_cgroup_per_zone *mz, *next_mz = NULL; @@ -3293,6 +3300,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, int loop = 0; struct mem_cgroup_tree_per_zone *mctz; unsigned long long excess; + unsigned long nr_scanned; if (order > 0) return 0; @@ -3311,10 +3319,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, if (!mz) break; + nr_scanned = 0; reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, gfp_mask, - MEM_CGROUP_RECLAIM_SOFT); + MEM_CGROUP_RECLAIM_SOFT, + &nr_scanned); nr_reclaimed += reclaimed; + *total_scanned += nr_scanned; spin_lock(&mctz->lock); /* -- cgit v1.2.3 From 39cc98f1f8aa949afeea89f424c7494b0785d7da Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 26 May 2011 16:25:28 -0700 Subject: memcg: remove pointless next_mz nullification in mem_cgroup_soft_limit_reclaim() next_mz is assigned to NULL if __mem_cgroup_largest_soft_limit_node selects the same mz. This doesn't make much sense, as we assign to the variable right at the start of the next loop iteration. The compiler will probably optimize this out, but it is a little confusing to read. Signed-off-by: Michal Hocko Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e41a6c26f1e7..fc62c714f3b6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3348,10 +3348,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, */ next_mz = __mem_cgroup_largest_soft_limit_node(mctz); - if (next_mz == mz) { + if (next_mz == mz) css_put(&next_mz->mem->css); - next_mz = NULL; - } else /* next_mz == NULL or other memcg */ + else /* next_mz == NULL or other memcg */ break; } while (1); } -- cgit v1.2.3 From 889976dbcb1218119fdd950fb7819084e37d7d37 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Thu, 26 May 2011 16:25:33 -0700 Subject: memcg: reclaim memory from nodes in round-robin order Presently, memory cgroup's direct reclaim frees memory from the current node. But this has some problems. Usually when a set of threads works in a cooperative way, they tend to operate on the same node. So if they hit limits under memcg they will reclaim memory from themselves, damaging the active working set. For example, assume a 2-node system with Node 0 and Node 1 and a memcg with a 1G limit. After some work, file cache remains and the usage is Node 0: 1M, Node 1: 998M. If we then run an application on Node 0, it will reclaim its own working set before freeing the unnecessary file cache on Node 1. This patch adds round-robin over NUMA nodes, applying equal pressure to each node. When using cpuset's spread memory feature, this will work very well. But yes, a better algorithm is needed.
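As an illustration of how a reclaim path might consume this round-robin selection, here is a minimal caller-side sketch (the vmscan.c hookup is outside this memcontrol.c-limited view, so mem_cont, sc, and the do_try_to_free_pages() call are assumptions about the caller; mem_cgroup_select_victim_node() itself appears in the diff below):

	/*
	 * Sketch: instead of always reclaiming from the current node,
	 * ask the memcg which node to start from, rotating through the
	 * nodes it actually uses, then build the zonelist from there.
	 */
	int nid = mem_cgroup_select_victim_node(mem_cont);
	struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);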
[akpm@linux-foundation.org: comment editing] [kamezawa.hiroyu@jp.fujitsu.com: fix time comparisons] Signed-off-by: Ying Han Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: KOSAKI Motohiro Cc: Daisuke Nishimura Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc62c714f3b6..1520efd1c7c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -231,6 +231,11 @@ struct mem_cgroup { * reclaimed from. */ int last_scanned_child; + int last_scanned_node; +#if MAX_NUMNODES > 1 + nodemask_t scan_nodes; + unsigned long next_scan_node_update; +#endif /* * Should the accounting and control be hierarchical, per subtree? */ @@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, preempt_enable(); } +static unsigned long +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) +{ + struct mem_cgroup_per_zone *mz; + u64 total = 0; + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = mem_cgroup_zoneinfo(mem, nid, zid); + total += MEM_CGROUP_ZSTAT(mz, idx); + } + return total; +} static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { - int nid, zid; - struct mem_cgroup_per_zone *mz; + int nid; u64 total = 0; for_each_online_node(nid) - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(mem, nid, zid); - total += MEM_CGROUP_ZSTAT(mz, idx); - } + total += mem_cgroup_get_zonestat_node(mem, nid, idx); return total; } @@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) return ret; } +#if MAX_NUMNODES > 1 + +/* + * Always updating the nodemask is not very good - even if we have an empty + * list or the wrong list here, we can start from some node and traverse all + * nodes based on the zonelist. So update the list loosely once per 10 secs. + * + */ +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) +{ + int nid; + + if (time_after(mem->next_scan_node_update, jiffies)) + return; + + mem->next_scan_node_update = jiffies + 10*HZ; + /* make a nodemask where this memcg uses memory from */ + mem->scan_nodes = node_states[N_HIGH_MEMORY]; + + for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { + + if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) || + mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE)) + continue; + + if (total_swap_pages && + (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) || + mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON))) + continue; + node_clear(nid, mem->scan_nodes); + } +} + +/* + * Selecting a node where we start reclaim from. Because what we need is just + * reducing usage counter, start from anywhere is O,K. Considering + * memory reclaim from current node, there are pros. and cons. + * + * Freeing memory from current node means freeing memory from a node which + * we'll use or we've used. So, it may make LRU bad. And if several threads + * hit limits, it will see a contention on a node. But freeing from remote + * node means more costs for memory reclaim because of memory latency. + * + * Now, we use round-robin. Better algorithm is welcomed. 
+ */ +int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +{ + int node; + + mem_cgroup_may_update_nodemask(mem); + node = mem->last_scanned_node; + + node = next_node(node, mem->scan_nodes); + if (node == MAX_NUMNODES) + node = first_node(mem->scan_nodes); + /* + * We call this when we hit limit, not when pages are added to LRU. + * No LRU may hold pages because all pages are UNEVICTABLE or + * memcg is too small and all pages are not on LRU. In that case, + * we use curret node. + */ + if (unlikely(node == MAX_NUMNODES)) + node = numa_node_id(); + + mem->last_scanned_node = node; + return node; +} + +#else +int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +{ + return 0; +} +#endif + /* * Scan the hierarchy if needed to reclaim memory. We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->memsw, NULL); } mem->last_scanned_child = 0; + mem->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&mem->oom_notify); if (parent) -- cgit v1.2.3 From 4fd14ebf6e3b66423dfac2bc9defda7b83ee07b3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 26 May 2011 16:25:35 -0700 Subject: memcg: remove unused retry signal from reclaim If the memcg reclaim code detects the target memcg below its limit, it exits and returns a guaranteed non-zero value so that the charge is retried. Nowadays, the charge side checks the memcg limit itself and does not rely on this non-zero return value trick. This patch removes it. The reclaim code will now always return the true number of pages it reclaimed on its own. Signed-off-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Ying Han Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Cc: Balbir Singh Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1520efd1c7c4..bcb0a0bee1fc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1596,7 +1596,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!res_counter_soft_limit_excess(&root_mem->res)) return total; } else if (mem_cgroup_margin(root_mem)) - return 1 + total; + return total; } return total; } -- cgit v1.2.3 From 1bac180bd29e03989f50054af97b53b8d37a364a Mon Sep 17 00:00:00 2001 From: Ying Han Date: Thu, 26 May 2011 16:25:36 -0700 Subject: memcg: rename mem_cgroup_zone_nr_pages() to mem_cgroup_zone_nr_lru_pages() The caller of the function has been renamed to zone_nr_lru_pages(), and this just fixes up the memcg code to match. The old name is easily misread as the zone's total number of pages.
Signed-off-by: Ying Han Acked-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bcb0a0bee1fc..cc48a6854f7e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1078,9 +1078,9 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) return (active > inactive); } -unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, - struct zone *zone, - enum lru_list lru) +unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, + struct zone *zone, + enum lru_list lru) { int nid = zone_to_nid(zone); int zid = zone_idx(zone); -- cgit v1.2.3 From 406eb0c9ba765eb066406fd5ce9d5e2b169a4d5a Mon Sep 17 00:00:00 2001 From: Ying Han Date: Thu, 26 May 2011 16:25:37 -0700 Subject: memcg: add memory.numastat api for numa statistics The new API exports numa_maps-style information on a per-memcg basis: the per-memcg page distribution across real NUMA nodes. One use case is evaluating application performance by combining this information with the CPU allocation of the application. The output of memory.numa_stat follows a format similar to numa_maps:
total= N0= N1= ...
file= N0= N1= ...
anon= N0= N1= ...
unevictable= N0= N1= ...
And we have per-node: total = file + anon + unevictable
$ cat /dev/cgroup/memory/memory.numa_stat
total=250020 N0=87620 N1=52367 N2=45298 N3=64735
file=225232 N0=83402 N1=46160 N2=40522 N3=55148
anon=21053 N0=3424 N1=6207 N2=4776 N3=6646
unevictable=3735 N0=794 N1=0 N2=0 N3=2941
Signed-off-by: Ying Han Cc: Balbir Singh Cc: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cc48a6854f7e..4021fcd71b60 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1089,6 +1089,93 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, return MEM_CGROUP_ZSTAT(mz, lru); } +#ifdef CONFIG_NUMA +static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, + int nid) +{ + unsigned long ret; + + ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) + + mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE); + + return ret; +} + +static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_file_lru_pages(memcg, nid); + + return total; +} + +static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, + int nid) +{ + unsigned long ret; + + ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + + mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); + + return ret; +} + +static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid); + + return total; +} + +static unsigned long +mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid) +{ + return mem_cgroup_get_zonestat_node(memcg, nid,
LRU_UNEVICTABLE); +} + +static unsigned long +mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid); + + return total; +} + +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid) +{ + enum lru_list l; + u64 total = 0; + + for_each_lru(l) + total += mem_cgroup_get_zonestat_node(memcg, nid, l); + + return total; +} + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_lru_pages(memcg, nid); + + return total; +} +#endif /* CONFIG_NUMA */ + struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) { @@ -3944,6 +4031,51 @@ mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) mem_cgroup_get_local_stat(iter, s); } +#ifdef CONFIG_NUMA +static int mem_control_numa_stat_show(struct seq_file *m, void *arg) +{ + int nid; + unsigned long total_nr, file_nr, anon_nr, unevictable_nr; + unsigned long node_nr; + struct cgroup *cont = m->private; + struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); + + total_nr = mem_cgroup_nr_lru_pages(mem_cont); + seq_printf(m, "total=%lu", total_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); + seq_printf(m, "file=%lu", file_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); + seq_printf(m, "anon=%lu", anon_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); + seq_printf(m, "unevictable=%lu", unevictable_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, + nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + return 0; +} +#endif /* CONFIG_NUMA */ + static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, struct cgroup_map_cb *cb) { @@ -3954,6 +4086,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_local_stat(mem_cont, &mystat); + for (i = 0; i < NR_MCS_STAT; i++) { if (i == MCS_SWAP && !do_swap_account) continue; @@ -4377,6 +4510,22 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, return 0; } +#ifdef CONFIG_NUMA +static const struct file_operations mem_control_numa_stat_file_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int mem_control_numa_stat_open(struct inode *unused, struct file *file) +{ + struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; + + file->f_op = &mem_control_numa_stat_file_operations; + return single_open(file, mem_control_numa_stat_show, cont); +} +#endif /* CONFIG_NUMA */ + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4440,6 +4589,12 @@ static struct cftype mem_cgroup_files[] = { .unregister_event = mem_cgroup_oom_unregister_event, .private = 
MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, +#ifdef CONFIG_NUMA + { + .name = "numa_stat", + .open = mem_control_numa_stat_open, + }, +#endif }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP -- cgit v1.2.3 From 456f998ec817ebfa254464be4f089542fa390645 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Thu, 26 May 2011 16:25:38 -0700 Subject: memcg: add the pagefault count into memcg stats Two new stats in per-memcg memory.stat track the number of page faults and the number of major page faults: "pgfault" and "pgmajfault". They differ from the "pgpgin"/"pgpgout" stats, which count the number of pages charged/discharged to the cgroup and say nothing about reading or writing pages to disk. Tracking these two stats is valuable both for measuring an application's performance and for judging the efficiency of the kernel page reclaim path. Counting pagefaults per process is useful, but we also need the aggregated value, since processes are monitored and controlled on a cgroup basis in memcg. Functional test: check the total number of pgfault/pgmajfault of all memcgs and compare with the global vmstat value:
$ cat /proc/vmstat | grep fault
pgfault 1070751
pgmajfault 553
$ cat /dev/cgroup/memory.stat | grep fault
pgfault 1071138
pgmajfault 553
total_pgfault 1071142
total_pgmajfault 553
$ cat /dev/cgroup/A/memory.stat | grep fault
pgfault 199
pgmajfault 0
total_pgfault 199
total_pgmajfault 0
Performance test: run the page fault test (pft) with 16 threads faulting in 15G of anon pages in a 16G container. No regression was noticed in "flt/cpu/s". Sample output from pft:
TAG pft:anon-sys-default:
Gb Thr CLine User System Wall flt/cpu/s fault/wsec
15 16 1 0.67s 233.41s 14.76s 16798.546 266356.260
+-------------------------------------------------------------------------+
N Min Max Median Avg Stddev
x 10 16682.962 17344.027 16913.524 16928.812 166.5362
+ 10 16695.568 16923.896 16820.604 16824.652 84.816568
No difference proven at 95.0% confidence
[akpm@linux-foundation.org: fix build] [hughd@google.com: shmem fix] Signed-off-by: Ying Han Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Minchan Kim Cc: Daisuke Nishimura Acked-by: Balbir Singh Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4021fcd71b60..bd9052a5d3ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -94,6 +94,8 @@ enum mem_cgroup_events_index { MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ MEM_CGROUP_EVENTS_NSTATS, }; /* @@ -590,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } +void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) +{ + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); +} + +void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) +{ + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); +} + static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, enum mem_cgroup_events_index idx) { @@ -827,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) return (mem == root_mem_cgroup); } +void mem_cgroup_count_vm_event(struct mm_struct *mm, enum
vm_event_item idx) +{ + struct mem_cgroup *mem; + + if (!mm) + return; + + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) + goto out; + + switch (idx) { + case PGMAJFAULT: + mem_cgroup_pgmajfault(mem, 1); + break; + case PGFAULT: + mem_cgroup_pgfault(mem, 1); + break; + default: + BUG(); + } +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(mem_cgroup_count_vm_event); + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -3958,6 +3997,8 @@ enum { MCS_PGPGIN, MCS_PGPGOUT, MCS_SWAP, + MCS_PGFAULT, + MCS_PGMAJFAULT, MCS_INACTIVE_ANON, MCS_ACTIVE_ANON, MCS_INACTIVE_FILE, @@ -3980,6 +4021,8 @@ struct { {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, {"swap", "total_swap"}, + {"pgfault", "total_pgfault"}, + {"pgmajfault", "total_pgmajfault"}, {"inactive_anon", "total_inactive_anon"}, {"active_anon", "total_active_anon"}, {"inactive_file", "total_inactive_file"}, @@ -4008,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); s->stat[MCS_SWAP] += val * PAGE_SIZE; } + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); + s->stat[MCS_PGFAULT] += val; + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); + s->stat[MCS_PGMAJFAULT] += val; /* per zone stat */ val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); -- cgit v1.2.3 From a433658c30974fc87ba3ff52d7e4e6299762aa3d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 15 Jun 2011 15:08:13 -0700 Subject: vmscan,memcg: memcg aware swap token Currently, memcg reclaim can disable the swap token even if the swap token mm doesn't belong to its memory cgroup. That is slightly risky: if an admin creates a very small mem-cgroup and a misbehaving task runs a contentious, heavy memory-pressure workload in it, every task is going to lose its swap token and the system may become unresponsive. That's bad. This patch adds a 'memcg' parameter to disable_swap_token(), and if the parameter doesn't match the swap token's memcg, the VM doesn't disable it.
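For illustration, a minimal sketch of the check this enables (the disable_swap_token() change itself lands outside mm/memcontrol.c and is not shown in this memcontrol.c-limited view; the shape below is an assumption, though try_get_mem_cgroup_from_mm() is real and is un-static-ed by this very patch):

/*
 * Sketch only -- not the actual patch. Drop the swap token only if its
 * owner belongs to the memcg under reclaim; global reclaim
 * (memcg == NULL) may always drop it.
 */
static void disable_swap_token(struct mem_cgroup *memcg)
{
	struct mem_cgroup *token_memcg;

	if (!swap_token_mm)
		return;
	if (!memcg) {			/* global reclaim */
		put_swap_token(swap_token_mm);
		return;
	}
	token_memcg = try_get_mem_cgroup_from_mm(swap_token_mm);
	if (token_memcg) {
		if (token_memcg == memcg)	/* pressure is on the owner */
			put_swap_token(swap_token_mm);
		css_put(&token_memcg->css);	/* drop the reference taken above */
	}
}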
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bd9052a5d3ad..e37c44d88d46 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -735,7 +735,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) struct mem_cgroup, css); } -static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) +struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *mem = NULL; @@ -5414,18 +5414,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *old_cont, struct task_struct *p) { - struct mm_struct *mm; + struct mm_struct *mm = get_task_mm(p); - if (!mc.to) - /* no need to move charge */ - return; - - mm = get_task_mm(p); if (mm) { - mem_cgroup_move_charge(mm); + if (mc.to) + mem_cgroup_move_charge(mm); + put_swap_token(mm); mmput(mm); } - mem_cgroup_clear_mc(); + if (mc.to) + mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, -- cgit v1.2.3 From 8957712710e045044e3c44375c6a87d7ffa17d51 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 15 Jun 2011 15:08:41 -0700 Subject: mm: memory.numa_stat: fix file permission Commit 406eb0c9ba76 ("memcg: add memory.numastat api for numa statistics") adds the memory.numa_stat file for memory cgroups. But the file permissions are wrong:
[kamezawa@bluextal linux-2.6]$ ls -l /cgroup/memory/A/memory.numa_stat
---------- 1 root root 0 Jun 9 18:36 /cgroup/memory/A/memory.numa_stat
This patch fixes the permissions to:
[root@bluextal kamezawa]# ls -l /cgroup/memory/A/memory.numa_stat
-r--r--r-- 1 root root 0 Jun 10 16:49 /cgroup/memory/A/memory.numa_stat
Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e37c44d88d46..02a7947608ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4640,6 +4640,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "numa_stat", .open = mem_control_numa_stat_open, + .mode = S_IRUGO, }, #endif }; -- cgit v1.2.3 From 7ae534d074e01e54d5cfbc9734b73fdfc855501f Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 15 Jun 2011 15:08:44 -0700 Subject: memcg: fix wrong check of noswap with softlimit Hierarchical reclaim doesn't swap out if the memsw and resource limits are the same (memsw_is_minimum == true), because we would hit the mem+swap limit anyway (during hard limit reclaim). When it comes to the soft limit, we shouldn't consider memsw_is_minimum at all, because it doesn't make much sense there. Either the soft limit is below the hard limit, and then we cannot hit the mem+swap limit, or direct reclaim takes precedence.
Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Acked-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 02a7947608ad..0b1a32cbd74d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1663,7 +1663,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (root_mem->memsw_is_minimum) + if (!check_soft && root_mem->memsw_is_minimum) noswap = true; while (1) { -- cgit v1.2.3 From 26fe616844491a41a1abc02e29f7a9d1ec2f8ddb Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 15 Jun 2011 15:08:45 -0700 Subject: memcg: fix percpu cached charge draining frequency For performance, the memory cgroup caches some "charge" from the res_counter in a per cpu cache. This works well, but because it is a cache, it needs to be flushed in some cases. Typical cases are: 1. when someone hits the limit, and 2. when rmdir() is called and charges need to be 0. But "1" has a problem. Recently, with large SMP machines, we see many kworker runs caused by flushing memcg's cache. The bad thing in the implementation is that the drain code is called even if a cpu contains a cache for a memcg unrelated to the memcg which hits its limit. This patch: A) checks whether the percpu cache contains useful data, B) checks that another asynchronous percpu drain isn't already running, and C) doesn't call the local cpu callback. (*) This patch avoids changing the calling condition for the hard limit. When I run "cat 1Gfile > /dev/null" under a 300M limit memcg:
[Before]
13767 kamezawa 20 0 98.6m 424 416 D 10.0 0.0 0:00.61 cat
58 root 20 0 0 0 0 S 0.6 0.0 0:00.09 kworker/2:1
60 root 20 0 0 0 0 S 0.6 0.0 0:00.08 kworker/4:1
4 root 20 0 0 0 0 S 0.3 0.0 0:00.02 kworker/0:0
57 root 20 0 0 0 0 S 0.3 0.0 0:00.05 kworker/1:1
61 root 20 0 0 0 0 S 0.3 0.0 0:00.05 kworker/5:1
62 root 20 0 0 0 0 S 0.3 0.0 0:00.05 kworker/6:1
63 root 20 0 0 0 0 S 0.3 0.0 0:00.05 kworker/7:1
[After]
2676 root 20 0 98.6m 416 416 D 9.3 0.0 0:00.87 cat
2626 kamezawa 20 0 15192 1312 920 R 0.3 0.0 0:00.28 top
1 root 20 0 19384 1496 1204 S 0.0 0.0 0:00.66 init
2 root 20 0 0 0 0 S 0.0 0.0 0:00.00 kthreadd
3 root 20 0 0 0 0 S 0.0 0.0 0:00.00 ksoftirqd/0
4 root 20 0 0 0 0 S 0.0 0.0 0:00.00 kworker/0:0
[akpm@linux-foundation.org: make percpu_charge_mutex static, tweak comments] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Reviewed-by: Michal Hocko Tested-by: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 54 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 16 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b1a32cbd74d..c39a177bb641 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -359,7 +359,7 @@ enum charge_type { static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); -static void drain_all_stock_async(void); +static void drain_all_stock_async(struct mem_cgroup *mem); static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) @@ -1671,7 +1671,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (victim == root_mem) { loop++; if (loop >= 1) -
drain_all_stock_async(); + drain_all_stock_async(root_mem); if (loop >= 2) { /* * If we have not been able to reclaim @@ -1934,9 +1934,11 @@ struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; struct work_struct work; + unsigned long flags; +#define FLUSHING_CACHED_CHARGE (0) }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); -static atomic_t memcg_drain_count; +static DEFINE_MUTEX(percpu_charge_mutex); /* * Try to consume stocked charge on this cpu. If success, one page is consumed @@ -1984,6 +1986,7 @@ static void drain_local_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); drain_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } /* @@ -2008,26 +2011,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) * expects some charges will be back to res_counter later but cannot wait for * it. */ -static void drain_all_stock_async(void) +static void drain_all_stock_async(struct mem_cgroup *root_mem) { - int cpu; - /* This function is for scheduling "drain" in asynchronous way. - * The result of "drain" is not directly handled by callers. Then, - * if someone is calling drain, we don't have to call drain more. - * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if - * there is a race. We just do loose check here. + int cpu, curcpu; + /* + * If someone calls draining, avoid adding more kworker runs. */ - if (atomic_read(&memcg_drain_count)) + if (!mutex_trylock(&percpu_charge_mutex)) return; /* Notify other cpus that system-wide "drain" is running */ - atomic_inc(&memcg_drain_count); get_online_cpus(); + /* + * Get a hint for avoiding draining charges on the current cpu, + * which must be exhausted by our charging. It is not required that + * this be a precise check, so we use raw_smp_processor_id() instead of + * getcpu()/putcpu(). + */ + curcpu = raw_smp_processor_id(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - schedule_work_on(cpu, &stock->work); + struct mem_cgroup *mem; + + if (cpu == curcpu) + continue; + + mem = stock->cached; + if (!mem) + continue; + if (mem != root_mem) { + if (!root_mem->use_hierarchy) + continue; + /* check whether "mem" is under tree of "root_mem" */ + if (!css_is_ancestor(&mem->css, &root_mem->css)) + continue; + } + if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + schedule_work_on(cpu, &stock->work); } put_online_cpus(); - atomic_dec(&memcg_drain_count); + mutex_unlock(&percpu_charge_mutex); /* We don't wait for flush_work */ } @@ -2035,9 +2057,9 @@ static void drain_all_stock_async(void) static void drain_all_stock_sync(void) { /* called when force_empty is called */ - atomic_inc(&memcg_drain_count); + mutex_lock(&percpu_charge_mutex); schedule_on_each_cpu(drain_local_stock); - atomic_dec(&memcg_drain_count); + mutex_unlock(&percpu_charge_mutex); } /* -- cgit v1.2.3 From fbc29a25e484be073e7d762c9f7f1d4bf8aecc48 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 15 Jun 2011 15:08:46 -0700 Subject: memcg: avoid percpu cached charge draining at softlimit Based on Michal Hocko's comment. We are not draining per cpu cached charges during soft limit reclaim because background reclaim doesn't care about charges. It tries to free some memory and charges will not give any. 
Cached charges might influence only the selection of the biggest soft limit offender, but as the call is made only after that selection has already happened, it makes no difference. Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c39a177bb641..cf7d027a8844 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1670,7 +1670,13 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) { loop++; - if (loop >= 1) + /* + * We are not draining per cpu cached charges during + * soft limit reclaim because global reclaim doesn't + * care about charges. It tries to free some memory and + * charges will not give any. + */ + if (!check_soft && loop >= 1) drain_all_stock_async(root_mem); if (loop >= 2) { /* -- cgit v1.2.3
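For context on the mechanism these last two patches tune, here is a simplified sketch of the per-cpu charge cache fast path (condensed from the memcontrol.c of this era; not an exact copy of the kernel code):

/*
 * Simplified sketch: charging first tries the cpu-local "stock" and only
 * falls back to the contended res_counter when the cache is empty or
 * cached for a different memcg. The drain patches above control when and
 * where these per-cpu caches get flushed back to the res_counter.
 */
static bool consume_stock(struct mem_cgroup *mem)
{
	struct memcg_stock_pcp *stock;
	bool ret = true;

	stock = &get_cpu_var(memcg_stock);
	if (mem == stock->cached && stock->nr_pages)
		stock->nr_pages--;	/* hit: one page worth of charge */
	else
		ret = false;		/* miss: caller charges res_counter */
	put_cpu_var(memcg_stock);
	return ret;
}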