From 33ef6b6984403a688189317ef46bb3caab3b70e0 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 2 Nov 2011 13:38:05 -0700 Subject: cgroups: more safe tasklist locking in cgroup_attach_proc Fix unstable tasklist locking in cgroup_attach_proc. According to this thread - https://lkml.org/lkml/2011/7/27/243 - RCU is not sufficient to guarantee the tasklist is stable w.r.t. de_thread and exit. Taking tasklist_lock for reading, instead of rcu_read_lock, ensures proper exclusion. Signed-off-by: Ben Blum Acked-by: Paul Menage Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: "Paul E. McKenney" Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 453100a4159d..64b0e73402df 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2027,7 +2027,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) goto out_free_group_list; /* prevent changes to the threadgroup list while we take a snapshot. */ - rcu_read_lock(); + read_lock(&tasklist_lock); if (!thread_group_leader(leader)) { /* * a race with de_thread from another thread's exec() may strip @@ -2036,7 +2036,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * throw this task away and try again (from cgroup_procs_write); * this is "double-double-toil-and-trouble-check locking". */ - rcu_read_unlock(); + read_unlock(&tasklist_lock); retval = -EAGAIN; goto out_free_group_list; } @@ -2057,7 +2057,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ group_size = i; - rcu_read_unlock(); + read_unlock(&tasklist_lock); /* * step 1: check that we can legitimately attach to the cgroup. -- cgit v1.2.3 From 77ceab8ea590d7dc6c8f055ce43dfebd74428107 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 2 Nov 2011 13:38:07 -0700 Subject: cgroups: don't attach task to subsystem if migration failed If a task has exited to the point it has called cgroup_exit() already, then we can't migrate it to another cgroup anymore. This can happen when we are attaching a task to a new cgroup between the call to ->can_attach_task() on subsystems and the migration that is eventually tried in cgroup_task_migrate(). In this case cgroup_task_migrate() returns -ESRCH and we don't want to attach the task to the subsystems because the attachment to the new cgroup itself failed. Fix this by only calling ->attach_task() on the subsystems if the cgroup migration succeeded. Reported-by: Oleg Nesterov Signed-off-by: Ben Blum Acked-by: Paul Menage Cc: Li Zefan Cc: Tejun Heo Signed-off-by: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 64b0e73402df..8386b21224ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2135,14 +2135,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) continue; - /* attach each task to each subsystem */ - for_each_subsys(root, ss) { - if (ss->attach_task) - ss->attach_task(cgrp, tsk); - } /* if the thread is PF_EXITING, it can just get skipped. */ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); - BUG_ON(retval != 0 && retval != -ESRCH); + if (retval == 0) { + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + } else { + BUG_ON(retval != -ESRCH); + } } /* nothing is sensitive to fork() after this point. */ -- cgit v1.2.3 From 89e8a244b97e48f1f30e898b6f32acca477f2a13 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 Nov 2011 13:38:39 -0700 Subject: cpusets: avoid looping when storing to mems_allowed if one node remains set {get,put}_mems_allowed() exist so that general kernel code may locklessly access a task's set of allowable nodes without having the chance that a concurrent write will cause the nodemask to be empty on configurations where MAX_NUMNODES > BITS_PER_LONG. This could incur a significant delay, however, especially in low memory conditions because the page allocator is blocking and reclaim requires get_mems_allowed() itself. It is not atypical to see writes to cpuset.mems take over 2 seconds to complete, for example. In low memory conditions, this is problematic because it's one of the most imporant times to change cpuset.mems in the first place! The only way a task's set of allowable nodes may change is through cpusets by writing to cpuset.mems and when attaching a task to a generic code is not reading the nodemask with get_mems_allowed() at the same time, and then clearing all the old nodes. This prevents the possibility that a reader will see an empty nodemask at the same time the writer is storing a new nodemask. If at least one node remains unchanged, though, it's possible to simply set all new nodes and then clear all the old nodes. Changing a task's nodemask is protected by cgroup_mutex so it's guaranteed that two threads are not changing the same task's nodemask at the same time, so the nodemask is guaranteed to be stored before another thread changes it and determines whether a node remains set or not. Signed-off-by: David Rientjes Cc: Miao Xie Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10131fdaff70..ed0ff443f036 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { + bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); + repeat: /* * Allow tasks that have access to memory reserves because they have @@ -963,7 +965,6 @@ repeat: nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* * ensure checking ->mems_allowed_change_disable after setting all new * allowed nodes. @@ -980,9 +981,11 @@ repeat: /* * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. + * for the read-side. No wait is necessary, however, if at least one + * node remains unchanged. */ - while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (masks_disjoint && + ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); -- cgit v1.2.3 From f1ecf06854a66ee663f4d4cf029c78cd62a15e04 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 2 Nov 2011 13:39:22 -0700 Subject: sysctl: add support for poll() Adding support for poll() in sysctl fs allows userspace to receive notifications of changes in sysctl entries. This adds a infrastructure to allow files in sysctl fs to be pollable and implements it for hostname and domainname. [akpm@linux-foundation.org: s/declare/define/ for definitions] Signed-off-by: Lucas De Marchi Cc: Greg KH Cc: Kay Sievers Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 ++ kernel/utsname_sysctl.c | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 58459509b14c..d06c091e0345 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1286,6 +1286,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } + uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1336,6 +1337,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } + uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index a2cd77e70d4d..3b0d48ebf81d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -13,6 +13,7 @@ #include #include #include +#include static void *get_uts(ctl_table *table, int write) { @@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write, uts_table.data = get_uts(table, write); r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); + + if (write) + proc_sys_poll_notify(table->poll); + return r; } #else #define proc_do_uts_string NULL #endif +static DEFINE_CTL_TABLE_POLL(hostname_poll); +static DEFINE_CTL_TABLE_POLL(domainname_poll); + static struct ctl_table uts_kern_table[] = { { .procname = "ostype", @@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &hostname_poll, }, { .procname = "domainname", @@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &domainname_poll, }, {} }; @@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = { {} }; +#ifdef CONFIG_PROC_SYSCTL +/* + * Notify userspace about a change in a certain entry of uts_kern_table, + * identified by the parameter proc. + */ +void uts_proc_notify(enum uts_proc proc) +{ + struct ctl_table *table = &uts_kern_table[proc]; + + proc_sys_poll_notify(table->poll); +} +#endif + static int __init utsname_sysctl_init(void) { register_sysctl_table(uts_root_table); -- cgit v1.2.3 From c1e2ee2dc436574880758b3836fc96935b774c32 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Wed, 2 Nov 2011 13:40:29 -0700 Subject: memcg: replace ss->id_lock with a rwlock While back-porting Johannes Weiner's patch "mm: memcg-aware global reclaim" for an internal effort, we noticed a significant performance regression during page-reclaim heavy workloads due to high contention of the ss->id_lock. This lock protects idr map, and serializes calls to idr_get_next() in css_get_next() (which is used during the memcg hierarchy walk). Since idr_get_next() is just doing a look up, we need only serialize it with respect to idr_remove()/idr_get_new(). By making the ss->id_lock a rwlock, contention is greatly reduced and performance improves. Tested: cat a 256m file from a ramdisk in a 128m container 50 times on each core (one file + container per core) in parallel on a NUMA machine. Result is the time for the test to complete in 1 of the containers. Both kernels included Johannes' memcg-aware global reclaim patches. Before rwlock patch: 1710.778s After rwlock patch: 152.227s Signed-off-by: Andrew Bresticker Cc: Paul Menage Cc: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8386b21224ef..d9d5648f3cdc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4883,9 +4883,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) rcu_assign_pointer(id->css, NULL); rcu_assign_pointer(css->id, NULL); - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4911,10 +4911,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) error = -ENOMEM; goto err_out; } - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ error = idr_get_new_above(&ss->idr, newid, 1, &myid); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); /* Returns error when there are no free spaces for new ID.*/ if (error) { @@ -4929,9 +4929,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) return newid; remove_idr: error = -ENOSPC; - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); idr_remove(&ss->idr, myid); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); err_out: kfree(newid); return ERR_PTR(error); @@ -4943,7 +4943,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, { struct css_id *newid; - spin_lock_init(&ss->id_lock); + rwlock_init(&ss->id_lock); idr_init(&ss->idr); newid = get_new_cssid(ss, 0); @@ -5038,9 +5038,9 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - spin_lock(&ss->id_lock); + read_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - spin_unlock(&ss->id_lock); + read_unlock(&ss->id_lock); if (!tmp) break; -- cgit v1.2.3