From 2f7ee5691eecb67c8108b92001a85563ea336ac5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:21 -0800 Subject: cgroup: introduce cgroup_taskset and use it in subsys->can_attach(), cancel_attach() and attach() Currently, there's no way to pass multiple tasks to cgroup_subsys methods necessitating the need for separate per-process and per-task methods. This patch introduces cgroup_taskset which can be used to pass multiple tasks and their associated cgroups to cgroup_subsys methods. Three methods - can_attach(), cancel_attach() and attach() - are converted to use cgroup_taskset. This unifies passed parameters so that all methods have access to all information. Conversions in this patchset are identical and don't introduce any behavior change. -v2: documentation updated as per Paul Menage's suggestion. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Frederic Weisbecker Acked-by: Paul Menage Acked-by: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Cc: James Morris --- kernel/cpuset.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel/cpuset.c') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9fe58c46a426..512bd59e8627 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1371,10 +1371,10 @@ static int fmeter_getrate(struct fmeter *fmp) } /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct task_struct *tsk) +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = cgroup_cs(cgrp); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; @@ -1387,7 +1387,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may * be 
changed. */ - if (tsk->flags & PF_THREAD_BOUND) + if (cgroup_taskset_first(tset)->flags & PF_THREAD_BOUND) return -EINVAL; return 0; @@ -1437,12 +1437,14 @@ static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) cpuset_update_task_spread_flag(cs, tsk); } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *oldcont, struct task_struct *tsk) +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { struct mm_struct *mm; - struct cpuset *cs = cgroup_cs(cont); - struct cpuset *oldcs = cgroup_cs(oldcont); + struct task_struct *tsk = cgroup_taskset_first(tset); + struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *oldcs = cgroup_cs(oldcgrp); /* * Change mm, possibly for multiple threads in a threadgroup. This is -- cgit v1.2.3 From bb9d97b6dffa10cec5e1ce9adbce60f3c2b5eabc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:21 -0800 Subject: cgroup: don't use subsys->can_attach_task() or ->attach_task() Now that subsys->can_attach() and attach() take @tset instead of @task, they can handle per-task operations. Convert ->can_attach_task() and ->attach_task() users to use ->can_attach() and attach() instead. Most conversions are straight-forward. Noteworthy changes are, * In cgroup_freezer, remove unnecessary NULL assignments to unused methods. It's useless and very prone to get out of sync, which already happened. * In cpuset, PF_THREAD_BOUND test is checked for each task. This doesn't make any practical difference but is conceptually cleaner. 
Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Frederic Weisbecker Acked-by: Li Zefan Cc: Paul Menage Cc: Balbir Singh Cc: Daisuke Nishimura Cc: James Morris Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/cpuset.c | 70 ++++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 38 deletions(-) (limited to 'kernel/cpuset.c') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 512bd59e8627..9a8a61301524 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1375,33 +1375,34 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); + struct task_struct *task; + int ret; if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; - /* - * Kthreads bound to specific cpus cannot be moved to a new cpuset; we - * cannot change their cpu affinity and isolating such threads by their - * set of allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may - * be changed. - */ - if (cgroup_taskset_first(tset)->flags & PF_THREAD_BOUND) - return -EINVAL; - + cgroup_taskset_for_each(task, cgrp, tset) { + /* + * Kthreads bound to specific cpus cannot be moved to a new + * cpuset; we cannot change their cpu affinity and + * isolating such threads by their set of allowed nodes is + * unnecessary. Thus, cpusets are not applicable for such + * threads. This prevents checking for success of + * set_cpus_allowed_ptr() on all attached tasks before + * cpus_allowed may be changed. + */ + if (task->flags & PF_THREAD_BOUND) + return -EINVAL; + if ((ret = security_task_setscheduler(task))) + return ret; + } return 0; } -static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) -{ - return security_task_setscheduler(task); -} - /* * Protected by cgroup_lock. 
The nodemasks must be stored globally because * dynamically allocating them is not allowed in pre_attach, and they must - * persist among pre_attach, attach_task, and attach. + * persist among pre_attach, and attach. */ static cpumask_var_t cpus_attach; static nodemask_t cpuset_attach_nodemask_from; @@ -1420,39 +1421,34 @@ static void cpuset_pre_attach(struct cgroup *cont) guarantee_online_mems(cs, &cpuset_attach_nodemask_to); } -/* Per-thread attachment work. */ -static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) -{ - int err; - struct cpuset *cs = cgroup_cs(cont); - - /* - * can_attach beforehand should guarantee that this doesn't fail. - * TODO: have a better way to handle failure here - */ - err = set_cpus_allowed_ptr(tsk, cpus_attach); - WARN_ON_ONCE(err); - - cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, tsk); -} - static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup_taskset *tset) { struct mm_struct *mm; - struct task_struct *tsk = cgroup_taskset_first(tset); + struct task_struct *task; + struct task_struct *leader = cgroup_taskset_first(tset); struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + cgroup_taskset_for_each(task, cgrp, tset) { + /* + * can_attach beforehand should guarantee that this doesn't + * fail. TODO: have a better way to handle failure here + */ + WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + + cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); + cpuset_update_task_spread_flag(cs, task); + } + /* * Change mm, possibly for multiple threads in a threadgroup. This is * expensive and may sleep. 
*/ cpuset_attach_nodemask_from = oldcs->mems_allowed; cpuset_attach_nodemask_to = cs->mems_allowed; - mm = get_task_mm(tsk); + mm = get_task_mm(leader); if (mm) { mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) @@ -1908,9 +1904,7 @@ struct cgroup_subsys cpuset_subsys = { .create = cpuset_create, .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, - .can_attach_task = cpuset_can_attach_task, .pre_attach = cpuset_pre_attach, - .attach_task = cpuset_attach_task, .attach = cpuset_attach, .populate = cpuset_populate, .post_clone = cpuset_post_clone, -- cgit v1.2.3 From 94196f51c1ee5bbad674de28c682b17d78adb8e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:22 -0800 Subject: cgroup, cpuset: don't use ss->pre_attach() ->pre_attach() is supposed to be called before migration, which is observed during process migration but task migration does it the other way around. The only ->pre_attach() user is cpuset which can do the same operations in ->can_attach(). Collapse cpuset_pre_attach() into cpuset_can_attach(). -v2: Patch contamination from later patch removed. Spotted by Paul Menage. Signed-off-by: Tejun Heo Reviewed-by: Frederic Weisbecker Acked-by: Paul Menage Cc: Li Zefan --- kernel/cpuset.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel/cpuset.c') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9a8a61301524..42e568306382 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1370,6 +1370,15 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +/* + * Protected by cgroup_lock. The nodemasks must be stored globally because + * dynamically allocating them is not allowed in can_attach, and they must + * persist until attach. 
+ */ +static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_from; +static nodemask_t cpuset_attach_nodemask_to; + /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup_taskset *tset) @@ -1396,29 +1405,16 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, if ((ret = security_task_setscheduler(task))) return ret; } - return 0; -} - -/* - * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in pre_attach, and they must - * persist among pre_attach, and attach. - */ -static cpumask_var_t cpus_attach; -static nodemask_t cpuset_attach_nodemask_from; -static nodemask_t cpuset_attach_nodemask_to; - -/* Set-up work for before attaching each task. */ -static void cpuset_pre_attach(struct cgroup *cont) -{ - struct cpuset *cs = cgroup_cs(cont); + /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); else guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + + return 0; } static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, @@ -1904,7 +1900,6 @@ struct cgroup_subsys cpuset_subsys = { .create = cpuset_create, .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, - .pre_attach = cpuset_pre_attach, .attach = cpuset_attach, .populate = cpuset_populate, .post_clone = cpuset_post_clone, -- cgit v1.2.3 From b246272ecc5ac68c743b15c9e41a2275f7ce70e2 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 19 Dec 2011 17:11:52 -0800 Subject: cpusets: stall when updating mems_allowed for mempolicy or disjoint nodemask Kernels where MAX_NUMNODES > BITS_PER_LONG may temporarily see an empty nodemask in a tsk's mempolicy if its previous nodemask is remapped onto a new set of allowed cpuset nodes where the two nodemasks, as a result of the remap, are now 
disjoint. c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") adds get_mems_allowed() to prevent the set of allowed nodes from changing for a thread. This causes any update to a set of allowed nodes to stall until put_mems_allowed() is called. This stall is unnecessary, however, if at least one node remains unchanged in the update to the set of allowed nodes. This was addressed by 89e8a244b97e ("cpusets: avoid looping when storing to mems_allowed if one node remains set"), but it's still possible that an empty nodemask may be read from a mempolicy because the old nodemask may be remapped to the new nodemask during rebind. To prevent this, only avoid the stall if there is no mempolicy for the thread being changed. This is a temporary solution until all reads from mempolicy nodemasks can be guaranteed to not be empty without the get_mems_allowed() synchronization. Also moves the check for nodemask intersection inside task_lock() so that tsk->mems_allowed cannot change. This ensures that nothing can set this tsk's mems_allowed out from under us and also protects tsk->mempolicy. 
Reported-by: Miao Xie Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Cc: Paul Menage Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'kernel/cpuset.c') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9fe58c46a426..0b1712dba587 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } +#ifdef CONFIG_NUMA +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return task->mempolicy; +} +#else +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return false; +} +#endif + + /* bits in struct cpuset flags field */ typedef enum { CS_CPU_EXCLUSIVE, @@ -949,7 +962,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); + bool need_loop; repeat: /* @@ -962,6 +975,14 @@ repeat: return; task_lock(tsk); + /* + * Determine if a loop is necessary if another thread is doing + * get_mems_allowed(). If at least one node remains unchanged and + * tsk does not have a mempolicy, then an empty nodemask will not be + * possible when mems_allowed is larger than a word. + */ + need_loop = task_has_mempolicy(tsk) || + !nodes_intersects(*newmems, tsk->mems_allowed); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -981,11 +1002,9 @@ repeat: /* * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. No wait is necessary, however, if at least one - * node remains unchanged. + * for the read-side. 
*/ - while (masks_disjoint && - ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); -- cgit v1.2.3