From 50a323b73069b169385a8ac65633dee837a7d13f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Jun 2010 21:40:36 +0200 Subject: sched: define and use CPU_PRI_* enums for cpu notifier priorities Instead of hardcoding priority 10 and 20 in sched and perf, collect them into CPU_PRI_* enums. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Rusty Russell Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- include/linux/cpu.h | 9 +++++++++ include/linux/perf_event.h | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index e287863ac053..2d9073883ea9 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -48,6 +48,15 @@ extern ssize_t arch_cpu_release(const char *, size_t); #endif struct notifier_block; +/* + * CPU notifier priorities. + */ +enum { + /* migration should happen before other stuff but after perf */ + CPU_PRI_PERF = 20, + CPU_PRI_MIGRATION = 10, +}; + #ifdef CONFIG_SMP /* Need to know about CPUs going up/down? */ #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5d0266d94985..469e03e96fe7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1068,7 +1068,7 @@ static inline void perf_event_disable(struct perf_event *event) { } #define perf_cpu_notifier(fn) \ do { \ static struct notifier_block fn##_nb __cpuinitdata = \ - { .notifier_call = fn, .priority = 20 }; \ + { .notifier_call = fn, .priority = CPU_PRI_PERF }; \ fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \ (void *)(unsigned long)smp_processor_id()); \ fn(&fn##_nb, (unsigned long)CPU_STARTING, \ -- cgit v1.2.3 From 3a101d0548e925ab16ca6aaa8cf4f767d322ddb0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Jun 2010 21:40:36 +0200 Subject: sched: adjust when cpu_active and cpuset configurations are updated during cpu on/offlining Currently, when a cpu goes down, cpu_active is cleared before CPU_DOWN_PREPARE starts and cpuset configuration is updated from a default priority cpu notifier. When a cpu is coming up, it's set before CPU_ONLINE but cpuset configuration again is updated from the same cpu notifier. For cpu notifiers, this presents an inconsistent state. Threads which a CPU_DOWN_PREPARE notifier expects to be bound to the CPU can be migrated to other cpus because the cpu is no more inactive. Fix it by updating cpu_active in the highest priority cpu notifier and cpuset configuration in the second highest when a cpu is coming up. Down path is updated similarly. This guarantees that all other cpu notifiers see consistent cpu_active and cpuset configuration. cpuset_track_online_cpus() notifier is converted to cpuset_update_active_cpus() which just updates the configuration and now called from cpuset_cpu_[in]active() notifiers registered from sched_init_smp(). If cpuset is disabled, cpuset_update_active_cpus() degenerates into partition_sched_domains() making separate notifier for !CONFIG_CPUSETS unnecessary. This problem is triggered by cmwq. During CPU_DOWN_PREPARE, hotplug callback creates a kthread and kthread_bind()s it to the target cpu, and the thread is expected to run on that cpu. * Ingo's test discovered __cpuinit/exit markups were incorrect. Fixed. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Rusty Russell Cc: Ingo Molnar Cc: Paul Menage --- include/linux/cpu.h | 16 ++++++++++++++++ include/linux/cpuset.h | 6 ++++++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 2d9073883ea9..de6b1722cdca 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -52,6 +52,22 @@ struct notifier_block; * CPU notifier priorities. */ enum { + /* + * SCHED_ACTIVE marks a cpu which is coming up active during + * CPU_ONLINE and CPU_DOWN_FAILED and must be the first + * notifier. CPUSET_ACTIVE adjusts cpuset according to + * cpu_active mask right after SCHED_ACTIVE. During + * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are + * ordered in the similar way. + * + * This ordering guarantees consistent cpu_active mask and + * migration behavior to all cpu notifiers. + */ + CPU_PRI_SCHED_ACTIVE = INT_MAX, + CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1, + CPU_PRI_SCHED_INACTIVE = INT_MIN + 1, + CPU_PRI_CPUSET_INACTIVE = INT_MIN, + /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 457ed765a116..f20eb8f16025 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -20,6 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init(void); extern void cpuset_init_smp(void); +extern void cpuset_update_active_cpus(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern int cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -132,6 +133,11 @@ static inline void set_mems_allowed(nodemask_t nodemask) static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} +static inline void cpuset_update_active_cpus(void) +{ + partition_sched_domains(1, NULL, NULL); +} + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { -- cgit v1.2.3 From 21aa9af03d06cb1d19a3738e5cf12acff984e69b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Jun 2010 21:40:37 +0200 Subject: sched: add hooks for workqueue Concurrency managed workqueue needs to know when workers are going to sleep and waking up. Using these two hooks, cmwq keeps track of the current concurrency level and throttles execution of new works if it's too high and wakes up another worker from the sleep hook if it becomes too low. This patch introduces PF_WQ_WORKER to identify workqueue workers and adds the following two hooks. * wq_worker_waking_up(): called when a worker is woken up. * wq_worker_sleeping(): called when a worker is going to sleep and may return a pointer to a local task which should be woken up. The returned task is woken up using try_to_wake_up_local() which is simplified ttwu which is called under rq lock and can only wake up local tasks. Both hooks are currently defined as noop in kernel/workqueue_sched.h. Later cmwq implementation will replace them with proper implementation. These hooks are hard coded as they'll always be enabled. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f118809c953f..edc3dd168d87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1696,6 +1696,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ -- cgit v1.2.3