From 44ffc75ba9a63f972dbebd4fab6888db5fcd3b0e Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 5 Dec 2013 12:28:01 -0500
Subject: cgroup, sched: convert away from cftype->read_map()

In preparation for the conversion to kernfs, cgroup file handling is
being consolidated so that it can be easily mapped to the seq_file
based interface of kernfs.

cftype->read_map() doesn't add any value and is being replaced with
->read_seq_string().  Update cpu_stats_show() and cpuacct_stats_show()
accordingly.

This patch doesn't make any visible behavior changes.

Signed-off-by: Tejun Heo
Acked-by: Li Zefan
Cc: Ingo Molnar
Cc: Peter Zijlstra
---
 kernel/sched/core.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1808606ee5f..f28ec6722f0b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7257,14 +7257,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 }
 
 static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
-			  struct cgroup_map_cb *cb)
+			  struct seq_file *sf)
 {
 	struct task_group *tg = css_tg(css);
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 
-	cb->fill(cb, "nr_periods", cfs_b->nr_periods);
-	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
-	cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
+	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
+	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
 
 	return 0;
 }
@@ -7318,7 +7318,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.read_map = cpu_stats_show,
+		.read_seq_string = cpu_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
--
cgit v1.2.3
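Note that both the old cb->fill() calls and the new seq_printf() calls
emit the statistics as plain "key value" lines, so the userspace-visible
format of the cpu.stat file is unchanged by this conversion.  Read back,
the file looks like this (the values here are illustrative, not from a
real run):

    nr_periods 1024
    nr_throttled 3
    throttled_time 123456789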
From 2da8ca822d49c8b8781800ad155aaa00e7bb5f1a Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 5 Dec 2013 12:28:04 -0500
Subject: cgroup: replace cftype->read_seq_string() with cftype->seq_show()

In preparation for the conversion to kernfs, cgroup file handling is
updated so that it can be easily mapped to kernfs.  This patch replaces
cftype->read_seq_string() with cftype->seq_show(), which is not limited
to single_open() operation and will map directly to the kernfs seq_file
interface.

The conversions are mechanical.  As ->seq_show() doesn't have @css and
@cft, the functions which make use of them are converted to use
seq_css() and seq_cft() respectively.  On several occasions, e.g. if it
has seq_string in its name, the function name is updated to fit the new
method better.

This patch does not introduce any behavior changes.

Signed-off-by: Tejun Heo
Acked-by: Aristeu Rozanski
Acked-by: Vivek Goyal
Acked-by: Michal Hocko
Acked-by: Daniel Wagner
Acked-by: Li Zefan
Cc: Jens Axboe
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Johannes Weiner
Cc: Balbir Singh
Cc: KAMEZAWA Hiroyuki
Cc: Neil Horman
---
 kernel/sched/core.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f28ec6722f0b..7e8cbb9ee4d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7256,10 +7256,9 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }
 
-static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
-			  struct seq_file *sf)
+static int cpu_stats_show(struct seq_file *sf, void *v)
 {
-	struct task_group *tg = css_tg(css);
+	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 
 	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
@@ -7318,7 +7317,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.read_seq_string = cpu_stats_show,
+		.seq_show = cpu_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
--
cgit v1.2.3
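For context, ->read_seq_string() sat on top of the kernel's standard
single_open() seq_file pattern, which the new ->seq_show() is no longer
restricted to.  A minimal self-contained sketch of that pattern follows;
the demo_* names are invented for illustration, while the seq_file and
single_open() API is the real one:

    #include <linux/fs.h>
    #include <linux/seq_file.h>

    /* show() callback: same signature as the new cpu_stats_show() above */
    static int demo_show(struct seq_file *sf, void *v)
    {
            seq_printf(sf, "key %d\n", 42);
            return 0;
    }

    static int demo_open(struct inode *inode, struct file *file)
    {
            /* single_open() binds a one-shot show() to the seq_file core */
            return single_open(file, demo_show, inode->i_private);
    }

    static const struct file_operations demo_fops = {
            .open           = demo_open,
            .read           = seq_read,
            .llseek         = seq_lseek,
            .release        = single_release,
    };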
From 8fe8ff09ce3b5750e1f3e45a1f4a81d59c7ff1f1 Mon Sep 17 00:00:00 2001
From: Kevin Hilman
Date: Wed, 15 Jan 2014 14:51:38 +0100
Subject: sched/nohz: Fix overflow error in scheduler_tick_max_deferment()

While calculating the scheduler tick max deferment, the delta is
converted from microseconds to nanoseconds through a multiplication
against NSEC_PER_USEC.  But the microseconds operand is an unsigned
int, so the multiplication can overflow.  The result is cast to u64,
but only after the operation has completed, which is too late to avoid
a truncated result.

This is currently not a problem because the scheduler tick max
deferment is 1 second.  But it may become an issue as we plan to make
this value tunable.

So let's fix this by casting the usecs value to u64 before multiplying
by NSEC_PER_USEC.  Also, to prevent this kind of mistake from happening
again, move the ad-hoc jiffies -> nsecs conversion into a new helper.

Signed-off-by: Kevin Hilman
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Alex Shi
Cc: Steven Rostedt
Cc: Paul E. McKenney
Cc: John Stultz
Cc: Kevin Hilman
Link: http://lkml.kernel.org/r/1387315388-31676-2-git-send-email-khilman@linaro.org
[move ad-hoc conversion to jiffies_to_nsecs helper]
Signed-off-by: Frederic Weisbecker
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a88f4a485c5e..61e601fc2b1e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2325,7 +2325,7 @@ u64 scheduler_tick_max_deferment(void)
 	if (time_before_eq(next, now))
 		return 0;
 
-	return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
+	return jiffies_to_nsecs(next - now);
 }
 #endif
 
--
cgit v1.2.3
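The jiffies_to_nsecs() helper itself is added outside kernel/sched/core.c
(presumably in include/linux/jiffies.h) and is therefore not visible in
this filtered diff.  Based on the commit message, it should reduce to the
following sketch, with the cast applied before the multiplication:

    /*
     * Sketch of the new helper: casting to u64 *before* the multiply
     * keeps the arithmetic in 64 bits, so a large usecs value cannot
     * wrap the 32-bit unsigned int intermediate that the old
     * open-coded version had.
     */
    static inline u64 jiffies_to_nsecs(const unsigned long j)
    {
            return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
    }

With a 32-bit intermediate the product wraps once it exceeds 2^32 ns,
i.e. at roughly 4.29 seconds — harmless while the deferment is capped at
1 second, but a real bug once the value becomes tunable.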
From 286549dcaf4f128cb04f0ad56dfb677d7d19b500 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Tue, 21 Jan 2014 15:51:03 -0800
Subject: sched: add tracepoints related to NUMA task migration

This patch adds three tracepoints

 o trace_sched_move_numa	when a task is moved to a node
 o trace_sched_swap_numa	when a task is swapped with another task
 o trace_sched_stick_numa	when a numa-related migration fails

The tracepoints allow the NUMA scheduler activity to be monitored and
the following high-level metrics can be calculated

 o NUMA migrated stuck		nr trace_sched_stick_numa
 o NUMA migrated idle		nr trace_sched_move_numa
 o NUMA migrated swapped	nr trace_sched_swap_numa
 o NUMA local swapped		trace_sched_swap_numa src_nid == dst_nid
				(should never happen)
 o NUMA remote swapped		trace_sched_swap_numa src_nid != dst_nid
				(should == NUMA migrated swapped)
 o NUMA group swapped		trace_sched_swap_numa src_ngid == dst_ngid
				Maybe a small number of these are acceptable
				but a high number would be a major surprise.
				It would be even worse if bounces are frequent.
 o NUMA avg task migs.		Average number of migrations for tasks
 o NUMA stddev task mig		Self-explanatory
 o NUMA max task migs.		Maximum number of migrations for a single task

In general the intent of the tracepoints is to help diagnose problems
where automatic NUMA balancing appears to be doing an excessive amount
of useless work.

[akpm@linux-foundation.org: remove semicolon-after-if, repair coding-style]
Signed-off-by: Mel Gorman
Reviewed-by: Rik van Riel
Cc: Alex Thorlton
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 36c951b7eef8..5ae36cc11fe5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
 	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
 		goto out;
 
+	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
 	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
 
 out:
@@ -4603,6 +4604,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 
 	/* TODO: This is not properly updating schedstats */
 
+	trace_sched_move_numa(p, curr_cpu, target_cpu);
 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
--
cgit v1.2.3
From 54a43d54988a3731d644fdeb7a1d6f46b4ac64c7 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Thu, 23 Jan 2014 15:53:13 -0800
Subject: numa: add a sysctl for numa_balancing

Add a working sysctl to enable/disable automatic NUMA memory balancing
at runtime.  This allows us to track down performance problems with
this feature and is generally a good idea.

This was possible earlier through debugfs, but only with special
debugging options set.  Also fix the boot message.

[akpm@linux-foundation.org: s/sched_numa_balancing/sysctl_numa_balancing/]
Signed-off-by: Andi Kleen
Acked-by: Mel Gorman
Cc: Ingo Molnar
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched/core.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4d6964e49711..7fea865a810d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1770,7 +1770,29 @@ void set_numabalancing_state(bool enabled)
 	numabalancing_enabled = enabled;
 }
 #endif /* CONFIG_SCHED_DEBUG */
-#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int err;
+	int state = numabalancing_enabled;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+	if (write)
+		set_numabalancing_state(state);
+	return err;
+}
+#endif
+#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * fork()/clone()-time setup:
--
cgit v1.2.3
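The proc_dointvec_minmax() call above relies on a matching ctl_table
entry (added to kernel/sysctl.c by the full patch, outside this
kernel/sched/core.c-limited view) supplying .maxlen and the 0/1 clamps.
A plausible sketch of such an entry — the field values here are
assumptions for illustration, not the verbatim upstream hunk:

    static int zero;
    static int one = 1;

    static struct ctl_table demo_kern_table[] = {
            {
                    .procname       = "numa_balancing",
                    .maxlen         = sizeof(unsigned int),
                    .mode           = 0644,
                    .proc_handler   = sysctl_numa_balancing,
                    .extra1         = &zero,   /* clamp writes to 0..1 */
                    .extra2         = &one,
            },
            { }
    };

Note the handler points t.data at a local copy of the state, so the
table entry itself needs no .data.  Once wired up, the feature can be
toggled at runtime, e.g. with "sysctl kernel.numa_balancing=0".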