From 534a3fbb3f37c42ec32de9876681c3195be0b658 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Fri, 18 Apr 2014 09:08:14 +0900 Subject: workqueue: simplify wq_update_unbound_numa() by jumping to use_dfl_pwq if the target cpumask equals wq's wq_update_unbound_numa(), when it's decided that the newly updated cpumask equals the default, checks whether the current pwq is already the default one and skips installing the default pwq in that case. This extra step is unnecessary and we can always jump to use_dfl_pwq instead. Simplify the code by removing the conditional. This doesn't make any functional difference. Signed-off-by: Daeseok Youn Signed-off-by: Tejun Heo --- kernel/workqueue.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8edc87185427..c3f076f0f8fd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4103,17 +4103,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, * Let's determine what needs to be done. If the target cpumask is * different from wq's, we need to compare it to @pwq's and create * a new one if they don't match. If the target cpumask equals - * wq's, the default pwq should be used. If @pwq is already the - * default one, nothing to do; otherwise, install the default one. + * wq's, the default pwq should be used. */ if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) goto out_unlock; } else { - if (pwq == wq->dfl_pwq) - goto out_unlock; - else - goto use_dfl_pwq; + goto use_dfl_pwq; } mutex_unlock(&wq->mutex); -- cgit v1.2.3 From 4104d326b670c2b66f575d2004daa28b2d1b4c8d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 17:01:58 -0500 Subject: ftrace: Remove global function list and call function directly Instead of having a list of global functions that are called, as only one global function is allowed to be enabled at a time, there's no reason to have a list. Simply have all users of the global ops use the global ops directly instead of registering their own ftrace_ops. Just switch what function is used before enabling the function tracer. This removes a lot of code as well as the complexity involved with it. 
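To make the conversion concrete, the pattern the tracers are moved to looks roughly like the sketch below. This is an illustration only, not code from the patch; the tracer and callback names (my_tracer_*) are hypothetical, while ftrace_init_array_ops(), ftrace_reset_array_ops() and tr->ops are the helpers this patch adds.

static void my_tracer_call(unsigned long ip, unsigned long parent_ip,
                           struct ftrace_ops *op, struct pt_regs *regs)
{
        /* per-tracer work for every traced function call */
}

static int my_tracer_init(struct trace_array *tr)
{
        /* point the per-array ops at this tracer's callback ... */
        ftrace_init_array_ops(tr, my_tracer_call);
        /* ... and register tr->ops instead of a private global ftrace_ops */
        return register_ftrace_function(tr->ops);
}

static void my_tracer_reset(struct trace_array *tr)
{
        unregister_ftrace_function(tr->ops);
        ftrace_reset_array_ops(tr);     /* tr->ops->func falls back to ftrace_stub */
}

The function, irqsoff and wakeup tracers below all follow this shape, which is what lets FTRACE_OPS_FL_GLOBAL and the global list machinery go away.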
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 152 +++++++++++--------------------------- kernel/trace/trace.c | 2 + kernel/trace/trace.h | 19 +++-- kernel/trace/trace_functions.c | 55 +++++--------- kernel/trace/trace_irqsoff.c | 33 ++++----- kernel/trace/trace_sched_wakeup.c | 40 +++++----- kernel/trace/trace_selftest.c | 33 +++++---- 7 files changed, 125 insertions(+), 209 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1fd4b9479210..8f61ef70a297 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,7 +62,7 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_REGEX_LOCK(opsname) \ @@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; @@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void) return cnt; } -static void -ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) -{ - int bit; - - bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); - if (bit < 0) - return; - - do_for_each_ftrace_op(op, ftrace_global_list) { - op->func(ip, parent_ip, op, regs); - } while_for_each_ftrace_op(op); - - trace_clear_recursion(bit); -} - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { @@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops) return 0; } -static void update_global_ops(void) -{ - ftrace_func_t func = ftrace_global_list_func; - void *private = NULL; - - /* The list has its own recursion protection. */ - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; - - /* - * If there's only one function registered, then call that - * function directly. Otherwise, we need to iterate over the - * registered callers. - */ - if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) { - func = ftrace_global_list->func; - private = ftrace_global_list->private; - /* - * As we are calling the function directly. - * If it does not have recursion protection, - * the function_trace_op needs to be updated - * accordingly. 
- */ - if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)) - global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; - } - - /* If we filter on pids, update to use the pid function */ - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } - - global_ops.func = func; - global_ops.private = private; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -301,8 +246,6 @@ static void update_ftrace_function(void) { ftrace_func_t func; - update_global_ops(); - /* * If we are at the end of the list and this ops is * recursion safe and not dynamic and the arch supports passing ops, @@ -314,10 +257,7 @@ static void update_ftrace_function(void) (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ - if (ftrace_ops_list == &global_ops) - set_function_trace_op = ftrace_global_list; - else - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; } else { /* Just use the default ftrace_ops */ @@ -434,16 +374,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (ops->flags & FTRACE_OPS_FL_DELETED) return -EINVAL; - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; - /* We don't support both control and global flags set. */ - if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) - return -EINVAL; - #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* * If the ftrace_ops specifies SAVE_REGS, then it only can be used @@ -461,10 +394,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); - ops->flags |= FTRACE_OPS_FL_ENABLED; - } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (ops->flags & FTRACE_OPS_FL_CONTROL) { if (control_ops_alloc(ops)) return -ENOMEM; add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); @@ -484,15 +414,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_list_ops(&ftrace_global_list, - &global_ops, ops); - if (!ret) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; - } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (ops->flags & FTRACE_OPS_FL_CONTROL) { ret = remove_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else @@ -2128,15 +2050,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; - /* ops marked global share the filter hashes */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - /* Don't update hash if global is already set */ - if (global_start_up) - hash_enable = false; - global_start_up++; - } - ops->flags |= FTRACE_OPS_FL_ENABLED; if (hash_enable) ftrace_hash_rec_enable(ops, 1); @@ -2166,21 +2079,10 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - global_start_up--; - WARN_ON_ONCE(global_start_up < 0); - /* Don't update hash if global still has users */ - if (global_start_up) { - WARN_ON_ONCE(!ftrace_start_up); - hash_disable = false; - } - } - if 
(hash_disable) ftrace_hash_rec_disable(ops, 1); - if (ops != &global_ops || !global_start_up) + if (!global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; command |= FTRACE_UPDATE_CALLS; @@ -3524,10 +3426,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, struct ftrace_hash *hash; int ret; - /* All global ops uses the global ops filters */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) - ops = &global_ops; - if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4462,6 +4360,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) #endif /* CONFIG_DYNAMIC_FTRACE */ +__init void ftrace_init_global_array_ops(struct trace_array *tr) +{ + tr->ops = &global_ops; + tr->ops->private = tr; +} + +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) +{ + /* If we filter on pids, update to use the pid function */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + if (WARN_ON(tr->ops->func != ftrace_stub)) + printk("ftrace ops had %pS for function\n", + tr->ops->func); + /* Only the top level instance does pid tracing */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + } + tr->ops->func = func; + tr->ops->private = tr; +} + +void ftrace_reset_array_ops(struct trace_array *tr) +{ + tr->ops->func = ftrace_stub; +} + static void ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) @@ -4520,9 +4446,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip, regs)) + if (ftrace_ops_test(op, ip, regs)) { + if (WARN_ON(!op->func)) { + function_trace_stop = 1; + printk("op=%p %pS\n", op, op); + goto out; + } op->func(ip, parent_ip, op, regs); + } } while_for_each_ftrace_op(op); +out: preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -5076,8 +5009,7 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, /* Just a place holder for function graph */ static struct ftrace_ops fgraph_ops __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | - FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_RECURSION_SAFE, }; static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 737b0efa1a62..fdd33aacdf05 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6629,6 +6629,8 @@ __init static int tracer_alloc_buffers(void) */ global_trace.current_trace = &nop_trace; + ftrace_init_global_array_ops(&global_trace); + register_tracer(&nop_trace); /* All seems OK, enable tracing */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2e29d7ba5a52..df5256be64cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -416,13 +416,7 @@ enum { TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, - /* GLOBAL_BITs must be greater than FTRACE_BITs */ - TRACE_GLOBAL_BIT, - TRACE_GLOBAL_NMI_BIT, - TRACE_GLOBAL_IRQ_BIT, - TRACE_GLOBAL_SIRQ_BIT, - - /* INTERNAL_BITs must be greater than GLOBAL_BITs */ + /* INTERNAL_BITs must be greater than FTRACE_BITs */ TRACE_INTERNAL_BIT, TRACE_INTERNAL_NMI_BIT, TRACE_INTERNAL_IRQ_BIT, @@ -449,9 +443,6 @@ enum { #define TRACE_FTRACE_START TRACE_FTRACE_BIT #define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) -#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT -#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) 
- 1) - #define TRACE_LIST_START TRACE_INTERNAL_BIT #define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) @@ -823,6 +814,9 @@ extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); void ftrace_destroy_function_files(struct trace_array *tr); +void ftrace_init_global_array_ops(struct trace_array *tr); +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); +void ftrace_reset_array_ops(struct trace_array *tr); #else static inline int ftrace_trace_task(struct task_struct *task) { @@ -836,6 +830,11 @@ ftrace_create_function_files(struct trace_array *tr, return 0; } static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +static inline __init void +ftrace_init_global_array_ops(struct trace_array *tr) { } +static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +/* ftace_func_t type is not defined, use macro instead of static inline */ +#define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index ffd56351b521..2d9482b8f26a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs); -static struct ftrace_ops trace_ops; -static struct ftrace_ops trace_stack_ops; static struct tracer_flags func_flags; /* Our option */ @@ -83,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { - struct ftrace_ops *ops; - - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { - /* There's only one global tr */ - if (!trace_ops.private) { - trace_ops.private = tr; - trace_stack_ops.private = tr; - } + ftrace_func_t func; - if (func_flags.val & TRACE_FUNC_OPT_STACK) - ops = &trace_stack_ops; - else - ops = &trace_ops; - tr->ops = ops; - } else if (!tr->ops) { - /* - * Instance trace_arrays get their ops allocated - * at instance creation. Unless it failed - * the allocation. - */ + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. 
+ */ + if (!tr->ops) return -ENOMEM; - } + + /* Currently only the global instance can do stack tracing */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && + func_flags.val & TRACE_FUNC_OPT_STACK) + func = function_stack_trace_call; + else + func = function_trace_call; + + ftrace_init_array_ops(tr, func); tr->trace_buffer.cpu = get_cpu(); put_cpu(); @@ -118,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr) { tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); + ftrace_reset_array_ops(tr); } static void function_trace_start(struct trace_array *tr) @@ -199,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - -static struct ftrace_ops trace_stack_ops __read_mostly = -{ - .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, @@ -248,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) unregister_ftrace_function(tr->ops); if (set) { - tr->ops = &trace_stack_ops; + tr->ops->func = function_stack_trace_call; register_ftrace_function(tr->ops); } else { - tr->ops = &trace_ops; + tr->ops->func = function_trace_call; register_ftrace_function(tr->ops); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8ff02cbb892f..b5cb047df3e9 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static int register_irqsoff_function(int graph, int set) +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; @@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set) ret = register_ftrace_graph(&irqsoff_graph_return, &irqsoff_graph_entry); else - ret = register_ftrace_function(&trace_ops); + ret = register_ftrace_function(tr->ops); if (!ret) function_enabled = true; @@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set) return ret; } -static void unregister_irqsoff_function(int graph) +static void unregister_irqsoff_function(struct trace_array *tr, int graph) { if (!function_enabled) return; @@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph) if (graph) unregister_ftrace_graph(); else - unregister_ftrace_function(&trace_ops); + unregister_ftrace_function(tr->ops); function_enabled = false; } -static void irqsoff_function_set(int set) +static void irqsoff_function_set(struct trace_array *tr, int set) { if (set) - register_irqsoff_function(is_graph(), 1); + register_irqsoff_function(tr, is_graph(), 1); else - unregister_irqsoff_function(is_graph()); + unregister_irqsoff_function(tr, is_graph()); } static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) struct tracer *tracer 
= tr->current_trace; if (mask & TRACE_ITER_FUNCTION) - irqsoff_function_set(set); + irqsoff_function_set(tr, set); return trace_keep_overwrite(tracer, mask, set); } @@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph) { int ret; - ret = register_irqsoff_function(graph, 0); + ret = register_irqsoff_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -600,7 +594,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_irqsoff_function(graph); + unregister_irqsoff_function(tr, graph); } static void __irqsoff_tracer_init(struct trace_array *tr) @@ -617,7 +611,11 @@ static void __irqsoff_tracer_init(struct trace_array *tr) smp_wmb(); tracing_reset_online_cpus(&tr->trace_buffer); - if (start_irqsoff_tracer(tr, is_graph())) + ftrace_init_array_ops(tr, irqsoff_tracer_call); + + /* Only toplevel instance supports graph tracing */ + if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && + is_graph()))) printk(KERN_ERR "failed to start irqsoff tracer\n"); } @@ -630,6 +628,7 @@ static void irqsoff_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); } static void irqsoff_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e14da5e97a69..4dd986defa60 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); preempt_enable_notrace(); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = wakeup_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; #endif /* CONFIG_FUNCTION_TRACER */ -static int register_wakeup_function(int graph, int set) +static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; @@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set) ret = register_ftrace_graph(&wakeup_graph_return, &wakeup_graph_entry); else - ret = register_ftrace_function(&trace_ops); + ret = register_ftrace_function(tr->ops); if (!ret) function_enabled = true; @@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set) return ret; } -static void unregister_wakeup_function(int graph) +static void unregister_wakeup_function(struct trace_array *tr, int graph) { if (!function_enabled) return; @@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph) if (graph) unregister_ftrace_graph(); else - unregister_ftrace_function(&trace_ops); + unregister_ftrace_function(tr->ops); function_enabled = false; } -static void wakeup_function_set(int set) +static void wakeup_function_set(struct trace_array *tr, int set) { if (set) - register_wakeup_function(is_graph(), 1); + register_wakeup_function(tr, is_graph(), 1); else - unregister_wakeup_function(is_graph()); + unregister_wakeup_function(tr, is_graph()); } static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) struct tracer *tracer = tr->current_trace; if (mask & TRACE_ITER_FUNCTION) - wakeup_function_set(set); + wakeup_function_set(tr, set); return trace_keep_overwrite(tracer, mask, set); } -static int start_func_tracer(int graph) +static int start_func_tracer(struct trace_array *tr, 
int graph) { int ret; - ret = register_wakeup_function(graph, 0); + ret = register_wakeup_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -203,11 +197,11 @@ static int start_func_tracer(int graph) return ret; } -static void stop_func_tracer(int graph) +static void stop_func_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_wakeup_function(graph); + unregister_wakeup_function(tr, graph); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!(is_graph() ^ set)) return 0; - stop_func_tracer(!set); + stop_func_tracer(tr, !set); wakeup_reset(wakeup_trace); tracing_max_latency = 0; - return start_func_tracer(set); + return start_func_tracer(tr, set); } static int wakeup_graph_entry(struct ftrace_graph_ent *trace) @@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - if (start_func_tracer(is_graph())) + if (start_func_tracer(tr, is_graph())) printk(KERN_ERR "failed to start wakeup tracer\n"); return; @@ -600,7 +594,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - stop_func_tracer(is_graph()); + stop_func_tracer(tr, is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -617,6 +611,7 @@ static int __wakeup_tracer_init(struct trace_array *tr) tracing_max_latency = 0; wakeup_trace = tr; + ftrace_init_array_ops(tr, wakeup_tracer_call); start_wakeup_tracer(tr); return 0; } @@ -653,6 +648,7 @@ static void wakeup_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); } static void wakeup_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index e98fca60974f..519d04affe38 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = { .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; -static struct ftrace_ops test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static void print_counts(void) { printk("(%d %d %d %d %d) ", @@ -185,7 +180,7 @@ static void reset_counts(void) trace_selftest_test_dyn_cnt = 0; } -static int trace_selftest_ops(int cnt) +static int trace_selftest_ops(struct trace_array *tr, int cnt) { int save_ftrace_enabled = ftrace_enabled; struct ftrace_ops *dyn_ops; @@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt) register_ftrace_function(&test_probe1); register_ftrace_function(&test_probe2); register_ftrace_function(&test_probe3); - register_ftrace_function(&test_global); + /* First time we are running with main function */ + if (cnt > 1) { + ftrace_init_array_ops(tr, trace_selftest_test_global_func); + register_ftrace_function(tr->ops); + } DYN_FTRACE_TEST_NAME(); @@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt) goto out; if (trace_selftest_test_probe3_cnt != 1) goto out; - if (trace_selftest_test_global_cnt == 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } DYN_FTRACE_TEST_NAME2(); @@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt) goto out_free; if (trace_selftest_test_probe3_cnt != 3) goto out_free; - if (trace_selftest_test_global_cnt 
== 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } if (trace_selftest_test_dyn_cnt == 0) goto out_free; @@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt) unregister_ftrace_function(&test_probe1); unregister_ftrace_function(&test_probe2); unregister_ftrace_function(&test_probe3); - unregister_ftrace_function(&test_global); + if (cnt > 1) + unregister_ftrace_function(tr->ops); + ftrace_reset_array_ops(tr); /* Make sure everything is off */ reset_counts(); @@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, } /* Test the ops with global tracing running */ - ret = trace_selftest_ops(1); + ret = trace_selftest_ops(tr, 1); trace->reset(tr); out: @@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* Test the ops with global tracing off */ if (!ret) - ret = trace_selftest_ops(2); + ret = trace_selftest_ops(tr, 2); return ret; } -- cgit v1.2.3 From 6d9b3fa5e7f663bbfb9d2d80d46136f75319cb28 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 11:28:38 -0500 Subject: tracing: Move tracing_max_latency into trace_array In preparation for letting the latency tracers be used by instances, remove the global tracing_max_latency variable and add a max_latency field to the trace_array that the latency tracers will now use. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 14 ++++++-------- kernel/trace/trace.h | 3 +-- kernel/trace/trace_irqsoff.c | 14 +++++++------- kernel/trace/trace_sched_wakeup.c | 12 ++++++------ kernel/trace/trace_selftest.c | 26 +++++++++++++------------- 5 files changed, 33 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fdd33aacdf05..f5fc56bf0227 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -982,8 +982,6 @@ static arch_spinlock_t ftrace_max_lock = unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -unsigned long __read_mostly tracing_max_latency; - /* * Copy the new maximum trace into the separate maximum-trace * structure. 
(this way the maximum trace is permanently saved, @@ -1000,7 +998,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_buf->cpu = cpu; max_buf->time_start = data->preempt_timestamp; - max_data->saved_latency = tracing_max_latency; + max_data->saved_latency = tr->max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; @@ -6328,6 +6326,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); +#ifdef CONFIG_TRACER_MAX_TRACE + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tr->max_latency, &tracing_max_lat_fops); +#endif + if (ftrace_create_function_files(tr, d_tracer)) WARN(1, "Could not allocate function filter files"); @@ -6353,11 +6356,6 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); -#ifdef CONFIG_TRACER_MAX_TRACE - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, &tracing_max_lat_fops); -#endif - trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index df5256be64cd..644a8b533e1d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -190,6 +190,7 @@ struct trace_array { */ struct trace_buffer max_buffer; bool allocated_snapshot; + unsigned long max_latency; #endif int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS @@ -599,8 +600,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -extern unsigned long tracing_max_latency; - void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b5cb047df3e9..40aa300d3491 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -170,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) for_each_possible_cpu(cpu) per_cpu(tracing_cpu, cpu) = 0; - tracing_max_latency = 0; + tr->max_latency = 0; tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); return start_irqsoff_tracer(irqsoff_trace, set); @@ -297,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? 
*/ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) return 0; } else { - if (delta <= tracing_max_latency) + if (delta <= tr->max_latency) return 0; } return 1; @@ -327,13 +327,13 @@ check_critical_timing(struct trace_array *tr, pc = preempt_count(); - if (!report_latency(delta)) + if (!report_latency(tr, delta)) goto out; raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ - if (!report_latency(delta)) + if (!report_latency(tr, delta)) goto out_unlock; __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); @@ -346,7 +346,7 @@ check_critical_timing(struct trace_array *tr, data->critical_end = parent_ip; if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; + tr->max_latency = delta; update_max_tr_single(tr, current, cpu); } @@ -605,7 +605,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); - tracing_max_latency = 0; + tr->max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4dd986defa60..41e0b8aa78ed 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -218,7 +218,7 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) stop_func_tracer(tr, !set); wakeup_reset(wakeup_trace); - tracing_max_latency = 0; + tr->max_latency = 0; return start_func_tracer(tr, set); } @@ -344,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) return 0; } else { - if (delta <= tracing_max_latency) + if (delta <= tr->max_latency) return 0; } return 1; @@ -418,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore, T1 = ftrace_now(cpu); delta = T1-T0; - if (!report_latency(delta)) + if (!report_latency(wakeup_trace, delta)) goto out_unlock; if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; + wakeup_trace->max_latency = delta; update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); } @@ -609,7 +609,7 @@ static int __wakeup_tracer_init(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); - tracing_max_latency = 0; + tr->max_latency = 0; wakeup_trace = tr; ftrace_init_array_ops(tr, wakeup_tracer_call); start_wakeup_tracer(tr); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 519d04affe38..ac3185892960 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -807,7 +807,7 @@ out: int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -819,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable interrupts for a bit */ local_irq_disable(); udelay(100); @@ -846,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) ret = -1; } - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -856,7 
+856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -881,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable preemption for a bit */ preempt_disable(); udelay(100); @@ -908,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) ret = -1; } - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -918,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -943,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable preemption and interrupts for a bit */ preempt_disable(); @@ -978,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* do the test by disabling interrupts first this time */ - tracing_max_latency = 0; + tr->max_latency = 0; tracing_start(); trace->start(tr); @@ -1009,7 +1009,7 @@ out: tracing_start(); out_no_start: trace->reset(tr); - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -1062,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data) int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; struct task_struct *p; struct completion is_ready; unsigned long count; @@ -1088,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; while (p->on_rq) { /* @@ -1118,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) trace->reset(tr); tracing_start(); - tracing_max_latency = save_max; + tr->max_latency = save_max; /* kill the thread */ kthread_stop(p); -- cgit v1.2.3 From 0b9b12c1b884eb34773312f15c194220025e0416 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 10:04:59 -0500 Subject: tracing: Move ftrace_max_lock into trace_array In preparation for having tracers enabled in instances, the max_lock should be unique as updating the max for one tracer is a separate operation than updating it for another tracer using a different max. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 40 ++++++++++++++-------------------------- kernel/trace/trace.h | 14 ++++++++++++++ kernel/trace/trace_selftest.c | 4 ++-- 3 files changed, 30 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f5fc56bf0227..bb5147a55be5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -963,22 +963,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. 
But the action of the swap - * needs its own lock. - * - * This is defined as a arch_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - * - * It is also used in other places outside the update_max_tr - * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. - */ -static arch_spinlock_t ftrace_max_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE @@ -1046,14 +1030,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); buf = tr->trace_buffer.buffer; tr->trace_buffer.buffer = tr->max_buffer.buffer; tr->max_buffer.buffer = buf; __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } /** @@ -1079,7 +1063,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); @@ -1097,7 +1081,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ @@ -1351,7 +1335,7 @@ void tracing_start(void) } /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&global_trace.max_lock); buffer = global_trace.trace_buffer.buffer; if (buffer) @@ -1363,7 +1347,7 @@ void tracing_start(void) ring_buffer_record_enable(buffer); #endif - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&global_trace.max_lock); ftrace_start(); out: @@ -1418,7 +1402,7 @@ void tracing_stop(void) goto out; /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&global_trace.max_lock); buffer = global_trace.trace_buffer.buffer; if (buffer) @@ -1430,7 +1414,7 @@ void tracing_stop(void) ring_buffer_record_disable(buffer); #endif - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&global_trace.max_lock); out: raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); @@ -3331,7 +3315,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); local_irq_disable(); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are @@ -3348,7 +3332,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); } } - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); @@ -6129,6 +6113,8 @@ static int new_instance_create(const char *name) raw_spin_lock_init(&tr->start_lock); + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); @@ -6627,6 +6613,8 @@ __init static int tracer_alloc_buffers(void) */ global_trace.current_trace = &nop_trace; + global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + ftrace_init_global_array_ops(&global_trace); register_tracer(&nop_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 644a8b533e1d..5d2f07d6746c 100644 --- a/kernel/trace/trace.h +++ 
b/kernel/trace/trace.h @@ -192,6 +192,20 @@ struct trace_array { bool allocated_snapshot; unsigned long max_latency; #endif + /* + * max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a arch_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. + */ + arch_spinlock_t max_lock; int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index ac3185892960..1c95cd7a98c8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) /* Don't allow flipping of max traces now */ local_irq_save(flags); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&buf->tr->max_lock); cnt = ring_buffer_entries(buf->buffer); @@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) break; } tracing_on(); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&buf->tr->max_lock); local_irq_restore(flags); if (count) -- cgit v1.2.3 From 65daaca7c6dac4db0ef64f2baac0e448cf5d847f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 07:06:29 -0500 Subject: tracing: Allow wakeup tracers to be used by instances The wakeup and wakeup_rt tracers can now be used by instances. But they may only be used by one instance at a time (including the top level directory). This allows multiple tracers to run while the wakeup tracer is running simultaneously. 
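The mutual exclusion is a simple module-level flag; condensed from the hunks below (and mirrored by irqsoff_busy in the next patch), the mechanism is roughly:

static bool wakeup_busy;

static int __wakeup_tracer_init(struct trace_array *tr)
{
        if (wakeup_busy)
                return -EBUSY;  /* another instance already runs a wakeup tracer */

        /* ... per-instance setup and start_wakeup_tracer(tr) ... */

        wakeup_busy = true;
        return 0;
}

static void wakeup_tracer_reset(struct trace_array *tr)
{
        /* ... stop the tracer and restore the flags ... */

        wakeup_busy = false;    /* let another instance take over */
}

This is a sketch of the idea rather than the exact diff: in the patch the -EBUSY check sits in the wakeup_tracer_init()/wakeup_rt_tracer_init()/wakeup_dl_tracer_init() entry points, while the flag is set in __wakeup_tracer_init() and cleared in wakeup_tracer_reset().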
Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 41e0b8aa78ed..1573c03640d2 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -601,6 +601,8 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); } +static bool wakeup_busy; + static int __wakeup_tracer_init(struct trace_array *tr) { save_flags = trace_flags; @@ -613,11 +615,16 @@ static int __wakeup_tracer_init(struct trace_array *tr) wakeup_trace = tr; ftrace_init_array_ops(tr, wakeup_tracer_call); start_wakeup_tracer(tr); + + wakeup_busy = true; return 0; } static int wakeup_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 0; wakeup_rt = 0; return __wakeup_tracer_init(tr); @@ -625,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr) static int wakeup_rt_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 0; wakeup_rt = 1; return __wakeup_tracer_init(tr); @@ -632,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) static int wakeup_dl_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 1; wakeup_rt = 0; return __wakeup_tracer_init(tr); @@ -649,6 +662,7 @@ static void wakeup_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); ftrace_reset_array_ops(tr); + wakeup_busy = false; } static void wakeup_tracer_start(struct trace_array *tr) @@ -680,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; @@ -702,6 +717,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; -- cgit v1.2.3 From 02f2f7646fd5d0482e805d728601e8e4305ad56f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 09:32:58 -0500 Subject: tracing: Allow irq/preempt tracers to be used by instances The irqsoff, preemptoff and preemptirqsoff tracers can now be used by instances. But they may only be used by one instance at a time (including the top level directory). This allows multiple tracers to run while the irqsoff (and friends) tracer is running simultaneously. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 40aa300d3491..9bb104f748d0 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -597,8 +597,13 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) unregister_irqsoff_function(tr, graph); } -static void __irqsoff_tracer_init(struct trace_array *tr) +static bool irqsoff_busy; + +static int __irqsoff_tracer_init(struct trace_array *tr) { + if (irqsoff_busy) + return -EBUSY; + save_flags = trace_flags; /* non overwrite screws up the latency tracers */ @@ -617,6 +622,9 @@ static void __irqsoff_tracer_init(struct trace_array *tr) if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && is_graph()))) printk(KERN_ERR "failed to start irqsoff tracer\n"); + + irqsoff_busy = true; + return 0; } static void irqsoff_tracer_reset(struct trace_array *tr) @@ -629,6 +637,8 @@ static void irqsoff_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); ftrace_reset_array_ops(tr); + + irqsoff_busy = false; } static void irqsoff_tracer_start(struct trace_array *tr) @@ -646,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer irqsoff_tracer __read_mostly = { @@ -667,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; # define register_irqsoff(trace) register_tracer(&trace) @@ -679,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer preemptoff_tracer __read_mostly = @@ -701,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; # define register_preemptoff(trace) register_tracer(&trace) @@ -715,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer preemptirqsoff_tracer __read_mostly = @@ -737,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; -- cgit v1.2.3 From 8275f69f076d8b1ecc6d93305451e5a8f7336d4e Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 30 Mar 2014 15:31:50 +0200 Subject: ftrace: Statically initialize pm notifier block Instead of initializing the pm notifier block in register_ftrace_graph(), initialize it statically. This safes us some code. Found in the PaX patch, written by the PaX Team. 
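The change follows the common kernel idiom of using a designated initializer so the notifier_block is fully set up at build time and the registration path no longer has to assign ->notifier_call. A generic illustration (the callback name is made up; the types are the standard notifier/pm API):

static int my_pm_callback(struct notifier_block *nb,
                          unsigned long action, void *data)
{
        /* react to PM events here */
        return NOTIFY_DONE;
}

/* initialized statically; nothing left to fill in before registering */
static struct notifier_block my_pm_notifier = {
        .notifier_call = my_pm_callback,
};

The setup path then simply calls register_pm_notifier(&my_pm_notifier), which is exactly what register_ftrace_graph() is reduced to in the hunk below.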
Link: http://lkml.kernel.org/p/1396186310-3156-1-git-send-email-minipli@googlemail.com Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: PaX Team Signed-off-by: Mathias Krause Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8f61ef70a297..846888ea2ba4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4860,7 +4860,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int ftrace_graph_active; -static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { @@ -5036,6 +5035,10 @@ static void update_function_graph_func(void) ftrace_graph_entry = ftrace_graph_entry_test; } +static struct notifier_block ftrace_suspend_notifier = { + .notifier_call = ftrace_suspend_notifier_call, +}; + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -5049,7 +5052,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, goto out; } - ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); ftrace_graph_active++; -- cgit v1.2.3 From ad1438a076e275b70d1a04de1364bc483e5a81db Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 17 Apr 2014 21:44:42 +0200 Subject: tracing: Add static to local functions This patch adds static to the following functions: -cycle_t buffer_ftrace_now -void free_snapshot -int trace_selftest_startup_dynamic_tracing Link: http://lkml.kernel.org/p/20140417214442.d7abc7c0b0e4b90e7fedecc9@skynet.be Signed-off-by: Fabian Frederick Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_selftest.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb5147a55be5..cb41e98cc64b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec, } EXPORT_SYMBOL_GPL(call_filter_check_discard); -cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) +static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; @@ -599,7 +599,7 @@ static int alloc_snapshot(struct trace_array *tr) return 0; } -void free_snapshot(struct trace_array *tr) +static void free_snapshot(struct trace_array *tr) { /* * We don't free the ring buffer. 
instead, resize it because diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 1c95cd7a98c8..5ef60499dc8e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -320,9 +320,9 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) } /* Test dynamic code modification and ftrace filters */ -int trace_selftest_startup_dynamic_tracing(struct tracer *trace, - struct trace_array *tr, - int (*func)(void)) +static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) { int save_ftrace_enabled = ftrace_enabled; unsigned long count; -- cgit v1.2.3 From 3a101b8de0d39403b2c7e5c23fd0b005668acf48 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Apr 2014 21:31:56 -0400 Subject: audit: add netlink audit protocol bind to check capabilities on multicast join Register a netlink per-protocol bind fuction for audit to check userspace process capabilities before allowing a multicast group connection. Signed-off-by: Richard Guy Briggs Signed-off-by: David S. Miller --- kernel/audit.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7c2893602d06..223cb746f141 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1076,10 +1076,20 @@ static void audit_receive(struct sk_buff *skb) mutex_unlock(&audit_cmd_mutex); } +/* Run custom bind function on netlink socket group connect or bind requests. */ +static int audit_bind(int group) +{ + if (!capable(CAP_AUDIT_READ)) + return -EPERM; + + return 0; +} + static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, + .bind = audit_bind, }; struct audit_net *aunet = net_generic(net, audit_net_id); -- cgit v1.2.3 From 451f921639fea4600dfb9ab2889332bdcc7b48d3 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Apr 2014 21:31:57 -0400 Subject: audit: add netlink multicast group for log read Add a netlink multicast socket with one group to kaudit for "best-effort" delivery to read-only userspace clients such as systemd, in addition to the existing bidirectional unicast auditd userspace client. Currently, auditd is intended to use the CAP_AUDIT_CONTROL and CAP_AUDIT_WRITE capabilities, but actually uses CAP_NET_ADMIN. The CAP_AUDIT_READ capability is added for use by read-only AUDIT_NLGRP_READLOG netlink multicast group clients to the kaudit subsystem. This will safely give access to services such as systemd to consume audit logs while ensuring write access remains restricted for integrity. Signed-off-by: Richard Guy Briggs Signed-off-by: David S. Miller --- kernel/audit.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 223cb746f141..d272cc11dff4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -423,6 +423,35 @@ static void kauditd_send_skb(struct sk_buff *skb) consume_skb(skb); } +/* + * kauditd_send_multicast_skb - send the skb to multicast userspace listeners + * + * This function doesn't consume an skb as might be expected since it has to + * copy it anyways. 
+ */ +static void kauditd_send_multicast_skb(struct sk_buff *skb) +{ + struct sk_buff *copy; + struct audit_net *aunet = net_generic(&init_net, audit_net_id); + struct sock *sock = aunet->nlsk; + + /* + * The seemingly wasteful skb_copy() rather than bumping the refcount + * using skb_get() is necessary because non-standard mods are made to + * the skb by the original kaudit unicast socket send routine. The + * existing auditd daemon assumes this breakage. Fixing this would + * require co-ordinating a change in the established protocol between + * the kaudit kernel subsystem and the auditd userspace code. There is + * no reason for new multicast clients to continue with this + * non-compliance. + */ + copy = skb_copy(skb, GFP_KERNEL); + if (!copy) + return; + + nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); +} + /* * flush_hold_queue - empty the hold queue if auditd appears * @@ -1090,6 +1119,8 @@ static int __net_init audit_net_init(struct net *net) struct netlink_kernel_cfg cfg = { .input = audit_receive, .bind = audit_bind, + .flags = NL_CFG_F_NONROOT_RECV, + .groups = AUDIT_NLGRP_MAX, }; struct audit_net *aunet = net_generic(net, audit_net_id); @@ -1911,10 +1942,10 @@ out: * audit_log_end - end one audit record * @ab: the audit_buffer * - * The netlink_* functions cannot be called inside an irq context, so - * the audit buffer is placed on a queue and a tasklet is scheduled to - * remove them from the queue outside the irq context. May be called in - * any context. + * netlink_unicast() cannot be called inside an irq context because it blocks + * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed + * on a queue and a tasklet is scheduled to remove them from the queue outside + * the irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { @@ -1924,6 +1955,18 @@ void audit_log_end(struct audit_buffer *ab) audit_log_lost("rate limit exceeded"); } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); + + kauditd_send_multicast_skb(ab->skb); + + /* + * The original kaudit unicast socket sends up messages with + * nlmsg_len set to the payload length rather than the entire + * message length. This breaks the standard set by netlink. + * The existing auditd daemon assumes this breakage. Fixing + * this would require co-ordinating a change in the established + * protocol between the kaudit kernel subsystem and the auditd + * userspace code. + */ nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; if (audit_pid) { -- cgit v1.2.3 From 7f74ecd788a8b2a122d4d8bdc4d517cc60b8b638 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Apr 2014 21:31:58 -0400 Subject: audit: send multicast messages only if there are listeners Test first to see if there are any userspace multicast listeners bound to the socket before starting the multicast send work. Signed-off-by: Richard Guy Briggs Signed-off-by: David S. 
Miller --- kernel/audit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d272cc11dff4..33531d72e4a2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -435,6 +435,9 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb) struct audit_net *aunet = net_generic(&init_net, audit_net_id); struct sock *sock = aunet->nlsk; + if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) + return; + /* * The seemingly wasteful skb_copy() rather than bumping the refcount * using skb_get() is necessary because non-standard mods are made to -- cgit v1.2.3 From ea8fd3b47ff4ed4b1b5942bf3e0cb8d8f590ec59 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: cgroup_apply_cftypes() shouldn't skip the default hierarhcy cgroup_apply_cftypes() skip creating or removing files if the subsystem is attached to the default hierarchy, which led to missing files in the root of the default hierarchy. Skipping made sense when the default hierarchy was dummy; however, now that the default hierarchy is full functional and planned to be used as the unified hierarchy, it shouldn't be skipped over. Reported-by: Li Zefan Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 11a03d67635a..a6894272353b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2436,10 +2436,6 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) lockdep_assert_held(&cgroup_tree_mutex); - /* don't bother if @ss isn't attached */ - if (ss->root == &cgrp_dfl_root) - return 0; - /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; -- cgit v1.2.3 From f392e51cd6ae6f6ee5b9b6d611cdc282b4c1711e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: update cgroup->subsys_mask to ->child_subsys_mask and restore cgroup_root->subsys_mask 944196278d3d ("cgroup: move ->subsys_mask from cgroupfs_root to cgroup") moved ->subsys_mask from cgroup_root to cgroup to prepare for the unified hierarhcy; however, it turns out that carrying the subsys_mask of the children in the parent, instead of itself, is a lot more natural. This patch restores cgroup_root->subsys_mask and morphs cgroup->subsys_mask into cgroup->child_subsys_mask. * Uses of root->cgrp.subsys_mask are restored to root->subsys_mask. * Remove automatic setting and clearing of cgrp->subsys_mask and instead just inherit ->child_subsys_mask from the parent during cgroup creation. Note that this doesn't affect any current behaviors. * Undo __kill_css() separation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 64 ++++++++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a6894272353b..f944619077f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -529,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->cgrp.subsys_mask & (1UL << i)) { + if (root->subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. 
So we want * the subsystem state from the new * cgroup */ @@ -742,7 +742,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); + rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -1050,8 +1050,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; - src_root->cgrp.subsys_mask &= ~(1 << ssid); - dst_root->cgrp.subsys_mask |= 1 << ssid; + src_root->subsys_mask &= ~(1 << ssid); + src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + + dst_root->subsys_mask |= 1 << ssid; + dst_root->cgrp.child_subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -1069,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, int ssid; for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1273,12 +1276,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; - removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1535,7 +1538,7 @@ retry: * subsystems) then they must match. */ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->cgrp.subsys_mask)) { + (opts.subsys_mask != root->subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -3658,8 +3661,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) cgroup_get(cgrp); css_get(css->parent); - cgrp->subsys_mask |= 1 << ss->id; - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", @@ -3780,13 +3781,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->cgrp.subsys_mask & (1 << ssid)) { + if (parent->child_subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; } } + cgrp->child_subsys_mask = parent->child_subsys_mask; + kernfs_activate(kn); mutex_unlock(&cgroup_mutex); @@ -3882,7 +3885,16 @@ static void css_killed_ref_fn(struct percpu_ref *ref) queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void __kill_css(struct cgroup_subsys_state *css) +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. 
+ */ +static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_tree_mutex); @@ -3911,28 +3923,6 @@ static void __kill_css(struct cgroup_subsys_state *css) percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } -/** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. - */ -static void kill_css(struct cgroup_subsys_state *css) -{ - struct cgroup *cgrp = css->cgroup; - - lockdep_assert_held(&cgroup_tree_mutex); - - /* if already killed, noop */ - if (cgrp->subsys_mask & (1 << css->ss->id)) { - cgrp->subsys_mask &= ~(1 << css->ss->id); - __kill_css(css); - } -} - /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4145,7 +4135,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; + cgrp_dfl_root.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4302,7 +4292,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", -- cgit v1.2.3 From aec3dfcb2e43892180ee053e8c260dcdeccf4392 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: introduce effective cgroup_subsys_state In the planned default unified hierarchy, controllers may get dynamically attached to and detached from a cgroup and a cgroup may not have csses for all the controllers associated with the hierarchy. When a cgroup doesn't have its own css for a given controller, the css of the nearest ancestor with the controller enabled will be used, which is called the effective css. This patch introduces cgroup_e_css() and for_each_e_css() to access the effective csses and convert compare_css_sets(), find_existing_css_set() and cgroup_migrate() to use the effective csses so that they can handle cgroups with partial csses correctly. This means that for two css_sets to be considered identical, they should have both matching csses and cgroups. compare_css_sets() already compares both, not for correctness but for optimization. As this now becomes a matter of correctness, update the comments accordingly. For all !default hierarchies, cgroup_e_css() always equals cgroup_css(), so this patch doesn't change behavior. While at it, fix incorrect locking comment for for_each_css(). 
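As a rough illustration of the lookup just described, the "effective" state can be modeled in plain userspace C; this is only a sketch with invented names (toy_cgroup, toy_effective_css), not the kernel implementation, and it leaves out the check that the controller is bound to the hierarchy at all:

#include <stdio.h>

#define MAX_SUBSYS 4

struct toy_cgroup {
	struct toy_cgroup *parent;
	unsigned int child_subsys_mask;	/* controllers enabled for children */
	void *css[MAX_SUBSYS];		/* per-controller state, may be NULL */
};

/*
 * Return the state of controller @ssid for @cgrp, falling back to the
 * nearest ancestor whose parent has the controller enabled -- the
 * "effective" state described above.
 */
static void *toy_effective_css(struct toy_cgroup *cgrp, int ssid)
{
	while (cgrp->parent &&
	       !(cgrp->parent->child_subsys_mask & (1u << ssid)))
		cgrp = cgrp->parent;
	return cgrp->css[ssid];
}

int main(void)
{
	int root_state = 1, child_state = 2;
	struct toy_cgroup root = { .child_subsys_mask = 0 };
	struct toy_cgroup child = { .parent = &root };
	struct toy_cgroup grandchild = { .parent = &child };

	root.css[0] = &root_state;

	/* controller 0 not enabled below the root: fall back to root's state */
	printf("%d\n", *(int *)toy_effective_css(&grandchild, 0));	/* 1 */

	/* enable controller 0 for root's children: child now has its own state */
	root.child_subsys_mask |= 1u << 0;
	child.css[0] = &child_state;
	printf("%d\n", *(int *)toy_effective_css(&grandchild, 0));	/* 2 */
	return 0;
}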
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 83 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f944619077f4..4eb2dd1bb5b1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -208,6 +208,34 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return &cgrp->dummy_css; } +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns the dummy_css) + * + * Similar to cgroup_css() but returns the effctive css, which is defined + * as the matching css of the nearest ancestor including self which has @ss + * enabled. If @ss is associated with the hierarchy @cgrp is on, this + * function is guaranteed to return non-NULL css. + */ +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + lockdep_assert_held(&cgroup_mutex); + + if (!ss) + return &cgrp->dummy_css; + + if (!(cgrp->root->subsys_mask & (1 << ss->id))) + return NULL; + + while (cgrp->parent && + !(cgrp->parent->child_subsys_mask & (1 << ss->id))) + cgrp = cgrp->parent; + + return cgroup_css(cgrp, ss); +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -273,7 +301,7 @@ static int notify_on_release(const struct cgroup *cgrp) * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * - * Should be called under cgroup_mutex. + * Should be called under cgroup_[tree_]mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ @@ -283,6 +311,20 @@ static int notify_on_release(const struct cgroup *cgrp) lockdep_is_held(&cgroup_mutex)))) { } \ else +/** + * for_each_e_css - iterate all effective css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_[tree_]mutex. + */ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ + else + /** * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor @@ -452,20 +494,20 @@ static bool compare_css_sets(struct css_set *cset, { struct list_head *l1, *l2; - if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { - /* Not all subsystems matched */ + /* + * On the default hierarchy, there can be csets which are + * associated with the same set of cgroups but different csses. + * Let's first ensure that csses match. + */ + if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; - } /* * Compare cgroup pointers in order to distinguish between - * different cgroups in heirarchies with no subsystems. We - * could get by with just this check alone (and skip the - * memcmp above) but on most setups the memcmp check will - * avoid the need for this more expensive check on almost all - * candidates. + * different cgroups in hierarchies. As different cgroups may + * share the same effective css, this comparison is always + * necessary. 
*/ - l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { @@ -530,13 +572,16 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { - /* Subsystem is in this hierarchy. So we want - * the subsystem state from the new - * cgroup */ - template[i] = cgroup_css(cgrp, ss); + /* + * @ss is in this hierarchy, so we want the + * effective css from @cgrp. + */ + template[i] = cgroup_e_css(cgrp, ss); } else { - /* Subsystem is not in this hierarchy, so we - * don't want to change the subsystem state */ + /* + * @ss is not in this hierarchy, so we don't want + * to change the css. + */ template[i] = old_cset->subsys[i]; } } @@ -1969,7 +2014,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, return 0; /* check that we can legitimately attach to the cgroup */ - for_each_css(css, i, cgrp) { + for_each_e_css(css, i, cgrp) { if (css->ss->can_attach) { ret = css->ss->can_attach(css, &tset); if (ret) { @@ -1999,7 +2044,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, */ tset.csets = &tset.dst_csets; - for_each_css(css, i, cgrp) + for_each_e_css(css, i, cgrp) if (css->ss->attach) css->ss->attach(css, &tset); @@ -2007,7 +2052,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, goto out_release_tset; out_cancel_attach: - for_each_css(css, i, cgrp) { + for_each_e_css(css, i, cgrp) { if (css == failed_css) break; if (css->ss->cancel_attach) -- cgit v1.2.3 From 2d8f243a5e6efa57fb7c46fe83fafa45b33d0ec2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: implement cgroup->e_csets[] On the default unified hierarchy, a cgroup may be associated with csses of its ancestors, which means that a css of a given cgroup may be associated with css_sets of descendant cgroups. This means that we can't walk all tasks associated with a css by iterating the css_sets associated with the cgroup as there are css_sets which are pointing to the css but linked on the descendants. This patch adds per-subsystem list heads cgroup->e_csets[]. Any css_set which is pointing to a css is linked to css->cgroup->e_csets[$SUBSYS_ID] through css_set->e_cset_node[$SUBSYS_ID]. The lists are protected by css_set_rwsem and will allow us to walk all css_sets associated with a given css so that we can find out all associated tasks. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4eb2dd1bb5b1..37d966289978 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -425,6 +425,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; + struct cgroup_subsys *ss; + int ssid; lockdep_assert_held(&css_set_rwsem); @@ -432,6 +434,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) return; /* This css_set is dead. 
unlink it and release cgroup refcounts */ + for_each_subsys(ss, ssid) + list_del(&cset->e_cset_node[ssid]); hash_del(&cset->hlist); css_set_count--; @@ -673,7 +677,9 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; + struct cgroup_subsys *ss; unsigned long key; + int ssid; lockdep_assert_held(&cgroup_mutex); @@ -724,10 +730,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_set_count++; - /* Add this cgroup group to the hash table */ + /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); + for_each_subsys(ss, ssid) + list_add_tail(&cset->e_cset_node[ssid], + &cset->subsys[ssid]->cgroup->e_csets[ssid]); + up_write(&css_set_rwsem); return cset; @@ -1028,7 +1038,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { struct cgroup_subsys *ss; - int ssid, ret; + int ssid, i, ret; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -1081,6 +1091,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, for_each_subsys(ss, ssid) { struct cgroup_root *src_root; struct cgroup_subsys_state *css; + struct css_set *cset; if (!(ss_mask & (1 << ssid))) continue; @@ -1095,6 +1106,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + list_move_tail(&cset->e_cset_node[ss->id], + &dst_root->cgrp.e_csets[ss->id]); + up_write(&css_set_rwsem); + src_root->subsys_mask &= ~(1 << ssid); src_root->cgrp.child_subsys_mask &= ~(1 << ssid); @@ -1417,6 +1434,9 @@ out_unlock: static void init_cgroup_housekeeping(struct cgroup *cgrp) { + struct cgroup_subsys *ss; + int ssid; + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); @@ -1425,6 +1445,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; + + for_each_subsys(ss, ssid) + INIT_LIST_HEAD(&cgrp->e_csets[ssid]); } static void init_cgroup_root(struct cgroup_root *root, @@ -4249,6 +4272,9 @@ int __init cgroup_init(void) if (!ss->early_init) cgroup_init_subsys(ss); + list_add_tail(&init_css_set.e_cset_node[ssid], + &cgrp_dfl_root.cgrp.e_csets[ssid]); + /* * cftype registration needs kmalloc and can't be done * during early_init. Register base cftypes separately. -- cgit v1.2.3 From 3b281afbc3a06cd69c54e6db1a04a8e73997723f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: make css_next_child() skip missing csses css_next_child() walks the children of the specified css. It does this by finding the next cgroup and then returning the requested css. On the default unified hierarchy, a cgroup may not have a css associated with it even if the hierarchy has the subsystem enabled. This patch updates css_next_child() so that it skips children without the requested css associated. 
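The skip-forward behavior described above can be sketched outside the kernel as a sibling walk that ignores entries without the requested state; the names here (toy_child, toy_next_child) are invented for the example and the list handling is simplified to a plain singly linked list rather than the kernel's RCU-protected sibling list:

#include <stdio.h>
#include <stddef.h>

struct toy_child {
	struct toy_child *next_sibling;
	void *css;			/* may be NULL on the unified hierarchy */
};

/* Return the sibling after @pos (or the first child if @pos is NULL) that
 * actually has a css, fast-forwarding past the ones that do not. */
static struct toy_child *toy_next_child(struct toy_child *pos,
					struct toy_child *first_child)
{
	struct toy_child *next = pos ? pos->next_sibling : first_child;

	while (next && !next->css)
		next = next->next_sibling;
	return next;
}

int main(void)
{
	int a = 1, c = 3;
	struct toy_child third = { NULL, &c };
	struct toy_child second = { &third, NULL };	/* no css: skipped */
	struct toy_child first = { &second, &a };

	for (struct toy_child *p = toy_next_child(NULL, &first); p;
	     p = toy_next_child(p, &first))
		printf("%d\n", *(int *)p->css);		/* prints 1 then 3 */
	return 0;
}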
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 37d966289978..0edc186cd545 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2708,10 +2708,19 @@ css_next_child(struct cgroup_subsys_state *pos_css, break; } - if (&next->sibling == &cgrp->children) - return NULL; + /* + * @next, if not pointing to the head, can be dereferenced and is + * the next sibling; however, it might have @ss disabled. If so, + * fast-forward to the next enabled one. + */ + while (&next->sibling != &cgrp->children) { + struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); - return cgroup_css(next, parent_css->ss); + if (next_css) + return next_css; + next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); + } + return NULL; } /** -- cgit v1.2.3 From 0f0a2b4fa6210147131082999f1f16d7fb79abf8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: reorganize css_task_iter This patch reorganizes css_task_iter so that adding effective css support is easier. * s/->cset_link/->cset_pos/ and s/->task/->task_pos/ for consistency * ->origin_css is used to determine whether the iteration reached the last css_set. Replace it with explicit ->cset_head so that css_advance_task_iter() doesn't have to know the termination condition directly. * css_task_iter_next() currently assumes that it's walking list of cgrp_cset_link and reaches into the current cset through the current link to determine the termination conditions for task walking. As this won't always be true for effective css walking, add ->tasks_head and ->mg_tasks_head and use them to control task walking so that css_task_iter_next() doesn't have to know how css_sets are being walked. This patch doesn't make any behavior changes. The iteration logic stays unchanged after the patch. 
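To make the new iterator layout easier to picture, here is a self-contained userspace model of the same idea: the iterator remembers which set it is in and where it is inside that set, and an advance helper skips empty sets. Everything below (toy_set, toy_iter and friends) is invented for illustration and uses arrays instead of the kernel's linked lists:

#include <stdio.h>

struct toy_set {
	const int *tasks;	int nr_tasks;
	const int *mg_tasks;	int nr_mg_tasks;
};

struct toy_iter {
	const struct toy_set *set_pos, *set_end;	/* which set we are in */
	int task_idx;					/* position inside it */
};

/* Skip ahead to the next set with at least one task (cf. css_advance_task_iter). */
static void toy_advance_set(struct toy_iter *it)
{
	while (it->set_pos < it->set_end &&
	       it->set_pos->nr_tasks + it->set_pos->nr_mg_tasks == 0)
		it->set_pos++;
	it->task_idx = 0;
}

static void toy_iter_start(struct toy_iter *it, const struct toy_set *sets, int n)
{
	it->set_pos = sets;
	it->set_end = sets + n;
	toy_advance_set(it);
}

/* Flatten the two per-set lists into one stream; return -1 when exhausted. */
static int toy_iter_next(struct toy_iter *it)
{
	while (it->set_pos < it->set_end) {
		const struct toy_set *s = it->set_pos;

		if (it->task_idx < s->nr_tasks)
			return s->tasks[it->task_idx++];
		if (it->task_idx < s->nr_tasks + s->nr_mg_tasks)
			return s->mg_tasks[it->task_idx++ - s->nr_tasks];

		it->set_pos++;
		toy_advance_set(it);
	}
	return -1;
}

int main(void)
{
	const int t0[] = { 1, 2 }, mg2[] = { 3 };
	const struct toy_set sets[] = {
		{ t0, 2, NULL, 0 },
		{ NULL, 0, NULL, 0 },	/* empty set: skipped */
		{ NULL, 0, mg2, 1 },
	};
	struct toy_iter it;
	int task;

	toy_iter_start(&it, sets, 3);
	while ((task = toy_iter_next(&it)) != -1)
		printf("%d\n", task);	/* 1 2 3 */
	return 0;
}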
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0edc186cd545..d48163b26196 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2857,27 +2857,30 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, */ static void css_advance_task_iter(struct css_task_iter *it) { - struct list_head *l = it->cset_link; + struct list_head *l = it->cset_pos; struct cgrp_cset_link *link; struct css_set *cset; /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_css->cgroup->cset_links) { - it->cset_link = NULL; + if (l == it->cset_head) { + it->cset_pos = NULL; return; } link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); - it->cset_link = l; + it->cset_pos = l; if (!list_empty(&cset->tasks)) - it->task = cset->tasks.next; + it->task_pos = cset->tasks.next; else - it->task = cset->mg_tasks.next; + it->task_pos = cset->mg_tasks.next; + + it->tasks_head = &cset->tasks; + it->mg_tasks_head = &cset->mg_tasks; } /** @@ -2903,8 +2906,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->origin_css = css; - it->cset_link = &css->cgroup->cset_links; + it->cset_pos = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); } @@ -2920,12 +2923,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; - struct list_head *l = it->task; - struct cgrp_cset_link *link = list_entry(it->cset_link, - struct cgrp_cset_link, cset_link); + struct list_head *l = it->task_pos; /* If the iterator cg is NULL, we have no tasks */ - if (!it->cset_link) + if (!it->cset_pos) return NULL; res = list_entry(l, struct task_struct, cg_list); @@ -2936,13 +2937,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) */ l = l->next; - if (l == &link->cset->tasks) - l = link->cset->mg_tasks.next; + if (l == it->tasks_head) + l = it->mg_tasks_head->next; - if (l == &link->cset->mg_tasks) + if (l == it->mg_tasks_head) css_advance_task_iter(it); else - it->task = l; + it->task_pos = l; return res; } -- cgit v1.2.3 From 3ebb2b6ef38875b866ec0118bfae7bc52afd0166 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: teach css_task_iter about effective csses Currently, css_task_iter iterates tasks associated with a css by visiting each css_set associated with the owning cgroup and walking tasks of each of them. This works fine for !unified hierarchies as each cgroup has its own css for each associated subsystem on the hierarchy; however, on the planned unified hierarchy, a cgroup may not have csses associated and its tasks would be considered associated with the matching css of the nearest ancestor which has the subsystem enabled. This means that on the default unified hierarchy, just walking all tasks associated with a cgroup isn't enough to walk all tasks which are associated with the specified css. If any of its children doesn't have the matching css enabled, task iteration should also include all tasks from the subtree. We already added cgroup->e_csets[] to list all css_sets effectively associated with a given css and walk css_sets on that list instead to achieve such iteration. 
This patch updates css_task_iter iteration such that it walks css_sets on cgroup->e_csets[] instead of cgroup->cset_links if iteration is requested on an non-dummy css. Thanks to the previous iteration update, this change can be achieved with the addition of css_task_iter->ss and minimal updates to css_advance_task_iter() and css_task_iter_start(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d48163b26196..ad28866ed44c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2868,8 +2868,14 @@ static void css_advance_task_iter(struct css_task_iter *it) it->cset_pos = NULL; return; } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; + + if (it->ss) { + cset = container_of(l, struct css_set, + e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); it->cset_pos = l; @@ -2906,7 +2912,13 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->cset_pos = &css->cgroup->cset_links; + it->ss = css->ss; + + if (it->ss) + it->cset_pos = &css->cgroup->e_csets[css->ss->id]; + else + it->cset_pos = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); -- cgit v1.2.3 From e32978031016f56be977a9a856ba4d9f447db51f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: cgroup->subsys[] should be cleared after the css is offlined After a css finishes offlining, offline_css() mistakenly performs RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css) which just sets the cgroup->subsys[] pointer to the current value. The intention was to clear it after offline is complete, not reassign the same value. Update it to assign NULL instead of the current value. This makes cgroup_css() to return NULL once offline is complete. All the existing users of the function either can handle NULL return already or guarantee that the css doesn't get offlined. While this is a bugfix, as css lifetime is currently tied to the cgroup it belongs to, this bug doesn't cause any actual problems. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad28866ed44c..83a8fff43d68 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3710,7 +3710,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); } /** -- cgit v1.2.3 From bd53d617b34c781dac8e22dbc75e8f182d918ecf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: allow cgroup creation and suppress automatic css creation in the unified hierarchy Now that effective css handling has been added and iterators updated accordingly, it's safe to allow cgroup creation in the default hierarchy. Unblock cgroup creation in the default hierarchy. As the default hierarchy will implement explicit enabling and disabling of controllers on each cgroup, suppress automatic css enabling on cgroup creation. 
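The difference in how the controller set propagates at creation time can be summed up in a few lines of userspace C; this is only a sketch with invented names and a boolean flag standing in for cgroup_on_dfl():

#include <stdbool.h>
#include <stdio.h>

struct toy_cgroup {
	struct toy_cgroup *parent;
	bool on_default_hierarchy;
	unsigned int child_subsys_mask;
};

/* Legacy hierarchies propagate the controller set automatically; the default
 * hierarchy starts a child empty and relies on an explicit knob to enable
 * controllers (the subtree_control interface added later in this series). */
static void toy_init_child(struct toy_cgroup *child, struct toy_cgroup *parent)
{
	child->parent = parent;
	child->on_default_hierarchy = parent->on_default_hierarchy;
	child->child_subsys_mask =
		parent->on_default_hierarchy ? 0 : parent->child_subsys_mask;
}

int main(void)
{
	struct toy_cgroup legacy_root = { .child_subsys_mask = 0x3 };
	struct toy_cgroup dfl_root = { .on_default_hierarchy = true,
				       .child_subsys_mask = 0x3 };
	struct toy_cgroup a, b;

	toy_init_child(&a, &legacy_root);
	toy_init_child(&b, &dfl_root);
	printf("legacy child mask=0x%x, default child mask=0x%x\n",
	       a.child_subsys_mask, b.child_subsys_mask);	/* 0x3, 0x0 */
	return 0;
}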
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 83a8fff43d68..2a4f88db3205 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1115,8 +1115,10 @@ static int rebind_subsystems(struct cgroup_root *dst_root, src_root->subsys_mask &= ~(1 << ssid); src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - dst_root->cgrp.child_subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) + dst_root->cgrp.child_subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -3786,13 +3788,6 @@ static long cgroup_create(struct cgroup *parent, const char *name, struct cgroup_subsys *ss; struct kernfs_node *kn; - /* - * XXX: The default hierarchy isn't fully implemented yet. Block - * !root cgroup creation on it for now. - */ - if (root == &cgrp_dfl_root) - return -EINVAL; - /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) @@ -3878,7 +3873,12 @@ static long cgroup_create(struct cgroup *parent, const char *name, } } - cgrp->child_subsys_mask = parent->child_subsys_mask; + /* + * On the default hierarchy, a child doesn't automatically inherit + * child_subsys_mask from the parent. Each is configured manually. + */ + if (!cgroup_on_dfl(cgrp)) + cgrp->child_subsys_mask = parent->child_subsys_mask; kernfs_activate(kn); -- cgit v1.2.3 From 6803c006282768ec850760766a6e4eb1a6ff87df Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: add css_set->dfl_cgrp To implement the unified hierarchy behavior, we'll need to be able to determine the associated cgroup on the default hierarchy from css_set. Let's add css_set->dfl_cgrp so that it can be accessed conveniently and efficiently. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a4f88db3205..c66bfc8ee8a7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -651,6 +651,10 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); + + if (cgroup_on_dfl(cgrp)) + cset->dfl_cgrp = cgrp; + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; -- cgit v1.2.3 From 7fd8c565d8a501486d63d7ee07fd6582e97db437 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: update subsystem rebind restrictions Because the default root couldn't have any non-root csses attached to it, rebinding away from it was always allowed; however, the default hierarchy will soon host the unified hierarchy and have non-root csses so the rebind restrictions need to be updated accordingly. Instead of special casing rebinding from the default hierarchy and then checking whether the source hierarchy has children cgroups, which implies non-root csses for !dfl hierarchies, simply check whether the source hierarchy has non-root csses for the subsystem using css_next_child(). 
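The rebind rule described above boils down to two checks, which can be modeled outside the kernel as follows; the structures and the child-count stand-in for the css_next_child() probe are invented for this sketch:

#include <stdbool.h>
#include <stdio.h>

struct toy_root {
	bool is_default;
	int nr_children_with_css;	/* stand-in for css_next_child(NULL, ...) */
};

/* Refuse if the controller still has non-root state in its current hierarchy,
 * or if neither side of the move is the default root. */
static bool toy_can_rebind(const struct toy_root *src, const struct toy_root *dst)
{
	if (src->nr_children_with_css > 0)
		return false;		/* corresponds to -EBUSY */
	if (!src->is_default && !dst->is_default)
		return false;		/* two non-default roots */
	return true;
}

int main(void)
{
	struct toy_root dfl = { .is_default = true };
	struct toy_root idle_legacy = { 0 };
	struct toy_root busy_legacy = { .nr_children_with_css = 2 };

	printf("%d %d %d\n",
	       toy_can_rebind(&idle_legacy, &dfl),		/* 1: allowed */
	       toy_can_rebind(&busy_legacy, &dfl),		/* 0: busy    */
	       toy_can_rebind(&idle_legacy, &busy_legacy));	/* 0: no dfl  */
	return 0;
}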
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c66bfc8ee8a7..15eb2273d80b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1051,16 +1051,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, if (!(ss_mask & (1 << ssid))) continue; - /* if @ss is on the dummy_root, we can always move it */ - if (ss->root == &cgrp_dfl_root) - continue; - - /* if @ss has non-root cgroups attached to it, can't move */ - if (!list_empty(&ss->root->cgrp.children)) + /* if @ss has non-root csses attached to it, can't move */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) return -EBUSY; /* can't move between two non-dummy roots either */ - if (dst_root != &cgrp_dfl_root) + if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; } -- cgit v1.2.3 From f817de98513d060023be4fa1d061b29a6515273e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: prepare migration path for unified hierarchy Unified hierarchy implementation would require re-migrating tasks onto the same cgroup on the default hierarchy to reflect updated effective csses. Update cgroup_migrate_prepare_dst() so that it accepts NULL as the destination cgrp. When NULL is specified, the destination is considered to be the cgroup on the default hierarchy associated with each css_set. After this change, the identity check in cgroup_migrate_add_src() isn't sufficient for noop detection as the associated csses may change without any cgroup association changing. The only way to tell whether a migration is noop or not is testing whether the source and destination csets are identical. The noop check in cgroup_migrate_add_src() is removed and a cset identity test is added to cgroup_migrate_prepare_dst(). If it's detected that source and destination csets are identical, the cset is removed from @preloaded_csets and all the migration nodes are cleared, which makes cgroup_migrate() ignore the cset. Also, make the function append the destination css_sets to @preloaded_csets so that destination css_sets always come after source css_sets. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15eb2273d80b..8c2835a9e192 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1902,10 +1902,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - /* nothing to do if this cset already belongs to the cgroup */ - if (src_cgrp == dst_cgrp) - return; - if (!list_empty(&src_cset->mg_preload_node)) return; @@ -1920,13 +1916,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup + * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * * Tasks are about to be moved to @dst_cgrp and all the source css_sets * have been preloaded to @preloaded_csets. This function looks up and - * pins all destination css_sets, links each to its source, and put them on - * @preloaded_csets. + * pins all destination css_sets, links each to its source, and append them + * to @preloaded_csets.
If @dst_cgrp is %NULL, the destination of each + * source css_set is assumed to be its cgroup on the default hierarchy. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed @@ -1937,19 +1934,34 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, struct list_head *preloaded_csets) { LIST_HEAD(csets); - struct css_set *src_cset; + struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, dst_cgrp); + dst_cset = find_css_set(src_cset, + dst_cgrp ?: src_cset->dfl_cgrp); if (!dst_cset) goto err; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + + /* + * If src cset equals dst, it's noop. Drop the src. + * cgroup_migrate() will skip the cset too. Note that we + * can't handle src == dst as some nodes are used by both. + */ + if (src_cset == dst_cset) { + src_cset->mg_src_cgrp = NULL; + list_del_init(&src_cset->mg_preload_node); + put_css_set(src_cset, false); + put_css_set(dst_cset, false); + continue; + } + src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_preload_node)) @@ -1958,7 +1970,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, put_css_set(dst_cset, false); } - list_splice(&csets, preloaded_csets); + list_splice_tail(&csets, preloaded_csets); return 0; err: cgroup_migrate_finish(&csets); -- cgit v1.2.3 From f8f22e53a262ebee37fc98004f16b066cf5bc125 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: implement dynamic subtree controller enable/disable on the default hierarchy cgroup is switching away from multiple hierarchies and will use one unified default hierarchy where controllers can be dynamically enabled and disabled per subtree. The default hierarchy will serve as the unified hierarchy to which all controllers are attached and a css on the default hierarchy would need to also serve the tasks of descendant cgroups which don't have the controller enabled - ie. the tree may be collapsed from leaf towards root when viewed from specific controllers. This has been implemented through effective css in the previous patches. This patch finally implements dynamic subtree controller enable/disable on the default hierarchy via a new knob - "cgroup.subtree_control" which controls which controllers are enabled on the child cgroups. Let's assume a hierarchy like the following. root - A - B - C \ D root's "cgroup.subtree_control" determines which controllers are enabled on A. A's on B. B's on C and D. This coincides with the fact that controllers on the immediate sub-level are used to distribute the resources of the parent. In fact, it's natural to assume that resource control knobs of a child belong to its parent. Enabling a controller in "cgroup.subtree_control" declares that distribution of the respective resources of the cgroup will be controlled. Note that this means that controller enable states are shared among siblings. The default hierarchy has an extra restriction - only cgroups which don't contain any task may have controllers enabled in "cgroup.subtree_control". 
Combined with the other properties of the default hierarchy, this guarantees that, from the view point of controllers, tasks are only on the leaf cgroups. In other words, only leaf csses may contain tasks. This rules out situations where child cgroups compete against internal tasks of the parent, which is a competition between two different types of entities without any clear way to determine resource distribution between the two. Different controllers handle it differently and all the implemented behaviors are ambiguous, ad-hoc, cumbersome and/or just wrong. Having this structural constraints imposed from cgroup core removes the burden from controller implementations and enables showing one consistent behavior across all controllers. When a controller is enabled or disabled, css associations for the controller in the subtrees of each child should be updated. After enabling, the whole subtree of a child should point to the new css of the child. After disabling, the whole subtree of a child should point to the cgroup's css. This is implemented by first updating cgroup states such that cgroup_e_css() result points to the appropriate css and then invoking cgroup_update_dfl_csses() which migrates all tasks in the affected subtrees to the self cgroup on the default hierarchy. * When read, "cgroup.subtree_control" lists all the currently enabled controllers on the children of the cgroup. * White-space separated list of controller names prefixed with either '+' or '-' can be written to "cgroup.subtree_control". The ones prefixed with '+' are enabled on the controller and '-' disabled. * A controller can be enabled iff the parent's "cgroup.subtree_control" enables it and disabled iff no child's "cgroup.subtree_control" has it enabled. * If a cgroup has tasks, no controller can be enabled via "cgroup.subtree_control". Likewise, if "cgroup.subtree_control" has some controllers enabled, tasks can't be migrated into the cgroup. * All controllers which aren't bound on other hierarchies are automatically associated with the root cgroup of the default hierarchy. All the controllers which are bound to the default hierarchy are listed in the read-only file "cgroup.controllers" in the root directory. * "cgroup.controllers" in all non-root cgroups is read-only file whose content is equal to that of "cgroup.subtree_control" of the parent. This indicates which controllers can be used in the cgroup's "cgroup.subtree_control". This is still experimental and there are some holes, one of which is that ->can_attach() failure during cgroup_update_dfl_csses() may leave the cgroups in an undefined state. The issues will be addressed by future patches. v2: Non-root cgroups now also have "cgroup.controllers". 
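The '+'/'-' token parsing that the new knob accepts can be illustrated with a small standalone userspace program; this is only a sketch with invented names and a fixed controller table, not the kernel's cgroup_subtree_control_write(), though it uses strsep() for tokenizing in the same way:

#include <stdio.h>
#include <string.h>

static const char * const toy_subsys_names[] = { "cpu", "memory", "io" };
#define TOY_NR_SUBSYS 3

/*
 * Parse a "+cpu -memory" style string into enable/disable bit masks.
 * Returns 0 on success, -1 on an unknown name or a missing +/- prefix.
 */
static int toy_parse_subtree_control(char *buf, unsigned int *enable,
				     unsigned int *disable)
{
	char *tok, *p = buf;

	*enable = *disable = 0;
	while ((tok = strsep(&p, " \t\n"))) {
		int ssid;

		if (!*tok)
			continue;	/* skip empty tokens from repeated spaces */
		for (ssid = 0; ssid < TOY_NR_SUBSYS; ssid++) {
			if (strcmp(tok + 1, toy_subsys_names[ssid]))
				continue;
			if (*tok == '+') {
				*enable |= 1u << ssid;
				*disable &= ~(1u << ssid);
			} else if (*tok == '-') {
				*disable |= 1u << ssid;
				*enable &= ~(1u << ssid);
			} else {
				return -1;
			}
			break;
		}
		if (ssid == TOY_NR_SUBSYS)
			return -1;
	}
	return 0;
}

int main(void)
{
	char buf[] = "+cpu -memory +io";
	unsigned int enable, disable;

	if (!toy_parse_subtree_control(buf, &enable, &disable))
		printf("enable=0x%x disable=0x%x\n", enable, disable); /* 0x5 0x2 */
	return 0;
}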
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 367 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 365 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c2835a9e192..809dd903ceb8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -182,6 +182,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); @@ -338,6 +340,14 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->children, sibling) \ + if (({ lockdep_assert_held(&cgroup_tree_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -1450,6 +1460,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); + + init_waitqueue_head(&cgrp->offline_waitq); } static void init_cgroup_root(struct cgroup_root *root, @@ -1938,6 +1950,14 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. 
+ */ + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent && + dst_cgrp->child_subsys_mask) + return -EBUSY; + /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; @@ -2303,6 +2323,326 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } +static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) +{ + struct cgroup_subsys *ss; + bool printed = false; + int ssid; + + for_each_subsys(ss, ssid) { + if (ss_mask & (1 << ssid)) { + if (printed) + seq_putc(seq, ' '); + seq_printf(seq, "%s", ss->name); + printed = true; + } + } + if (printed) + seq_putc(seq, '\n'); +} + +/* show controllers which are currently attached to the default hierarchy */ +static int cgroup_root_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask); + return 0; +} + +/* show controllers which are enabled from the parent */ +static int cgroup_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask); + return 0; +} + +/* show controllers which are enabled for a given cgroup's children */ +static int cgroup_subtree_control_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + return 0; +} + +/** + * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy + * @cgrp: root of the subtree to update csses for + * + * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) + * css associations need to be updated accordingly. This function looks up + * all css_sets which are attached to the subtree, creates the matching + * updated css_sets and migrates the tasks to the new ones. + */ +static int cgroup_update_dfl_csses(struct cgroup *cgrp) +{ + LIST_HEAD(preloaded_csets); + struct cgroup_subsys_state *css; + struct css_set *src_cset; + int ret; + + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); + + /* look up all csses currently attached to @cgrp's subtree */ + down_read(&css_set_rwsem); + css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + struct cgrp_cset_link *link; + + /* self is not affected by child_subsys_mask change */ + if (css->cgroup == cgrp) + continue; + + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, cgrp, + &preloaded_csets); + } + up_read(&css_set_rwsem); + + /* NULL dst indicates self on default hierarchy */ + ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + if (ret) + goto out_finish; + + list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + struct task_struct *last_task = NULL, *task; + + /* src_csets precede dst_csets, break on the first dst_cset */ + if (!src_cset->mg_src_cgrp) + break; + + /* + * All tasks in src_cset need to be migrated to the + * matching dst_cset. Empty it process by process. We + * walk tasks but migrate processes. The leader might even + * belong to a different cset but such src_cset would also + * be among the target src_csets because the default + * hierarchy enforces per-process membership. 
+ */ + while (true) { + down_read(&css_set_rwsem); + task = list_first_entry_or_null(&src_cset->tasks, + struct task_struct, cg_list); + if (task) { + task = task->group_leader; + WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); + get_task_struct(task); + } + up_read(&css_set_rwsem); + + if (!task) + break; + + /* guard against possible infinite loop */ + if (WARN(last_task == task, + "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) + goto out_finish; + last_task = task; + + threadgroup_lock(task); + /* raced against de_thread() from another thread? */ + if (!thread_group_leader(task)) { + threadgroup_unlock(task); + put_task_struct(task); + continue; + } + + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + + threadgroup_unlock(task); + put_task_struct(task); + + if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) + goto out_finish; + } + } + +out_finish: + cgroup_migrate_finish(&preloaded_csets); + return ret; +} + +/* change the enabled child controllers for a cgroup in the default hierarchy */ +static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, + struct cftype *cft, char *buffer) +{ + unsigned long enable_req = 0, disable_req = 0, enable, disable; + struct cgroup *cgrp = dummy_css->cgroup, *child; + struct cgroup_subsys *ss; + char *tok, *p; + int ssid, ret; + + /* + * Parse input - white space separated list of subsystem names + * prefixed with either + or -. + */ + p = buffer; + while ((tok = strsep(&p, " \t\n"))) { + for_each_subsys(ss, ssid) { + if (ss->disabled || strcmp(tok + 1, ss->name)) + continue; + + if (*tok == '+') { + enable_req |= 1 << ssid; + disable_req &= ~(1 << ssid); + } else if (*tok == '-') { + disable_req |= 1 << ssid; + enable_req &= ~(1 << ssid); + } else { + return -EINVAL; + } + break; + } + if (ssid == CGROUP_SUBSYS_COUNT) + return -EINVAL; + } + + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. cgroup_lock_live_group() already provides enough + * protection. Ensure @cgrp stays accessible and break the + * active_ref protection. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(cgrp->control_kn); +retry: + enable = enable_req; + disable = disable_req; + + mutex_lock(&cgroup_tree_mutex); + + for_each_subsys(ss, ssid) { + if (enable & (1 << ssid)) { + if (cgrp->child_subsys_mask & (1 << ssid)) { + enable &= ~(1 << ssid); + continue; + } + + /* + * Because css offlining is asynchronous, userland + * might try to re-enable the same controller while + * the previous instance is still around. In such + * cases, wait till it's gone using offline_waitq. + */ + cgroup_for_each_live_child(child, cgrp) { + wait_queue_t wait; + + if (!cgroup_css(child, ss)) + continue; + + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&cgroup_tree_mutex); + schedule(); + finish_wait(&child->offline_waitq, &wait); + goto retry; + } + + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgrp->parent && + !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock_tree; + } + } else if (disable & (1 << ssid)) { + if (!(cgrp->child_subsys_mask & (1 << ssid))) { + disable &= ~(1 << ssid); + continue; + } + + /* a child has it enabled? 
*/ + cgroup_for_each_live_child(child, cgrp) { + if (child->child_subsys_mask & (1 << ssid)) { + ret = -EBUSY; + goto out_unlock_tree; + } + } + } + } + + if (!enable && !disable) { + ret = 0; + goto out_unlock_tree; + } + + if (!cgroup_lock_live_group(cgrp)) { + ret = -ENODEV; + goto out_unlock_tree; + } + + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Create csses for enables and update child_subsys_mask. This + * changes cgroup_e_css() results which in turn makes the + * subsequent cgroup_update_dfl_csses() associate all tasks in the + * subtree to the updated csses. + */ + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + ret = create_css(child, ss); + if (ret) + goto err_undo_css; + } + } + + cgrp->child_subsys_mask |= enable; + cgrp->child_subsys_mask &= ~disable; + + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + goto err_undo_css; + + /* all tasks are now migrated away from the old csses, kill them */ + for_each_subsys(ss, ssid) { + if (!(disable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) + kill_css(cgroup_css(child, ss)); + } + + kernfs_activate(cgrp->kn); + ret = 0; +out_unlock: + mutex_unlock(&cgroup_mutex); +out_unlock_tree: + mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(cgrp->control_kn); + cgroup_put(cgrp); + return ret; + +err_undo_css: + cgrp->child_subsys_mask &= ~enable; + cgrp->child_subsys_mask |= disable; + + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + if (css) + kill_css(css); + } + } + goto out_unlock; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -2462,9 +2802,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return PTR_ERR(kn); ret = cgroup_kn_set_ugid(kn); - if (ret) + if (ret) { kernfs_remove(kn); - return ret; + return ret; + } + + if (cft->seq_show == cgroup_subtree_control_show) + cgrp->control_kn = kn; + return 0; } /** @@ -3557,6 +3902,22 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_root_controllers_show, + }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_controllers_show, + }, + { + .name = "cgroup.subtree_control", + .flags = CFTYPE_ONLY_ON_DFL, + .seq_show = cgroup_subtree_control_show, + .write_string = cgroup_subtree_control_write, + }, /* * Historical crazy stuff. These don't have "cgroup." prefix and @@ -3725,6 +4086,8 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + + wake_up_all(&css->cgroup->offline_waitq); } /** -- cgit v1.2.3 From be8f274323c26ddc7e6fd6c44254b7abcdbe6389 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:16:58 +0900 Subject: kprobes: Prohibit probing on .entry.text code .entry.text is a code area which is used for interrupt/syscall entries, which includes many sensitive code. 
Thus, it is better to prohibit probing on all of such code instead of a part of that. Since some symbols are already registered on kprobe blacklist, this also removes them from the blacklist. Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Borislav Petkov Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jan Kiszka Cc: Jiri Kosina Cc: Jonathan Lebon Cc: Seiji Aguchi Link: http://lkml.kernel.org/r/20140417081658.26341.57354.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ceeadfcabb76..5b5ac76671e7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -96,9 +96,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) static struct kprobe_blackpoint kprobe_blacklist[] = { {"preempt_schedule",}, {"native_get_debugreg",}, - {"irq_entries_start",}, - {"common_interrupt",}, - {"mcount",}, /* mcount can be called from everywhere */ {NULL} /* Terminator */ }; @@ -1324,12 +1321,18 @@ out: return ret; } +bool __weak arch_within_kprobe_blacklist(unsigned long addr) +{ + /* The __kprobes marked functions and entry code must not be probed */ + return addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end; +} + static int __kprobes in_kprobes_functions(unsigned long addr) { struct kprobe_blackpoint *kb; - if (addr >= (unsigned long)__kprobes_text_start && - addr < (unsigned long)__kprobes_text_end) + if (arch_within_kprobe_blacklist(addr)) return -EINVAL; /* * If there exists a kprobe_blacklist, verify and -- cgit v1.2.3 From 376e242429bf8539ef39a080ac113c8799840b13 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:17:05 +0900 Subject: kprobes: Introduce NOKPROBE_SYMBOL() macro to maintain kprobes blacklist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce NOKPROBE_SYMBOL() macro which builds a kprobes blacklist at kernel build time. The usage of this macro is similar to EXPORT_SYMBOL(), placed after the function definition: NOKPROBE_SYMBOL(function); Since this macro will inhibit inlining of static/inline functions, this patch also introduces a nokprobe_inline macro for static/inline functions. In this case, we must use NOKPROBE_SYMBOL() for the inline function caller. When CONFIG_KPROBES=y, the macro stores the given function address in the "_kprobe_blacklist" section. Since the data structures are not fully initialized by the macro (because there is no "size" information), those are re-initialized at boot time by using kallsyms. Signed-off-by: Masami Hiramatsu Link: http://lkml.kernel.org/r/20140417081705.26341.96719.stgit@ltc230.yrl.intra.hitachi.co.jp Cc: Alok Kataria Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: Christopher Li Cc: Chris Wright Cc: David S. 
Miller Cc: Jan-Simon Möller Cc: Jeremy Fitzhardinge Cc: Linus Torvalds Cc: Randy Dunlap Cc: Rusty Russell Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-sparse@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 100 +++++++++++++++++++++++++++------------------------- kernel/sched/core.c | 1 + 2 files changed, 53 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5b5ac76671e7..5ffc6875d2a7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -86,18 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } -/* - * Normally, functions that we'd want to prohibit kprobes in, are marked - * __kprobes. But, there are cases where such functions already belong to - * a different section (__sched for preempt_schedule) - * - * For such cases, we now have a blacklist - */ -static struct kprobe_blackpoint kprobe_blacklist[] = { - {"preempt_schedule",}, - {"native_get_debugreg",}, - {NULL} /* Terminator */ -}; +/* Blacklist -- list of struct kprobe_blacklist_entry */ +static LIST_HEAD(kprobe_blacklist); #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* @@ -1328,24 +1318,22 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static int __kprobes in_kprobes_functions(unsigned long addr) +static bool __kprobes within_kprobe_blacklist(unsigned long addr) { - struct kprobe_blackpoint *kb; + struct kprobe_blacklist_entry *ent; if (arch_within_kprobe_blacklist(addr)) - return -EINVAL; + return true; /* * If there exists a kprobe_blacklist, verify and * fail any probe registration in the prohibited area */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - if (kb->start_addr) { - if (addr >= kb->start_addr && - addr < (kb->start_addr + kb->range)) - return -EINVAL; - } + list_for_each_entry(ent, &kprobe_blacklist, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) + return true; } - return 0; + + return false; } /* @@ -1436,7 +1424,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr) || + within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; goto out; @@ -2022,6 +2010,38 @@ void __kprobes dump_kprobe(struct kprobe *kp) kp->symbol_name, kp->addr, kp->offset); } +/* + * Lookup and populate the kprobe_blacklist. + * + * Unlike the kretprobe blacklist, we'll need to determine + * the range of addresses that belong to the said functions, + * since a kprobe need not necessarily be at the beginning + * of a function. 
+ */ +static int __init populate_kprobe_blacklist(unsigned long *start, + unsigned long *end) +{ + unsigned long *iter; + struct kprobe_blacklist_entry *ent; + unsigned long offset = 0, size = 0; + + for (iter = start; iter < end; iter++) { + if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { + pr_err("Failed to find blacklist %p\n", (void *)*iter); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->start_addr = *iter; + ent->end_addr = *iter + size; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &kprobe_blacklist); + } + return 0; +} + /* Module notifier call back, checking kprobes on the module */ static int __kprobes kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2065,14 +2085,13 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +/* Markers of _kprobe_blacklist section */ +extern unsigned long __start_kprobe_blacklist[]; +extern unsigned long __stop_kprobe_blacklist[]; + static int __init init_kprobes(void) { int i, err = 0; - unsigned long offset = 0, size = 0; - char *modname, namebuf[KSYM_NAME_LEN]; - const char *symbol_name; - void *addr; - struct kprobe_blackpoint *kb; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ @@ -2082,26 +2101,11 @@ static int __init init_kprobes(void) raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } - /* - * Lookup and populate the kprobe_blacklist. - * - * Unlike the kretprobe blacklist, we'll need to determine - * the range of addresses that belong to the said functions, - * since a kprobe need not necessarily be at the beginning - * of a function. - */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - kprobe_lookup_name(kb->name, addr); - if (!addr) - continue; - - kb->start_addr = (unsigned long)addr; - symbol_name = kallsyms_lookup(kb->start_addr, - &size, &offset, &modname, namebuf); - if (!symbol_name) - kb->range = 0; - else - kb->range = size; + err = populate_kprobe_blacklist(__start_kprobe_blacklist, + __stop_kprobe_blacklist); + if (err) { + pr_err("kprobes: failed to populate blacklist: %d\n", err); + pr_err("Please take care of using kprobes.\n"); } if (kretprobe_blacklist_size) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..6863631e8cd0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2804,6 +2804,7 @@ asmlinkage void __sched notrace preempt_schedule(void) barrier(); } while (need_resched()); } +NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #endif /* CONFIG_PREEMPT */ -- cgit v1.2.3 From 55479f64756fc508182a05e35e52f01395a50d4d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:17:54 +0900 Subject: kprobes: Allow probe on some kprobe functions There is no need to prohibit probing on the functions used for preparation, registeration, optimization, controll etc. Those are safely probed because those are not invoked from breakpoint/fault/debug handlers, there is no chance to cause recursive exceptions. 
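For reference, the range-based blacklist built by the preceding patch (symbol start plus size from kallsyms, then a containment check on the probe address) can be modeled in plain userspace C; the structure name, table and addresses below are invented for this sketch and do not come from the kernel:

#include <stdbool.h>
#include <stdio.h>

struct toy_blacklist_entry {
	unsigned long start_addr;
	unsigned long end_addr;
};

/* A probe address is rejected if it falls inside any [start, end) entry. */
static bool toy_within_blacklist(const struct toy_blacklist_entry *tbl,
				 int nr, unsigned long addr)
{
	for (int i = 0; i < nr; i++)
		if (addr >= tbl[i].start_addr && addr < tbl[i].end_addr)
			return true;
	return false;
}

int main(void)
{
	/* Pretend these came from a symbol-table lookup (start + size). */
	const struct toy_blacklist_entry tbl[] = {
		{ 0x1000, 0x1000 + 0x80 },	/* e.g. an entry-code symbol */
		{ 0x2000, 0x2000 + 0x40 },	/* e.g. a breakpoint handler */
	};

	printf("%d\n", toy_within_blacklist(tbl, 2, 0x1010));	/* 1: rejected */
	printf("%d\n", toy_within_blacklist(tbl, 2, 0x3000));	/* 0: allowed  */
	return 0;
}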
Following functions are now removed from the kprobes blacklist: add_new_kprobe aggr_kprobe_disabled alloc_aggr_kprobe alloc_aggr_kprobe arm_all_kprobes __arm_kprobe arm_kprobe arm_kprobe_ftrace check_kprobe_address_safe collect_garbage_slots collect_garbage_slots collect_one_slot debugfs_kprobe_init __disable_kprobe disable_kprobe disarm_all_kprobes __disarm_kprobe disarm_kprobe disarm_kprobe_ftrace do_free_cleaned_kprobes do_optimize_kprobes do_unoptimize_kprobes enable_kprobe force_unoptimize_kprobe free_aggr_kprobe free_aggr_kprobe __free_insn_slot __get_insn_slot get_optimized_kprobe __get_valid_kprobe init_aggr_kprobe init_aggr_kprobe in_nokprobe_functions kick_kprobe_optimizer kill_kprobe kill_optimized_kprobe kprobe_addr kprobe_optimizer kprobe_queued kprobe_seq_next kprobe_seq_start kprobe_seq_stop kprobes_module_callback kprobes_open optimize_all_kprobes optimize_kprobe prepare_kprobe prepare_optimized_kprobe register_aggr_kprobe register_jprobe register_jprobes register_kprobe register_kprobes register_kretprobe register_kretprobe register_kretprobes register_kretprobes report_probe show_kprobe_addr try_to_optimize_kprobe unoptimize_all_kprobes unoptimize_kprobe unregister_jprobe unregister_jprobes unregister_kprobe __unregister_kprobe_bottom unregister_kprobes __unregister_kprobe_top unregister_kretprobe unregister_kretprobe unregister_kretprobes unregister_kretprobes wait_for_kprobe_optimizer I tested those functions by putting kprobes on all instructions in the functions with the bash script I sent to LKML. See: https://lkml.org/lkml/2014/3/27/33 Signed-off-by: Masami Hiramatsu Link: http://lkml.kernel.org/r/20140417081753.26341.57889.stgit@ltc230.yrl.intra.hitachi.co.jp Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: fche@redhat.com Cc: systemtap@sourceware.org Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 153 +++++++++++++++++++++++++++---------------------------- 1 file changed, 76 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5ffc6875d2a7..4db2cc616f50 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -138,13 +138,13 @@ struct kprobe_insn_cache kprobe_insn_slots = { .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, }; -static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); +static int collect_garbage_slots(struct kprobe_insn_cache *c); /** * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) +kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; kprobe_opcode_t *slot = NULL; @@ -201,7 +201,7 @@ out: } /* Return 1 if all garbages are collected, otherwise 0. 
*/ -static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) +static int collect_one_slot(struct kprobe_insn_page *kip, int idx) { kip->slot_used[idx] = SLOT_CLEAN; kip->nused--; @@ -222,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) return 0; } -static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) +static int collect_garbage_slots(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip, *next; @@ -244,8 +244,8 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) return 0; } -void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) +void __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; @@ -361,7 +361,7 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) } /* Free optimized instructions and optimized_kprobe */ -static __kprobes void free_aggr_kprobe(struct kprobe *p) +static void free_aggr_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -399,7 +399,7 @@ static inline int kprobe_disarmed(struct kprobe *p) } /* Return true(!0) if the probe is queued on (un)optimizing lists */ -static int __kprobes kprobe_queued(struct kprobe *p) +static int kprobe_queued(struct kprobe *p) { struct optimized_kprobe *op; @@ -415,7 +415,7 @@ static int __kprobes kprobe_queued(struct kprobe *p) * Return an optimized kprobe whose optimizing code replaces * instructions including addr (exclude breakpoint). */ -static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +static struct kprobe *get_optimized_kprobe(unsigned long addr) { int i; struct kprobe *p = NULL; @@ -447,7 +447,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); * Optimize (replace a breakpoint with a jump) kprobes listed on * optimizing_list. */ -static __kprobes void do_optimize_kprobes(void) +static void do_optimize_kprobes(void) { /* Optimization never be done when disarmed */ if (kprobes_all_disarmed || !kprobes_allow_optimization || @@ -475,7 +475,7 @@ static __kprobes void do_optimize_kprobes(void) * Unoptimize (replace a jump with a breakpoint and remove the breakpoint * if need) kprobes listed on unoptimizing_list. 
*/ -static __kprobes void do_unoptimize_kprobes(void) +static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -507,7 +507,7 @@ static __kprobes void do_unoptimize_kprobes(void) } /* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(void) +static void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -519,13 +519,13 @@ static __kprobes void do_free_cleaned_kprobes(void) } /* Start optimizer after OPTIMIZE_DELAY passed */ -static __kprobes void kick_kprobe_optimizer(void) +static void kick_kprobe_optimizer(void) { schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); } /* Kprobe jump optimizer */ -static __kprobes void kprobe_optimizer(struct work_struct *work) +static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ @@ -561,7 +561,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) } /* Wait for completing optimization and unoptimization */ -static __kprobes void wait_for_kprobe_optimizer(void) +static void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); @@ -580,7 +580,7 @@ static __kprobes void wait_for_kprobe_optimizer(void) } /* Optimize kprobe if p is ready to be optimized */ -static __kprobes void optimize_kprobe(struct kprobe *p) +static void optimize_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -614,7 +614,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p) } /* Short cut to direct unoptimizing */ -static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) +static void force_unoptimize_kprobe(struct optimized_kprobe *op) { get_online_cpus(); arch_unoptimize_kprobe(op); @@ -624,7 +624,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) } /* Unoptimize a kprobe if p is optimized */ -static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) +static void unoptimize_kprobe(struct kprobe *p, bool force) { struct optimized_kprobe *op; @@ -684,7 +684,7 @@ static void reuse_unused_kprobe(struct kprobe *ap) } /* Remove optimized instructions */ -static void __kprobes kill_optimized_kprobe(struct kprobe *p) +static void kill_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -710,7 +710,7 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p) } /* Try to prepare optimized instructions */ -static __kprobes void prepare_optimized_kprobe(struct kprobe *p) +static void prepare_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -719,7 +719,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ -static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -734,13 +734,13 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) return &op->kp; } -static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); +static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); /* * Prepare an optimized_kprobe and optimize it * NOTE: p must be a normal registered kprobe */ -static __kprobes void try_to_optimize_kprobe(struct kprobe *p) +static void try_to_optimize_kprobe(struct kprobe *p) { struct kprobe *ap; struct optimized_kprobe *op; @@ -774,7 +774,7 @@ out: } #ifdef CONFIG_SYSCTL -static void __kprobes optimize_all_kprobes(void) +static void optimize_all_kprobes(void) { 
struct hlist_head *head; struct kprobe *p; @@ -797,7 +797,7 @@ out: mutex_unlock(&kprobe_mutex); } -static void __kprobes unoptimize_all_kprobes(void) +static void unoptimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; @@ -848,7 +848,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, #endif /* CONFIG_SYSCTL */ /* Put a breakpoint for a probe. Must be called with text_mutex locked */ -static void __kprobes __arm_kprobe(struct kprobe *p) +static void __arm_kprobe(struct kprobe *p) { struct kprobe *_p; @@ -863,7 +863,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p) } /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ -static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) +static void __disarm_kprobe(struct kprobe *p, bool reopt) { struct kprobe *_p; @@ -898,13 +898,13 @@ static void reuse_unused_kprobe(struct kprobe *ap) BUG_ON(kprobe_unused(ap)); } -static __kprobes void free_aggr_kprobe(struct kprobe *p) +static void free_aggr_kprobe(struct kprobe *p) { arch_remove_kprobe(p); kfree(p); } -static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { return kzalloc(sizeof(struct kprobe), GFP_KERNEL); } @@ -918,7 +918,7 @@ static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { static int kprobe_ftrace_enabled; /* Must ensure p->addr is really on ftrace */ -static int __kprobes prepare_kprobe(struct kprobe *p) +static int prepare_kprobe(struct kprobe *p) { if (!kprobe_ftrace(p)) return arch_prepare_kprobe(p); @@ -927,7 +927,7 @@ static int __kprobes prepare_kprobe(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void __kprobes arm_kprobe_ftrace(struct kprobe *p) +static void arm_kprobe_ftrace(struct kprobe *p) { int ret; @@ -942,7 +942,7 @@ static void __kprobes arm_kprobe_ftrace(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) +static void disarm_kprobe_ftrace(struct kprobe *p) { int ret; @@ -962,7 +962,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) #endif /* Arm a kprobe with text_mutex */ -static void __kprobes arm_kprobe(struct kprobe *kp) +static void arm_kprobe(struct kprobe *kp) { if (unlikely(kprobe_ftrace(kp))) { arm_kprobe_ftrace(kp); @@ -979,7 +979,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp) } /* Disarm a kprobe with text_mutex */ -static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) +static void disarm_kprobe(struct kprobe *kp, bool reopt) { if (unlikely(kprobe_ftrace(kp))) { disarm_kprobe_ftrace(kp); @@ -1189,7 +1189,7 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) * Add the new probe to ap->list. Fail if this is the * second jprobe at the address - two jprobes can't coexist */ -static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) +static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); @@ -1213,7 +1213,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) * Fill in the required fields of the "manager kprobe". 
Replace the * earlier kprobe in the hlist with the manager kprobe */ -static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) +static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { /* Copy p's insn slot to ap */ copy_kprobe(p, ap); @@ -1239,8 +1239,7 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) * This is the second or subsequent kprobe at the address - handle * the intricacies */ -static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, - struct kprobe *p) +static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) { int ret = 0; struct kprobe *ap = orig_p; @@ -1318,7 +1317,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static bool __kprobes within_kprobe_blacklist(unsigned long addr) +static bool within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; @@ -1342,7 +1341,7 @@ static bool __kprobes within_kprobe_blacklist(unsigned long addr) * This returns encoded errors if it fails to look up symbol or invalid * combination of parameters. */ -static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) +static kprobe_opcode_t *kprobe_addr(struct kprobe *p) { kprobe_opcode_t *addr = p->addr; @@ -1365,7 +1364,7 @@ invalid: } /* Check passed kprobe is valid and return kprobe in kprobe_table. */ -static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) +static struct kprobe *__get_valid_kprobe(struct kprobe *p) { struct kprobe *ap, *list_p; @@ -1397,8 +1396,8 @@ static inline int check_kprobe_rereg(struct kprobe *p) return ret; } -static __kprobes int check_kprobe_address_safe(struct kprobe *p, - struct module **probed_mod) +static int check_kprobe_address_safe(struct kprobe *p, + struct module **probed_mod) { int ret = 0; unsigned long ftrace_addr; @@ -1460,7 +1459,7 @@ out: return ret; } -int __kprobes register_kprobe(struct kprobe *p) +int register_kprobe(struct kprobe *p) { int ret; struct kprobe *old_p; @@ -1522,7 +1521,7 @@ out: EXPORT_SYMBOL_GPL(register_kprobe); /* Check if all probes on the aggrprobe are disabled */ -static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) +static int aggr_kprobe_disabled(struct kprobe *ap) { struct kprobe *kp; @@ -1538,7 +1537,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) } /* Disable one kprobe: Make sure called under kprobe_mutex is locked */ -static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) +static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; @@ -1565,7 +1564,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) /* * Unregister a kprobe without a scheduler synchronization. */ -static int __kprobes __unregister_kprobe_top(struct kprobe *p) +static int __unregister_kprobe_top(struct kprobe *p) { struct kprobe *ap, *list_p; @@ -1622,7 +1621,7 @@ disarmed: return 0; } -static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) +static void __unregister_kprobe_bottom(struct kprobe *p) { struct kprobe *ap; @@ -1638,7 +1637,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) /* Otherwise, do nothing. 
*/ } -int __kprobes register_kprobes(struct kprobe **kps, int num) +int register_kprobes(struct kprobe **kps, int num) { int i, ret = 0; @@ -1656,13 +1655,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num) } EXPORT_SYMBOL_GPL(register_kprobes); -void __kprobes unregister_kprobe(struct kprobe *p) +void unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } EXPORT_SYMBOL_GPL(unregister_kprobe); -void __kprobes unregister_kprobes(struct kprobe **kps, int num) +void unregister_kprobes(struct kprobe **kps, int num) { int i; @@ -1691,7 +1690,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -int __kprobes register_jprobes(struct jprobe **jps, int num) +int register_jprobes(struct jprobe **jps, int num) { struct jprobe *jp; int ret = 0, i; @@ -1722,19 +1721,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) } EXPORT_SYMBOL_GPL(register_jprobes); -int __kprobes register_jprobe(struct jprobe *jp) +int register_jprobe(struct jprobe *jp) { return register_jprobes(&jp, 1); } EXPORT_SYMBOL_GPL(register_jprobe); -void __kprobes unregister_jprobe(struct jprobe *jp) +void unregister_jprobe(struct jprobe *jp) { unregister_jprobes(&jp, 1); } EXPORT_SYMBOL_GPL(unregister_jprobe); -void __kprobes unregister_jprobes(struct jprobe **jps, int num) +void unregister_jprobes(struct jprobe **jps, int num) { int i; @@ -1799,7 +1798,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, return 0; } -int __kprobes register_kretprobe(struct kretprobe *rp) +int register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -1852,7 +1851,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) } EXPORT_SYMBOL_GPL(register_kretprobe); -int __kprobes register_kretprobes(struct kretprobe **rps, int num) +int register_kretprobes(struct kretprobe **rps, int num) { int ret = 0, i; @@ -1870,13 +1869,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num) } EXPORT_SYMBOL_GPL(register_kretprobes); -void __kprobes unregister_kretprobe(struct kretprobe *rp) +void unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } EXPORT_SYMBOL_GPL(unregister_kretprobe); -void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +void unregister_kretprobes(struct kretprobe **rps, int num) { int i; @@ -1899,24 +1898,24 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) EXPORT_SYMBOL_GPL(unregister_kretprobes); #else /* CONFIG_KRETPROBES */ -int __kprobes register_kretprobe(struct kretprobe *rp) +int register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } EXPORT_SYMBOL_GPL(register_kretprobe); -int __kprobes register_kretprobes(struct kretprobe **rps, int num) +int register_kretprobes(struct kretprobe **rps, int num) { return -ENOSYS; } EXPORT_SYMBOL_GPL(register_kretprobes); -void __kprobes unregister_kretprobe(struct kretprobe *rp) +void unregister_kretprobe(struct kretprobe *rp) { } EXPORT_SYMBOL_GPL(unregister_kretprobe); -void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +void unregister_kretprobes(struct kretprobe **rps, int num) { } EXPORT_SYMBOL_GPL(unregister_kretprobes); @@ -1930,7 +1929,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, #endif /* CONFIG_KRETPROBES */ /* Set the kprobe gone and remove its instruction buffer. 
*/ -static void __kprobes kill_kprobe(struct kprobe *p) +static void kill_kprobe(struct kprobe *p) { struct kprobe *kp; @@ -1954,7 +1953,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) } /* Disable one kprobe */ -int __kprobes disable_kprobe(struct kprobe *kp) +int disable_kprobe(struct kprobe *kp) { int ret = 0; @@ -1970,7 +1969,7 @@ int __kprobes disable_kprobe(struct kprobe *kp) EXPORT_SYMBOL_GPL(disable_kprobe); /* Enable one kprobe */ -int __kprobes enable_kprobe(struct kprobe *kp) +int enable_kprobe(struct kprobe *kp) { int ret = 0; struct kprobe *p; @@ -2043,8 +2042,8 @@ static int __init populate_kprobe_blacklist(unsigned long *start, } /* Module notifier call back, checking kprobes on the module */ -static int __kprobes kprobes_module_callback(struct notifier_block *nb, - unsigned long val, void *data) +static int kprobes_module_callback(struct notifier_block *nb, + unsigned long val, void *data) { struct module *mod = data; struct hlist_head *head; @@ -2145,7 +2144,7 @@ static int __init init_kprobes(void) } #ifdef CONFIG_DEBUG_FS -static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, +static void report_probe(struct seq_file *pi, struct kprobe *p, const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; @@ -2174,12 +2173,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, (kprobe_ftrace(pp) ? "[FTRACE]" : "")); } -static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) +static void *kprobe_seq_start(struct seq_file *f, loff_t *pos) { return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; } -static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) +static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; if (*pos >= KPROBE_TABLE_SIZE) @@ -2187,12 +2186,12 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) return pos; } -static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) +static void kprobe_seq_stop(struct seq_file *f, void *v) { /* Nothing to do */ } -static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) +static int show_kprobe_addr(struct seq_file *pi, void *v) { struct hlist_head *head; struct kprobe *p, *kp; @@ -2223,7 +2222,7 @@ static const struct seq_operations kprobes_seq_ops = { .show = show_kprobe_addr }; -static int __kprobes kprobes_open(struct inode *inode, struct file *filp) +static int kprobes_open(struct inode *inode, struct file *filp) { return seq_open(filp, &kprobes_seq_ops); } @@ -2235,7 +2234,7 @@ static const struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; -static void __kprobes arm_all_kprobes(void) +static void arm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; @@ -2263,7 +2262,7 @@ already_enabled: return; } -static void __kprobes disarm_all_kprobes(void) +static void disarm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; @@ -2347,7 +2346,7 @@ static const struct file_operations fops_kp = { .llseek = default_llseek, }; -static int __kprobes debugfs_kprobe_init(void) +static int __init debugfs_kprobe_init(void) { struct dentry *dir, *file; unsigned int value = 1; -- cgit v1.2.3 From fbc1963d2c1c4eb4651132a2c5c9d6111ada17d3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:00 +0900 Subject: kprobes, ftrace: Allow probing on some functions There is no need to prohibit probing on the functions used for preparation and uprobe only fetch functions. 
Those are safely probed because those are not invoked from kprobe's breakpoint/fault/debug handlers. So there is no chance to cause recursive exceptions. Following functions are now removed from the kprobes blacklist: update_bitfield_fetch_param free_bitfield_fetch_param kprobe_register FETCH_FUNC_NAME(stack, type) in trace_uprobe.c FETCH_FUNC_NAME(memory, type) in trace_uprobe.c FETCH_FUNC_NAME(memory, string) in trace_uprobe.c FETCH_FUNC_NAME(memory, string_size) in trace_uprobe.c FETCH_FUNC_NAME(file_offset, type) in trace_uprobe.c Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140417081800.26341.56504.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 5 ++--- kernel/trace/trace_probe.c | 4 ++-- kernel/trace/trace_uprobe.c | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 903ae28962be..aa5f0bfcdf7b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1196,9 +1196,8 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe * lockless, but we can't race with this __init function. */ -static __kprobes -int kprobe_register(struct ftrace_event_call *event, - enum trace_reg type, void *data) +static int kprobe_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) { struct trace_kprobe *tk = (struct trace_kprobe *)event->data; struct ftrace_event_file *file = data; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8364a421b4df..d3a91e40a659 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -183,7 +183,7 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield) #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL -static __kprobes void +static void update_bitfield_fetch_param(struct bitfield_fetch_param *data) { /* @@ -196,7 +196,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param *data) update_symbol_cache(data->orig.data); } -static __kprobes void +static void free_bitfield_fetch_param(struct bitfield_fetch_param *data) { /* diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c082a7441345..991e3b7c4edb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -108,8 +108,8 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) * Uprobes-specific fetch functions */ #define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ { \ *(type *)dest = (type)get_user_stack_nth(regs, \ ((unsigned long)offset)); \ @@ -120,8 +120,8 @@ DEFINE_BASIC_FETCH_FUNCS(stack) #define fetch_stack_string_size NULL #define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ - void *addr, void *dest) \ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ + void *addr, void *dest) \ { \ type retval; \ void __user *vaddr = (void __force __user *) addr; \ @@ -136,8 +136,8 @@ DEFINE_BASIC_FETCH_FUNCS(memory) * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. 
*/ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) { long ret; u32 rloc = *(u32 *)dest; @@ -158,8 +158,8 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, } } -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) { int len; void __user *vaddr = (void __force __user *) addr; @@ -184,8 +184,8 @@ static unsigned long translate_user_vaddr(void *file_offset) } #define DEFINE_FETCH_file_offset(type) \ -static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ +static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \ + void *offset, void *dest)\ { \ void *vaddr = (void *)translate_user_vaddr(offset); \ \ -- cgit v1.2.3 From 820aede0209a51549e8a014c8030e29229920e4e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:21 +0900 Subject: kprobes: Use NOKPROBE_SYMBOL macro instead of __kprobes Use NOKPROBE_SYMBOL macro to protect functions from kprobes instead of __kprobes annotation. Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Link: http://lkml.kernel.org/r/20140417081821.26341.40362.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 67 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4db2cc616f50..a21b4e67fd97 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -301,7 +301,7 @@ static inline void reset_kprobe_instance(void) * OR * - with preemption disabled - from arch/xxx/kernel/kprobes.c */ -struct kprobe __kprobes *get_kprobe(void *addr) +struct kprobe *get_kprobe(void *addr) { struct hlist_head *head; struct kprobe *p; @@ -314,8 +314,9 @@ struct kprobe __kprobes *get_kprobe(void *addr) return NULL; } +NOKPROBE_SYMBOL(get_kprobe); -static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); +static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); /* Return true if the kprobe is an aggregator */ static inline int kprobe_aggrprobe(struct kprobe *p) @@ -347,7 +348,7 @@ static bool kprobes_allow_optimization; * Call all pre_handler on the list, but ignores its return value. * This must be called from arch-dep optimized caller. 
*/ -void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) +void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -359,6 +360,7 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) reset_kprobe_instance(); } } +NOKPROBE_SYMBOL(opt_pre_handler); /* Free optimized instructions and optimized_kprobe */ static void free_aggr_kprobe(struct kprobe *p) @@ -995,7 +997,7 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt) * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list */ -static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -1009,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } +NOKPROBE_SYMBOL(aggr_pre_handler); -static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) +static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) { struct kprobe *kp; @@ -1023,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, } } } +NOKPROBE_SYMBOL(aggr_post_handler); -static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, - int trapnr) +static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, + int trapnr) { struct kprobe *cur = __this_cpu_read(kprobe_instance); @@ -1039,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, } return 0; } +NOKPROBE_SYMBOL(aggr_fault_handler); -static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) +static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *cur = __this_cpu_read(kprobe_instance); int ret = 0; @@ -1052,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) reset_kprobe_instance(); return ret; } +NOKPROBE_SYMBOL(aggr_break_handler); /* Walks the list and increments nmissed count for multiprobe case */ -void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) +void kprobes_inc_nmissed_count(struct kprobe *p) { struct kprobe *kp; if (!kprobe_aggrprobe(p)) { @@ -1065,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) } return; } +NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); -void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, - struct hlist_head *head) +void recycle_rp_inst(struct kretprobe_instance *ri, + struct hlist_head *head) { struct kretprobe *rp = ri->rp; @@ -1082,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, /* Unregistering */ hlist_add_head(&ri->hlist, head); } +NOKPROBE_SYMBOL(recycle_rp_inst); -void __kprobes kretprobe_hash_lock(struct task_struct *tsk, +void kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) __acquires(hlist_lock) { @@ -1094,17 +1102,19 @@ __acquires(hlist_lock) hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_lock_irqsave(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_hash_lock); -static void __kprobes kretprobe_table_lock(unsigned long hash, - unsigned long *flags) +static void kretprobe_table_lock(unsigned long hash, + unsigned long *flags) __acquires(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_lock_irqsave(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_table_lock); 
-void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, - unsigned long *flags) +void kretprobe_hash_unlock(struct task_struct *tsk, + unsigned long *flags) __releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); @@ -1113,14 +1123,16 @@ __releases(hlist_lock) hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_unlock_irqrestore(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_hash_unlock); -static void __kprobes kretprobe_table_unlock(unsigned long hash, - unsigned long *flags) +static void kretprobe_table_unlock(unsigned long hash, + unsigned long *flags) __releases(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_unlock_irqrestore(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_table_unlock); /* * This function is called from finish_task_switch when task tk becomes dead, @@ -1128,7 +1140,7 @@ __releases(hlist_lock) * with this task. These left over instances represent probed functions * that have been called but will never return. */ -void __kprobes kprobe_flush_task(struct task_struct *tk) +void kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head, empty_rp; @@ -1153,6 +1165,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) kfree(ri); } } +NOKPROBE_SYMBOL(kprobe_flush_task); static inline void free_rp_inst(struct kretprobe *rp) { @@ -1165,7 +1178,7 @@ static inline void free_rp_inst(struct kretprobe *rp) } } -static void __kprobes cleanup_rp_inst(struct kretprobe *rp) +static void cleanup_rp_inst(struct kretprobe *rp) { unsigned long flags, hash; struct kretprobe_instance *ri; @@ -1184,6 +1197,7 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) } free_rp_inst(rp); } +NOKPROBE_SYMBOL(cleanup_rp_inst); /* * Add the new probe to ap->list. Fail if this is the @@ -1758,8 +1772,7 @@ EXPORT_SYMBOL_GPL(unregister_jprobes); * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. */ -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); unsigned long hash, flags = 0; @@ -1797,6 +1810,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, } return 0; } +NOKPROBE_SYMBOL(pre_handler_kretprobe); int register_kretprobe(struct kretprobe *rp) { @@ -1920,11 +1934,11 @@ void unregister_kretprobes(struct kretprobe **rps, int num) } EXPORT_SYMBOL_GPL(unregister_kretprobes); -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { return 0; } +NOKPROBE_SYMBOL(pre_handler_kretprobe); #endif /* CONFIG_KRETPROBES */ @@ -2002,12 +2016,13 @@ out: } EXPORT_SYMBOL_GPL(enable_kprobe); -void __kprobes dump_kprobe(struct kprobe *kp) +void dump_kprobe(struct kprobe *kp) { printk(KERN_WARNING "Dumping kprobe:\n"); printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", kp->symbol_name, kp->addr, kp->offset); } +NOKPROBE_SYMBOL(dump_kprobe); /* * Lookup and populate the kprobe_blacklist. -- cgit v1.2.3 From 3da0f18007e5b87b573cf6ae8c445d59e757d274 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:28 +0900 Subject: kprobes, ftrace: Use NOKPROBE_SYMBOL macro in ftrace Use NOKPROBE_SYMBOL macro to protect functions from kprobes instead of __kprobes annotation in ftrace. 
This applies nokprobe_inline annotation for some cases, because NOKPROBE_SYMBOL() will inhibit inlining by referring the symbol address. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140417081828.26341.55152.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 5 ++-- kernel/trace/trace_kprobe.c | 66 ++++++++++++++++++++++++----------------- kernel/trace/trace_probe.c | 61 ++++++++++++++++++++----------------- kernel/trace/trace_probe.h | 15 +++++----- 4 files changed, 82 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index c894614de14d..5d12bb407b44 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -248,8 +248,8 @@ void perf_trace_del(struct perf_event *p_event, int flags) tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } -__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) +void *perf_trace_buf_prepare(int size, unsigned short type, + struct pt_regs *regs, int *rctxp) { struct trace_entry *entry; unsigned long flags; @@ -281,6 +281,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); +NOKPROBE_SYMBOL(perf_trace_buf_prepare); #ifdef CONFIG_FUNCTION_TRACER static void diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aa5f0bfcdf7b..242e4ec97d94 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -40,27 +40,27 @@ struct trace_kprobe { (sizeof(struct probe_arg) * (n))) -static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; } -static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) +static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk) { return tk->symbol ? 
tk->symbol : "unknown"; } -static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) +static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk) { return tk->rp.kp.offset; } -static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) { return !!(kprobe_gone(&tk->rp.kp)); } -static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, +static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, struct module *mod) { int len = strlen(mod->name); @@ -68,7 +68,7 @@ static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(mod->name, name, len) == 0 && name[len] == ':'; } -static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) { return !!strchr(trace_kprobe_symbol(tk), ':'); } @@ -132,19 +132,21 @@ struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) * Kprobes-specific fetch functions */ #define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ (unsigned int)((unsigned long)offset)); \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); + DEFINE_BASIC_FETCH_FUNCS(stack) /* No string on the stack entry */ #define fetch_stack_string NULL #define fetch_stack_string_size NULL #define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ void *addr, void *dest) \ { \ type retval; \ @@ -152,14 +154,16 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ else \ *(type *)dest = retval; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); + DEFINE_BASIC_FETCH_FUNCS(memory) /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. 
*/ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) { long ret; int maxlen = get_rloc_len(*(u32 *)dest); @@ -193,10 +197,11 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, get_rloc_offs(*(u32 *)dest)); } } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); /* Return the length of string -- including null terminal byte */ -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) { mm_segment_t old_fs; int ret, len = 0; @@ -219,17 +224,19 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, else *(u32 *)dest = len; } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size)); #define DEFINE_FETCH_symbol(type) \ -__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ - void *data, void *dest) \ +void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\ { \ struct symbol_cache *sc = data; \ if (sc->addr) \ fetch_memory_##type(regs, (void *)sc->addr, dest); \ else \ *(type *)dest = 0; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); + DEFINE_BASIC_FETCH_FUNCS(symbol) DEFINE_FETCH_symbol(string) DEFINE_FETCH_symbol(string_size) @@ -907,7 +914,7 @@ static const struct file_operations kprobe_profile_ops = { }; /* Kprobe handler */ -static __kprobes void +static nokprobe_inline void __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) { @@ -943,7 +950,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, entry, irq_flags, pc, regs); } -static __kprobes void +static void kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct event_file_link *link; @@ -951,9 +958,10 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) list_for_each_entry_rcu(link, &tk->tp.files, list) __kprobe_trace_func(tk, regs, link->file); } +NOKPROBE_SYMBOL(kprobe_trace_func); /* Kretprobe handler */ -static __kprobes void +static nokprobe_inline void __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) @@ -991,7 +999,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry, irq_flags, pc, regs); } -static __kprobes void +static void kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -1000,6 +1008,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, list_for_each_entry_rcu(link, &tk->tp.files, list) __kretprobe_trace_func(tk, ri, regs, link->file); } +NOKPROBE_SYMBOL(kretprobe_trace_func); /* Event entry printers */ static enum print_line_t @@ -1131,7 +1140,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes void +static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; @@ -1158,9 +1167,10 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } +NOKPROBE_SYMBOL(kprobe_perf_func); /* Kretprobe profile handler */ 
-static __kprobes void +static void kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -1188,6 +1198,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } +NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ /* @@ -1223,8 +1234,7 @@ static int kprobe_register(struct ftrace_event_call *event, return 0; } -static __kprobes -int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); @@ -1238,9 +1248,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) #endif return 0; /* We don't tweek kernel, so just return 0 */ } +NOKPROBE_SYMBOL(kprobe_dispatcher); -static __kprobes -int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +static int +kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); @@ -1254,6 +1265,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) #endif return 0; /* We don't tweek kernel, so just return 0 */ } +NOKPROBE_SYMBOL(kretprobe_dispatcher); static struct trace_event_functions kretprobe_funcs = { .trace = print_kretprobe_event diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d3a91e40a659..d4b9fc22cd27 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -37,13 +37,13 @@ const char *reserved_field_names[] = { /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ -__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, \ - void *data, void *ent) \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ + void *data, void *ent) \ { \ return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ } \ -const char PRINT_TYPE_FMT_NAME(type)[] = fmt; +const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") @@ -55,9 +55,8 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") /* Print type function for string type */ -__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, - const char *name, - void *data, void *ent) +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, + void *data, void *ent) { int len = *(u32 *)data >> 16; @@ -67,6 +66,7 @@ __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, return trace_seq_printf(s, " %s=\"%s\"", name, (const char *)get_loc_data(data, ent)); } +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; @@ -81,23 +81,24 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; /* Data fetch function templates */ #define DEFINE_FETCH_reg(type) \ -__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ +void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_register(regs, \ (unsigned int)((unsigned long)offset)); \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); DEFINE_BASIC_FETCH_FUNCS(reg) /* No string on the register */ #define 
fetch_reg_string NULL #define fetch_reg_string_size NULL #define DEFINE_FETCH_retval(type) \ -__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ - void *dummy, void *dest) \ +void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ + void *dummy, void *dest) \ { \ *(type *)dest = (type)regs_return_value(regs); \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); DEFINE_BASIC_FETCH_FUNCS(retval) /* No string on the retval */ #define fetch_retval_string NULL @@ -112,8 +113,8 @@ struct deref_fetch_param { }; #define DEFINE_FETCH_deref(type) \ -__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ - void *data, void *dest) \ +void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ + void *data, void *dest) \ { \ struct deref_fetch_param *dprm = data; \ unsigned long addr; \ @@ -123,12 +124,13 @@ __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ dprm->fetch(regs, (void *)addr, dest); \ } else \ *(type *)dest = 0; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); DEFINE_BASIC_FETCH_FUNCS(deref) DEFINE_FETCH_deref(string) -__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, - void *data, void *dest) +void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, + void *data, void *dest) { struct deref_fetch_param *dprm = data; unsigned long addr; @@ -140,16 +142,18 @@ __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, } else *(string_size *)dest = 0; } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); -static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) +static void update_deref_fetch_param(struct deref_fetch_param *data) { if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) update_deref_fetch_param(data->orig.data); else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) update_symbol_cache(data->orig.data); } +NOKPROBE_SYMBOL(update_deref_fetch_param); -static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) +static void free_deref_fetch_param(struct deref_fetch_param *data) { if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) free_deref_fetch_param(data->orig.data); @@ -157,6 +161,7 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) free_symbol_cache(data->orig.data); kfree(data); } +NOKPROBE_SYMBOL(free_deref_fetch_param); /* Bitfield fetch function */ struct bitfield_fetch_param { @@ -166,8 +171,8 @@ struct bitfield_fetch_param { }; #define DEFINE_FETCH_bitfield(type) \ -__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ - void *data, void *dest) \ +void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ + void *data, void *dest) \ { \ struct bitfield_fetch_param *bprm = data; \ type buf = 0; \ @@ -177,8 +182,8 @@ __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ buf >>= bprm->low_shift; \ } \ *(type *)dest = buf; \ -} - +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); DEFINE_BASIC_FETCH_FUNCS(bitfield) #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL @@ -255,17 +260,17 @@ fail: } /* Special function : only accept unsigned long */ -static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, - void *dummy, void *dest) +static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) { *(unsigned long *)dest = kernel_stack_pointer(regs); } +NOKPROBE_SYMBOL(fetch_kernel_stack_address); -static __kprobes void fetch_user_stack_address(struct pt_regs *regs, - void *dummy, void *dest) +static void 
fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) { *(unsigned long *)dest = user_stack_pointer(regs); } +NOKPROBE_SYMBOL(fetch_user_stack_address); static fetch_func_t get_fetch_size_function(const struct fetch_type *type, fetch_func_t orig_fn, diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index fb1ab5dfbd42..4f815fbce16d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -81,13 +81,13 @@ */ #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) -static inline void *get_rloc_data(u32 *dl) +static nokprobe_inline void *get_rloc_data(u32 *dl) { return (u8 *)dl + get_rloc_offs(*dl); } /* For data_loc conversion */ -static inline void *get_loc_data(u32 *dl, void *ent) +static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) { return (u8 *)ent + get_rloc_offs(*dl); } @@ -136,9 +136,8 @@ typedef u32 string_size; /* Printing in basic type function template */ #define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ -__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, \ - void *data, void *ent); \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ + void *data, void *ent); \ extern const char PRINT_TYPE_FMT_NAME(type)[] DECLARE_BASIC_PRINT_TYPE_FUNC(u8); @@ -303,7 +302,7 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) return !!(tp->flags & TP_FLAG_REGISTERED); } -static inline __kprobes void call_fetch(struct fetch_param *fprm, +static nokprobe_inline void call_fetch(struct fetch_param *fprm, struct pt_regs *regs, void *dest) { return fprm->fn(regs, fprm->data, dest); @@ -351,7 +350,7 @@ extern ssize_t traceprobe_probes_write(struct file *file, extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); /* Sum up total data length for dynamic arraies (strings) */ -static inline __kprobes int +static nokprobe_inline int __get_data_size(struct trace_probe *tp, struct pt_regs *regs) { int i, ret = 0; @@ -367,7 +366,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) } /* Store the value of each argument */ -static inline __kprobes void +static nokprobe_inline void store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, u8 *data, int maxlen) { -- cgit v1.2.3 From b40a2cb6e0c218c6d0f12c7f7e683e75972973c1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:35 +0900 Subject: kprobes, notifier: Use NOKPROBE_SYMBOL macro in notifier Use NOKPROBE_SYMBOL macro to protect functions from kprobes instead of __kprobes annotation in notifier. Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt Reviewed-by: Josh Triplett Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/20140417081835.26341.56128.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/notifier.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index db4c8b08a50c..4803da6eab62 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl, * @returns: notifier_call_chain returns the value returned by the * last notifier function called. 
*/ -static int __kprobes notifier_call_chain(struct notifier_block **nl, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +static int notifier_call_chain(struct notifier_block **nl, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; @@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, } return ret; } +NOKPROBE_SYMBOL(notifier_call_chain); /* * Atomic notifier chain routines. Registration and unregistration @@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret; @@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, return ret; } EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); +NOKPROBE_SYMBOL(__atomic_notifier_call_chain); -int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) +int atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) { return __atomic_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); +NOKPROBE_SYMBOL(atomic_notifier_call_chain); /* * Blocking notifier chain routines. All access to the chain is @@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notrace __kprobes notify_die(enum die_val val, const char *str, +int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { @@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str, }; return atomic_notifier_call_chain(&die_chain, val, &args); } +NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { -- cgit v1.2.3 From edafe3a56dbd42c499245b222e9f7e80099356e5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:42 +0900 Subject: kprobes, sched: Use NOKPROBE_SYMBOL macro in sched Use NOKPROBE_SYMBOL macro to protect functions from kprobes instead of __kprobes annotation in sched/core.c. 
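Put side by side, the conversion pattern applied throughout this series looks roughly like the sketch below; foo() and foo_helper() are placeholders rather than symbols from the patch:

#include <linux/kprobes.h>

/* old style: __kprobes moves the whole function into the .kprobes.text
 * section, which the blacklist check covers wholesale */
static void __kprobes foo_old(void)
{
}

/* new style: the function stays in normal .text and only its address is
 * recorded in the _kprobe_blacklist section */
static void foo(void)
{
}
NOKPROBE_SYMBOL(foo);

/* small helpers that must remain inlinable use nokprobe_inline, since
 * NOKPROBE_SYMBOL() needs a real, out-of-line symbol address */
static nokprobe_inline int foo_helper(int x)
{
	return x + 1;
}

The addresses recorded by NOKPROBE_SYMBOL() are exactly what populate_kprobe_blacklist() walks between __start_kprobe_blacklist[] and __stop_kprobe_blacklist[] at boot.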
Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/20140417081842.26341.83959.stgit@ltc230.yrl.intra.hitachi.co.jp --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6863631e8cd0..00781cc38047 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2480,7 +2480,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes preempt_count_add(int val) +void preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2506,8 +2506,9 @@ void __kprobes preempt_count_add(int val) } } EXPORT_SYMBOL(preempt_count_add); +NOKPROBE_SYMBOL(preempt_count_add); -void __kprobes preempt_count_sub(int val) +void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2528,6 +2529,7 @@ void __kprobes preempt_count_sub(int val) __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); +NOKPROBE_SYMBOL(preempt_count_sub); #endif -- cgit v1.2.3 From 637247403abff8c963bc7be8002b3f49ea604563 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:49 +0900 Subject: kprobes: Show blacklist entries via debugfs Show blacklist entries (function names with the address range) via /sys/kernel/debug/kprobes/blacklist. Note that at this point the blacklist supports only in vmlinux, not module. So the list is fixed and not updated. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Link: http://lkml.kernel.org/r/20140417081849.26341.11609.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a21b4e67fd97..3214289df5a7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2249,6 +2249,46 @@ static const struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; +/* kprobes/blacklist -- shows which functions can not be probed */ +static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) +{ + return seq_list_start(&kprobe_blacklist, *pos); +} + +static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &kprobe_blacklist, pos); +} + +static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) +{ + struct kprobe_blacklist_entry *ent = + list_entry(v, struct kprobe_blacklist_entry, list); + + seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr, + (void *)ent->end_addr, (void *)ent->start_addr); + return 0; +} + +static const struct seq_operations kprobe_blacklist_seq_ops = { + .start = kprobe_blacklist_seq_start, + .next = kprobe_blacklist_seq_next, + .stop = kprobe_seq_stop, /* Reuse void function */ + .show = kprobe_blacklist_seq_show, +}; + +static int kprobe_blacklist_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &kprobe_blacklist_seq_ops); +} + +static const struct file_operations debugfs_kprobe_blacklist_ops = { + .open = kprobe_blacklist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void arm_all_kprobes(void) { struct hlist_head *head; @@ -2372,19 +2412,24 @@ static int __init debugfs_kprobe_init(void) file = 
debugfs_create_file("list", 0444, dir, NULL, &debugfs_kprobes_operations); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } + if (!file) + goto error; file = debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } + if (!file) + goto error; + + file = debugfs_create_file("blacklist", 0444, dir, NULL, + &debugfs_kprobe_blacklist_ops); + if (!file) + goto error; return 0; + +error: + debugfs_remove(dir); + return -ENOMEM; } late_initcall(debugfs_kprobe_init); -- cgit v1.2.3 From 7eea4fce0246fe3a15ad7f3bb8d0a56d1f9440e6 Mon Sep 17 00:00:00 2001 From: Jiaxing Wang Date: Sun, 20 Apr 2014 23:10:43 +0800 Subject: tracing/stack_trace: Skip 4 instead of 3 when using ftrace_ops_list_func When using ftrace_ops_list_func, we should skip 4 instead of 3, to avoid ftrace_call+0x5/0xb appearing in the stack trace: Depth Size Location (110 entries) ----- ---- -------- 0) 2956 0 update_curr+0xe/0x1e0 1) 2956 68 ftrace_call+0x5/0xb 2) 2888 92 enqueue_entity+0x53/0xe80 3) 2796 80 enqueue_task_fair+0x47/0x7e0 4) 2716 28 enqueue_task+0x45/0x70 5) 2688 12 activate_task+0x22/0x30 Add a function using_ftrace_ops_list_func() to test for this while keeping ftrace_ops_list_func to remain static. Link: http://lkml.kernel.org/p/1398006644-5935-2-git-send-email-wangjiaxing@insigma.com.cn Signed-off-by: Jiaxing Wang Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 +++++ kernel/trace/trace.h | 1 + kernel/trace/trace_stack.c | 8 ++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 846888ea2ba4..3cfeb66bda67 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -313,6 +313,11 @@ static void update_ftrace_function(void) ftrace_trace_function = func; } +int using_ftrace_ops_list_func(void) +{ + return ftrace_trace_function == ftrace_ops_list_func; +} + static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { ops->next = *list; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5d2f07d6746c..8624b5041466 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -830,6 +830,7 @@ void ftrace_destroy_function_files(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); +int using_ftrace_ops_list_func(void); #else static inline int ftrace_trace_task(struct task_struct *task) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 21b320e5d163..5aa9a5b9b6e2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -85,8 +85,12 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; - max_stack_trace.nr_entries = 0; - max_stack_trace.skip = 3; + max_stack_trace.nr_entries = 0; + + if (using_ftrace_ops_list_func()) + max_stack_trace.skip = 4; + else + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); -- cgit v1.2.3 From 8d1b065d47ff1424b1aef7a6aa605b467694c120 Mon Sep 17 00:00:00 2001 From: Jiaxing Wang Date: Sun, 20 Apr 2014 23:10:44 +0800 Subject: tracing: Fix documentation of ftrace_set_global_{filter,notrace}() The functions ftrace_set_global_filter() and ftrace_set_global_notrace() still have their old names in the kernel doc (ftrace_set_filter and ftrace_set_notrace respectively). Replace these with the real names. 
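A short, hypothetical caller for the two helpers whose kernel-doc is corrected here; the module context and the filter patterns are assumptions for illustration, only the function names and signatures come from the hunks that follow.

#include <linux/ftrace.h>
#include <linux/string.h>

static void example_global_filtering(void)
{
        unsigned char sched_pat[] = "sched_*";
        unsigned char noisy_pat[] = "*console*";

        /* reset=1 drops any previously installed filters first */
        ftrace_set_global_filter(sched_pat, strlen((char *)sched_pat), 1);
        /* keep the global tracers away from anything matching "*console*" */
        ftrace_set_global_notrace(noisy_pat, strlen((char *)noisy_pat), 1);
}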
Link: http://lkml.kernel.org/p/1398006644-5935-3-git-send-email-wangjiaxing@insigma.com.cn Signed-off-by: Jiaxing Wang Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3cfeb66bda67..34b098bfded4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3542,8 +3542,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, } EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** - * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with + * ftrace_set_global_filter - set a function to filter on with global tracers * @buf - the string that holds the function filter text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -3558,8 +3557,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset) EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** - * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with + * ftrace_set_global_notrace - set a function to not trace with global tracers * @buf - the string that holds the function notrace text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. -- cgit v1.2.3 From 842b597ee0a7e1aa5a3148164ffdba00ec17f614 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 25 Apr 2014 18:28:02 -0400 Subject: cgroup: implement cgroup.populated for the default hierarchy cgroup users often need a way to determine when a cgroup's subhierarchy becomes empty so that it can be cleaned up. cgroup currently provides release_agent for it; unfortunately, this mechanism is riddled with issues. * It delivers events by forking and execing a userland binary specified as the release_agent. This is a long deprecated method of notification delivery. It's extremely heavy, slow and cumbersome to integrate with larger infrastructure. * There is a single monitoring point at the root. There's no way to delegate management of a subtree. * The event isn't recursive. It triggers when a cgroup doesn't have any tasks or child cgroups. Events for internal nodes trigger only after all children are removed. This again makes it impossible to delegate management of a subtree. * Events are filtered from the kernel side. The "notify_on_release" file is used to subscribe to or suppress the release event. This is unnecessarily complicated and probably done this way because event delivery itself was expensive. This patch implements interface file "cgroup.populated" which can be used to monitor whether the cgroup's subhierarchy has tasks in it or not. Its value is 0 if there is no task in the cgroup and its descendants and 1 otherwise; a kernfs_notify() notification is triggered when the value changes, which can be monitored through poll and [di]notify. This is a lot lighter and simpler and trivially allows delegating management of a subhierarchy - a subhierarchy monitor can block further propagation simply by putting itself or another process in the root of the subhierarchy and monitoring events that it's interested in from there without interfering with monitoring higher in the tree. v2: Patch description updated as per Serge. v3: "cgroup.subtree_populated" renamed to "cgroup.populated". 
The subtree_ prefix was a bit confusing because "cgroup.subtree_control" uses it to denote the tree rooted at the cgroup sans the cgroup itself while the populated state includes the cgroup itself. Signed-off-by: Tejun Heo Acked-by: Serge Hallyn Acked-by: Li Zefan Cc: Lennart Poettering --- kernel/cgroup.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 809dd903ceb8..0f986f7afee4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -411,6 +411,43 @@ static struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ +/** + * cgroup_update_populated - updated populated count of a cgroup + * @cgrp: the target cgroup + * @populated: inc or dec populated count + * + * @cgrp is either getting the first task (css_set) or losing the last. + * Update @cgrp->populated_cnt accordingly. The count is propagated + * towards root so that a given cgroup's populated_cnt is zero iff the + * cgroup and all its descendants are empty. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. + */ +static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +{ + lockdep_assert_held(&css_set_rwsem); + + do { + bool trigger; + + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; + + if (!trigger) + break; + + if (cgrp->populated_kn) + kernfs_notify(cgrp->populated_kn); + cgrp = cgrp->parent; + } while (cgrp); +} + /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. 
This hash doesn't (currently) take into @@ -456,10 +493,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) list_del(&link->cgrp_link); /* @cgrp can't go away while we're holding css_set_rwsem */ - if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (list_empty(&cgrp->cset_links)) { + cgroup_update_populated(cgrp, false); + if (notify_on_release(cgrp)) { + if (taskexit) + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } } kfree(link); @@ -668,7 +708,11 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; + + if (list_empty(&cgrp->cset_links)) + cgroup_update_populated(cgrp, true); list_move(&link->cset_link, &cgrp->cset_links); + /* * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation @@ -2643,6 +2687,12 @@ err_undo_css: goto out_unlock; } +static int cgroup_populated_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + return 0; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -2809,6 +2859,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) if (cft->seq_show == cgroup_subtree_control_show) cgrp->control_kn = kn; + else if (cft->seq_show == cgroup_populated_show) + cgrp->populated_kn = kn; return 0; } @@ -3918,6 +3970,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_subtree_control_show, .write_string = cgroup_subtree_control_write, }, + { + .name = "cgroup.populated", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_populated_show, + }, /* * Historical crazy stuff. These don't have "cgroup." prefix and -- cgit v1.2.3 From 2f0edc04e702fc07d29621f9e361b9120a7594d0 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: clean up obsolete comment for parse_cgroupfs_options() 1d5be6b287c8efc87 ("cgroup: move module ref handling into rebind_subsystems()") means that parse_cgroupfs_options() no longer takes refcounts on subsystems. And the unified hierarchy means parse_cgroupfs_options() no longer needs to be called with cgroup_mutex held to protect the cgroup_subsys[] array. So this patch removes the BUG_ON() and the comment. As the comment doesn't contain useful information afterwards, the whole comment is removed. Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0f986f7afee4..fb848be0ea7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1221,12 +1221,6 @@ struct cgroup_sb_opts { bool none; }; -/* - * Convert a hierarchy specifier into a bitmask of subsystems and - * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] - * array. This function takes refcounts on subsystems to be used, unless it - * returns error, in which case no refcounts are taken. 
- */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; @@ -1235,8 +1229,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) struct cgroup_subsys *ss; int i; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - #ifdef CONFIG_CPUSETS mask = ~(1UL << cpuset_cgrp_id); #endif -- cgit v1.2.3 From f8719ccf7bc0858384c7e93d8c57fe69ae8c9eac Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: remove orphaned cgroup_pidlist_seq_operations 6612f05b88fa309c9 ("cgroup: unify pidlist and other file handling") has removed the only user of cgroup_pidlist_seq_operations: cgroup_pidlist_open(). This patch removes it. Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fb848be0ea7b..3849d3d2dfe1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3880,17 +3880,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) return seq_printf(s, "%d\n", *(int *)v); } -/* - * seq_operations functions for iterating on pidlists through seq_file - - * independent of whether it's tasks or procs - */ -static const struct seq_operations cgroup_pidlist_seq_operations = { - .start = cgroup_pidlist_start, - .stop = cgroup_pidlist_stop, - .next = cgroup_pidlist_next, - .show = cgroup_pidlist_show, -}; - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { -- cgit v1.2.3 From a2a1f9eaf945c46b5b2bc0e439cba68888e3d540 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: replace pr_warning with preferred pr_warn As suggested by scripts/checkpatch.pl, substitute all pr_warning() with pr_warn(). No functional change. Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3849d3d2dfe1..cb453e9954c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1126,9 +1126,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue. 
*/ if (cgrp_dfl_root_visible) { - pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + pr_warn("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warn("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); } } @@ -1323,7 +1323,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + pr_warn("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || opts->cpuset_clone_children || opts->release_agent || @@ -1387,8 +1387,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) goto out_unlock; if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); + pr_warn("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); added_mask = opts.subsys_mask & ~root->subsys_mask; removed_mask = root->subsys_mask & ~opts.subsys_mask; @@ -1669,7 +1669,7 @@ retry: ret = -EINVAL; goto out_unlock; } else { - pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + pr_warn("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } @@ -4168,10 +4168,10 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); + pr_warn("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + pr_warn("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); ss->warned_broken_hierarchy = true; } -- cgit v1.2.3 From ed3d261b53f51c9505822d757d1800c79fb68788 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: Use more current logging style Use pr_fmt and remove embedded prefixes. Realign modified multi-line statements to open parenthesis. Convert embedded function name to "%s: ", __func__ Signed-off-by: Joe Perches Signed-off-by: Tejun Heo --- kernel/cgroup.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb453e9954c1..3873267c9ee3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -26,6 +26,8 @@ * distribution for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -1126,9 +1128,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue. 
*/ if (cgrp_dfl_root_visible) { - pr_warn("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", + pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", ret, ss_mask); - pr_warn("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } } @@ -1323,12 +1325,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || opts->cpuset_clone_children || opts->release_agent || opts->name) { - pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); + pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } } else { @@ -1374,7 +1376,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) unsigned long added_mask, removed_mask; if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("cgroup: sane_behavior: remount is not allowed\n"); + pr_err("sane_behavior: remount is not allowed\n"); return -EINVAL; } @@ -1387,7 +1389,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) goto out_unlock; if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); added_mask = opts.subsys_mask & ~root->subsys_mask; @@ -1396,7 +1398,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { - pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + pr_err("option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; @@ -1665,11 +1667,11 @@ retry: if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + pr_err("sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; goto out_unlock; } else { - pr_warn("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); } } @@ -2889,8 +2891,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], if (is_add) { ret = cgroup_add_file(cgrp, cft); if (ret) { - pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", - cft->name, ret); + pr_warn("%s: failed to add %s, err=%d\n", + __func__, cft->name, ret); return ret; } } else { @@ -4168,10 +4170,10 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { - pr_warn("cgroup: %s (%d) 
created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) - pr_warn("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); ss->warned_broken_hierarchy = true; } -- cgit v1.2.3 From 51e158c12aca3c9ac63988611a97c05109b14dc9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Apr 2014 11:34:33 +0930 Subject: param: hand arguments after -- straight to init The kernel passes any args it doesn't need through to init, except it assumes anything containing '.' belongs to the kernel (for a module). This change means all users can clearly distinguish which arguments are for init. For example, the kernel uses debug ("dee-bug") to mean log everything to the console, where systemd uses the debug from the Scandinavian "day-boog" meaning "fail to boot". If a future versions uses argv[] instead of reading /proc/cmdline, this confusion will be avoided. eg: test 'FOO="this is --foo"' -- 'systemd.debug="true true true"' Gives: argv[0] = '/debug-init' argv[1] = 'test' argv[2] = 'systemd.debug=true true true' envp[0] = 'HOME=/' envp[1] = 'TERM=linux' envp[2] = 'FOO=this is --foo' Signed-off-by: Rusty Russell --- kernel/module.c | 12 +++++++++--- kernel/params.c | 25 ++++++++++++++----------- 2 files changed, 23 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 11869408f79b..66e4e0d260a9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3193,6 +3193,7 @@ static int load_module(struct load_info *info, const char __user *uargs, { struct module *mod; long err; + char *after_dashes; err = module_sig_check(info); if (err) @@ -3277,10 +3278,15 @@ static int load_module(struct load_info *info, const char __user *uargs, goto ddebug_cleanup; /* Module is ready to execute: parsing args may do that. */ - err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, unknown_module_param_cb); - if (err < 0) + after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, + -32768, 32767, unknown_module_param_cb); + if (IS_ERR(after_dashes)) { + err = PTR_ERR(after_dashes); goto bug_cleanup; + } else if (after_dashes) { + pr_warn("%s: parameters '%s' after `--' ignored\n", + mod->name, after_dashes); + } /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); diff --git a/kernel/params.c b/kernel/params.c index b00142e7f3ba..1e52ca233fd9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val) } /* Args looks like "foo=bar,bar2 baz=fuz wiz". 
*/ -int parse_args(const char *doing, - char *args, - const struct kernel_param *params, - unsigned num, - s16 min_level, - s16 max_level, - int (*unknown)(char *param, char *val, const char *doing)) +char *parse_args(const char *doing, + char *args, + const struct kernel_param *params, + unsigned num, + s16 min_level, + s16 max_level, + int (*unknown)(char *param, char *val, const char *doing)) { char *param, *val; @@ -198,6 +198,9 @@ int parse_args(const char *doing, int irq_was_disabled; args = next_arg(args, ¶m, &val); + /* Stop at -- */ + if (!val && strcmp(param, "--") == 0) + return args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, unknown); @@ -208,22 +211,22 @@ int parse_args(const char *doing, switch (ret) { case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); - return ret; + return ERR_PTR(ret); case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); - return ret; + return ERR_PTR(ret); case 0: break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); - return ret; + return ERR_PTR(ret); } } /* All parsed OK. */ - return 0; + return NULL; } /* Lazy bastard, eh? */ -- cgit v1.2.3 From f4874261049e3abdd481359d82cafa5068369ebd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Apr 2014 16:07:28 -0400 Subject: tracing: Break out of tracing_wait_pipe() before wait_pipe() is called When reading from trace_pipe, if tracing is off but nothing was read it should block. If something is read and tracing is off, then EOF is returned. If tracing is on and there's nothing to read, it will block. But because the check of whether tracing is off and something was read is done after the block on the pipe, it is hit or miss if the EOF is returned or not leading to inconsistent behavior. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cb41e98cc64b..e058c6091e45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4237,15 +4237,6 @@ static int tracing_wait_pipe(struct file *filp) return -EAGAIN; } - mutex_unlock(&iter->mutex); - - iter->trace->wait_pipe(iter); - - mutex_lock(&iter->mutex); - - if (signal_pending(current)) - return -EINTR; - /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never @@ -4257,6 +4248,15 @@ static int tracing_wait_pipe(struct file *filp) */ if (!tracing_is_on() && iter->pos) break; + + mutex_unlock(&iter->mutex); + + iter->trace->wait_pipe(iter); + + mutex_lock(&iter->mutex); + + if (signal_pending(current)) + return -EINTR; } return 1; -- cgit v1.2.3 From b1169cc69ba96b124df820904a6d3eb775491d7f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Apr 2014 17:54:37 -0400 Subject: tracing: Remove mock up poll wait function Now that the ring buffer has a built in way to wake up readers when there's data, using irq_work such that it is safe to do it in any context. But it was still using the old "poor man's" wait polling that checks every 1/10 of a second to see if it should wake up a waiter. This makes the latency for a wake up excruciatingly long. No need to do that anymore. Completely remove the different wait_poll types from the tracers and have them all use the default one now. 
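For reference, the make-shift wait being removed is the 100 msec polling loop below (reproduced from the trace.c hunk that follows); with it gone, every tracer falls through to the single wait_on_pipe() path, which sleeps until the ring buffer's irq_work wakes the reader instead of waking on a timer.

void poll_wait_pipe(struct trace_iterator *iter)
{
        set_current_state(TASK_INTERRUPTIBLE);
        /* sleep for 100 msecs, and try again. */
        schedule_timeout(HZ / 10);
}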
Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 29 ++++------------------------- kernel/trace/trace.h | 4 ---- kernel/trace/trace_functions.c | 1 - kernel/trace/trace_functions_graph.c | 1 - kernel/trace/trace_nop.c | 1 - kernel/trace/trace_sched_wakeup.c | 2 -- 6 files changed, 4 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e058c6091e45..4c392c8238bf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1085,7 +1085,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static void default_wait_pipe(struct trace_iterator *iter) +static void wait_on_pipe(struct trace_iterator *iter) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) @@ -1202,8 +1202,6 @@ int register_tracer(struct tracer *type) else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; - if (!type->wait_pipe) - type->wait_pipe = default_wait_pipe; ret = run_tracer_selftest(type); if (ret < 0) @@ -4207,25 +4205,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -/* - * This is a make-shift waitqueue. - * A tracer might use this callback on some rare cases: - * - * 1) the current tracer might hold the runqueue lock when it wakes up - * a reader, hence a deadlock (sched, function, and function graph tracers) - * 2) the function tracers, trace all functions, we don't want - * the overhead of calling wake_up and friends - * (and tracing them too) - * - * Anyway, this is really very primitive wakeup. - */ -void poll_wait_pipe(struct trace_iterator *iter) -{ - set_current_state(TASK_INTERRUPTIBLE); - /* sleep for 100 msecs, and try again. */ - schedule_timeout(HZ / 10); -} - /* Must be called with trace_types_lock mutex held. 
*/ static int tracing_wait_pipe(struct file *filp) { @@ -4251,7 +4230,7 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - iter->trace->wait_pipe(iter); + wait_on_pipe(iter); mutex_lock(&iter->mutex); @@ -5179,7 +5158,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - iter->trace->wait_pipe(iter); + wait_on_pipe(iter); mutex_lock(&trace_types_lock); if (signal_pending(current)) { size = -EINTR; @@ -5390,7 +5369,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, goto out; } mutex_unlock(&trace_types_lock); - iter->trace->wait_pipe(iter); + wait_on_pipe(iter); mutex_lock(&trace_types_lock); if (signal_pending(current)) { ret = -EINTR; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8624b5041466..3b3e09e61f33 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -338,7 +338,6 @@ struct tracer_flags { * @stop: called when tracing is paused (echo 0 > tracing_enabled) * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened - * @wait_pipe: override how the user waits for traces on trace_pipe * @close: called when the trace file is released * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe @@ -357,7 +356,6 @@ struct tracer { void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); - void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, @@ -566,8 +564,6 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void poll_wait_pipe(struct trace_iterator *iter); - void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 2d9482b8f26a..57f0ec962d2c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -252,7 +252,6 @@ static struct tracer function_trace __tracer_data = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .wait_pipe = poll_wait_pipe, .flags = &func_flags, .set_flag = func_set_flag, .allow_instances = true, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index deff11200261..b86dd4d8c6a6 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1505,7 +1505,6 @@ static struct tracer graph_trace __tracer_data = { .pipe_open = graph_trace_open, .close = graph_trace_close, .pipe_close = graph_trace_close, - .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 69a5cc94c01a..fcf0a9e48916 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly = .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, - .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 1573c03640d2..19bd8928ce94 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ 
b/kernel/trace/trace_sched_wakeup.c @@ -705,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .wait_pipe = poll_wait_pipe, .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, @@ -728,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .wait_pipe = poll_wait_pipe, .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, -- cgit v1.2.3 From ce5f36a58fd1d92cb945cf2568751d40e8598508 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 24 Apr 2014 13:26:01 +0200 Subject: uprobes/tracing: Make uprobe_perf_close() visible to uprobe_perf_open() Preparation. Move uprobe_perf_close() up before uprobe_perf_open() to avoid the forward declaration in the next patch and make it readable. Signed-off-by: Oleg Nesterov Acked-by: Steven Rostedt Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c082a7441345..51bd071eaca0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1009,54 +1009,54 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); } -static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) { bool done; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { - /* - * event->parent != NULL means copy_process(), we can avoid - * uprobe_apply(). current->mm must be probed and we can rely - * on dup_mmap() which preserves the already installed bp's. - * - * attr.enable_on_exec means that exec/mmap will install the - * breakpoints we need. - */ + list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - event->parent || event->attr.enable_on_exec || + (event->hw.tp_target->flags & PF_EXITING) || uprobe_filter_event(tu, event); - list_add(&event->hw.tp_list, &tu->filter.perf_events); } else { + tu->filter.nr_systemwide--; done = tu->filter.nr_systemwide; - tu->filter.nr_systemwide++; } write_unlock(&tu->filter.rwlock); if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } -static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { bool done; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { - list_del(&event->hw.tp_list); + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. 
+ */ done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + event->parent || event->attr.enable_on_exec || uprobe_filter_event(tu, event); + list_add(&event->hw.tp_list, &tu->filter.perf_events); } else { - tu->filter.nr_systemwide--; done = tu->filter.nr_systemwide; + tu->filter.nr_systemwide++; } write_unlock(&tu->filter.rwlock); if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); return 0; } -- cgit v1.2.3 From 927d687480ab7e43d73a003bab58803fc67717d9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 24 Apr 2014 13:33:31 +0200 Subject: uprobes/tracing: Fix uprobe_perf_open() on uprobe_apply() failure uprobe_perf_open()->uprobe_apply() can fail, but this error is wrongly ignored. Change uprobe_perf_open() to do uprobe_perf_close() and return the error code in this case. Change uprobe_perf_close() to propagate the error from uprobe_apply() as well, although it should not fail. Signed-off-by: Oleg Nesterov Acked-by: Steven Rostedt Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 51bd071eaca0..5a7f1a6b3b8b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1026,7 +1026,7 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) write_unlock(&tu->filter.rwlock); if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } @@ -1034,6 +1034,7 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { bool done; + int err; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { @@ -1055,10 +1056,13 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) } write_unlock(&tu->filter.rwlock); - if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); - - return 0; + err = 0; + if (!done) { + err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (err) + uprobe_perf_close(tu, event); + } + return err; } static bool uprobe_perf_filter(struct uprobe_consumer *uc, -- cgit v1.2.3 From 13f59c5e45be59665c11ddde19799b6295543b7d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Apr 2014 20:15:43 +0200 Subject: uprobes: Refuse to insert a probe into MAP_SHARED vma valid_vma() rejects the VM_SHARED vmas, but this still allows inserting a probe into a MAP_SHARED but not VM_MAYWRITE vma. Currently this is fine, such a mapping doesn't really differ from the private read-only mmap except mprotect(PROT_WRITE) won't work. However, get_user_pages(FOLL_WRITE | FOLL_FORCE) doesn't allow COW in this case, and it would be safer to follow the same conventions as mm even if currently this happens to work. After the recent cda540ace6a1 "mm: get_user_pages(write,force) refuse to COW in shared areas" only uprobes can insert an anon page into the shared file-backed area, let's stop this and change valid_vma() to check VM_MAYSHARE instead. 
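A userspace illustration (an assumption about how such a vma arises, not part of the patch): a MAP_SHARED mapping of a file opened read-only gets VM_MAYSHARE but neither VM_SHARED nor VM_MAYWRITE, which is exactly the case the reworked valid_vma() check now refuses.

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/bin/true", O_RDONLY);
        if (fd < 0)
                return 1;

        /* shared, executable, but not writable: VM_MAYSHARE without VM_MAYWRITE */
        void *p = mmap(NULL, 4096, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);

        /* placing a uprobe inside this mapping is refused after this change */
        if (p != MAP_FAILED)
                munmap(p, 4096);
        close(fd);
        return 0;
}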
Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1edc5e6fd03..7716c40f2c50 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -127,7 +127,7 @@ struct xol_area { */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { - vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; if (is_register) flags |= VM_WRITE; -- cgit v1.2.3 From fd06a54990e94c7f40ca21cf82b9c83106ccb94b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 1 May 2014 23:05:31 -0400 Subject: ftrace: Have function graph tracer use global_ops for filtering Commit 4104d326b670 "ftrace: Remove global function list and call function directly" cleaned up the global_ops filtering and made the code simpler. But it left out function graph filtering which also depended on that code. The function graph filtering still needs to use global_ops as the filter otherwise it won't filter at all. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 34b098bfded4..9eb1aa03a18d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5008,12 -5008,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, return NOTIFY_DONE; } -/* Just a place holder for function graph */ -static struct ftrace_ops fgraph_ops __read_mostly = { - .func = ftrace_stub, - .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) { if (!ftrace_ops_test(&global_ops, trace->func, NULL)) @@ -5076,7 +5070,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); - ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); + /* Function graph doesn't use the .func field of global_ops */ + global_ops.flags |= FTRACE_OPS_FL_STUB; + + ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -5094,7 +5091,8 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); + ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); + global_ops.flags &= ~FTRACE_OPS_FL_STUB; unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -- cgit v1.2.3 From 69dfa00ccb72a37f3810687ca110e5a8154c6eed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup: make flags and subsys_masks unsigned int There's no reason to use atomic bitops for cgroup_subsys_state->flags, cgroup_root->flags and various subsys_masks. This patch updates those to use bitwise and/or operations instead and converts them from unsigned long to unsigned int. This makes the fields occupy (marginally) smaller space and makes it clear that they don't require atomicity. This patch doesn't cause any behavior difference. 
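The mechanical change is easiest to see side by side; a small sketch with a made-up bit number, mirroring the cgroup.c hunks that follow.

#include <linux/bitops.h>

static void mask_conversion_sketch(void)
{
        unsigned long old_mask = 0;     /* atomic bitops want an unsigned long */
        unsigned int new_mask = 0;      /* plain bitwise ops fit in an unsigned int */
        int ssid = 3;                   /* made-up subsystem bit */

        /* before: atomic helpers */
        set_bit(ssid, &old_mask);
        if (test_bit(ssid, &old_mask))
                clear_bit(ssid, &old_mask);

        /* after: ordinary or/and, no atomicity implied or required */
        new_mask |= (1 << ssid);
        if (new_mask & (1 << ssid))
                new_mask &= ~(1 << ssid);
}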
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3873267c9ee3..21667f396a1e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -181,7 +181,7 @@ static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); + unsigned int ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -963,7 +963,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; @@ -1079,7 +1079,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i; @@ -1087,15 +1087,14 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) for_each_subsys(ss, i) { struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; list_for_each_entry(cfts, &ss->cfts, node) cgroup_addrm_files(cgrp, cfts, false); } } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) { struct cgroup_subsys *ss; int ssid, i, ret; @@ -1128,7 +1127,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue. 
*/ if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", + pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", ret, ss_mask); pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } @@ -1214,8 +1213,8 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; - unsigned long flags; + unsigned int subsys_mask; + unsigned int flags; char *release_agent; bool cpuset_clone_children; char *name; @@ -1227,12 +1226,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = (unsigned long)-1; + unsigned int mask = -1U; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_cgrp_id); + mask = ~(1U << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1313,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) return -EINVAL; - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); one_ss = true; break; @@ -1342,7 +1341,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) if (!ss->disabled) - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); /* * We either have to specify by name or by subsystems. (So @@ -1373,7 +1372,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + unsigned int added_mask, removed_mask; if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("sane_behavior: remount is not allowed\n"); @@ -1398,7 +1397,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; @@ -1522,7 +1521,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -2507,7 +2506,7 @@ out_finish: static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, struct cftype *cft, char *buffer) { - unsigned long enable_req = 0, disable_req = 0, enable, disable; + unsigned int enable_req = 0, disable_req = 0, enable, disable; struct cgroup *cgrp = dummy_css->cgroup, *child; struct cgroup_subsys *ss; char *tok, *p; @@ -3998,7 +3997,7 @@ static struct cftype cgroup_base_files[] = { * * On failure, no file is added. 
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i, ret = 0; @@ -4007,7 +4006,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) for_each_subsys(ss, i) { struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; list_for_each_entry(cfts, &ss->cfts, node) { -- cgit v1.2.3 From 7d699ddb2b181a2c76e5ea18b1bdf102c4bebe4b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup, memcg: allocate cgroup ID from 1 Currently, cgroup->id is allocated from 0, which is always assigned to the root cgroup; unfortunately, memcg wants to use ID 0 to indicate invalid IDs and ends up incrementing all IDs by one. It's reasonable to reserve 0 for special purposes. This patch updates cgroup core so that ID 0 is not used and the root cgroups get ID 1. The ID incrementing is removed from memcg. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Acked-by: Li Zefan --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 21667f396a1e..3fa0463e74bb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1531,7 +1531,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); if (ret < 0) goto out; root_cgrp->id = ret; @@ -4225,7 +4225,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { err = -ENOMEM; goto err_unlock; -- cgit v1.2.3 From 6fa4918d03c39351aef3573ac3e7958d6a5ad9b6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup: protect cgroup_root->cgroup_idr with a spinlock Currently, cgroup_root->cgroup_idr is protected by cgroup_mutex, which ends up requiring cgroup_put() to be invoked under sleepable context. This is okay for now but is an unusual requirement and we'll soon add css->id which will have the same problem but won't be able to simply grab cgroup_mutex as removal will have to happen from css_release() which can't sleep. Introduce cgroup_idr_lock and idr_alloc/replace/remove() wrappers which protect the idr operations with the lock and use them for cgroup_root->cgroup_idr. cgroup_put() no longer needs to grab cgroup_mutex and css_from_id() is updated to always require RCU read lock instead of either RCU read lock or cgroup_mutex, which doesn't affect the existing users. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 51 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3fa0463e74bb..7cb9c0847445 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -99,6 +99,12 @@ static DEFINE_MUTEX(cgroup_mutex); static DECLARE_RWSEM(css_set_rwsem); #endif +/* + * Protects cgroup_idr so that IDs can be released without grabbing + * cgroup_mutex. 
+ */ +static DEFINE_SPINLOCK(cgroup_idr_lock); + /* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. @@ -190,6 +196,37 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); +/* IDR wrappers which synchronize using cgroup_idr_lock */ +static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int ret; + + idr_preload(gfp_mask); + spin_lock(&cgroup_idr_lock); + ret = idr_alloc(idr, ptr, start, end, gfp_mask); + spin_unlock(&cgroup_idr_lock); + idr_preload_end(); + return ret; +} + +static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) +{ + void *ret; + + spin_lock(&cgroup_idr_lock); + ret = idr_replace(idr, ptr, id); + spin_unlock(&cgroup_idr_lock); + return ret; +} + +static void cgroup_idr_remove(struct idr *idr, int id) +{ + spin_lock(&cgroup_idr_lock); + idr_remove(idr, id); + spin_unlock(&cgroup_idr_lock); +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -1058,9 +1095,7 @@ static void cgroup_put(struct cgroup *cgrp) * per-subsystem and moved to css->id so that lookups are * successful until the target css is released. */ - mutex_lock(&cgroup_mutex); - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - mutex_unlock(&cgroup_mutex); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; call_rcu(&cgrp->rcu_head, cgroup_free_rcu); @@ -1531,7 +1566,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); + ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); if (ret < 0) goto out; root_cgrp->id = ret; @@ -4225,7 +4260,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); + cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { err = -ENOMEM; goto err_unlock; @@ -4268,7 +4303,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, * @cgrp is now fully operational. If something fails after this * point, it'll be released via the normal destruction path. */ - idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); err = cgroup_kn_set_ugid(kn); if (err) @@ -4302,7 +4337,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, return 0; err_free_id: - idr_remove(&root->cgroup_idr, cgrp->id); + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: @@ -5162,7 +5197,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - cgroup_assert_mutexes_or_rcu_locked(); + WARN_ON_ONCE(!rcu_read_lock_held()); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) -- cgit v1.2.3 From a2bed8209a3afc3b2cf1c28383fb48155c1fea46 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup: use RCU free in create_css() failure path Currently, when create_css() fails in the middle, the half-initialized css is freed by invoking cgroup_subsys->css_free() directly. 
This patch updates the function so that it invokes the RCU free path instead. As the RCU free path puts the parent css and owning cgroup, their references are now acquired right after a new css is successfully allocated. This doesn't make any visible difference now but is to enable implementing css->id and RCU protected lookup by such IDs. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7cb9c0847445..0e2c401ed7b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4185,12 +4185,14 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (IS_ERR(css)) return PTR_ERR(css); + init_css(css, ss, cgrp); + cgroup_get(cgrp); + css_get(css->parent); + err = percpu_ref_init(&css->refcnt, css_release); if (err) goto err_free_css; - init_css(css, ss, cgrp); - err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) goto err_free_percpu_ref; @@ -4199,9 +4201,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_clear_dir; - cgroup_get(cgrp); - css_get(css->parent); - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", @@ -4218,7 +4217,7 @@ err_clear_dir: err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); err_free_css: - ss->css_free(css); + call_rcu(&css->rcu_head, css_free_rcu_fn); return err; } -- cgit v1.2.3 From ddfcadab35dda6e5bc23ccf1c3055ecb63a71e49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup: update init_css() into init_and_link_css() init_css() takes the cgroup the new css belongs to as an argument and initializes the new css's ->cgroup and ->parent pointers but doesn't acquire the matching reference counts. After the previous patch, create_css() puts init_css() and reference acquisition right next to each other. Let's move reference acquisition into init_css() and rename the function to init_and_link_css(). This makes sense and is easier to follow. This makes the root csses hold a reference on cgrp_dfl_root.cgrp, which is harmless. 
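In short, the reference acquisition moves inside the helper; sketched from the create_css() hunk that follows.

/* before: callers paired init_css() with explicit gets */
init_css(css, ss, cgrp);
cgroup_get(cgrp);
css_get(css->parent);

/* after: one call initializes the css and takes both references */
init_and_link_css(css, ss, cgrp);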
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0e2c401ed7b9..f1c98c527b2d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4109,17 +4109,21 @@ static void css_release(struct percpu_ref *ref) call_rcu(&css->rcu_head, css_free_rcu_fn); } -static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_and_link_css(struct cgroup_subsys_state *css, + struct cgroup_subsys *ss, struct cgroup *cgrp) { + cgroup_get(cgrp); + css->cgroup = cgrp; css->ss = ss; css->flags = 0; - if (cgrp->parent) + if (cgrp->parent) { css->parent = cgroup_css(cgrp->parent, ss); - else + css_get(css->parent); + } else { css->flags |= CSS_ROOT; + } BUG_ON(cgroup_css(cgrp, ss)); } @@ -4185,9 +4189,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (IS_ERR(css)) return PTR_ERR(css); - init_css(css, ss, cgrp); - cgroup_get(cgrp); - css_get(css->parent); + init_and_link_css(css, ss, cgrp); err = percpu_ref_init(&css->refcnt, css_release); if (err) @@ -4656,7 +4658,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_css(css, ss, &cgrp_dfl_root.cgrp); + init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is -- cgit v1.2.3 From 15a4c835e4ed3e60dd68727cd1907e3dd89563f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup, memcg: implement css->id and convert css_from_id() to use it Until now, cgroup->id has been used to identify all the associated csses and css_from_id() takes cgroup ID and returns the matching css by looking up the cgroup and then dereferencing the css associated with it; however, now that the lifetimes of cgroup and css are separate, this is incorrect and breaks on the unified hierarchy when a controller is disabled and enabled back again before the previous instance is released. This patch adds css->id which is a subsystem-unique ID and converts css_from_id() to look up by the new css->id instead. memcg is the only user of css_from_id() and also converted to use css->id instead. For traditional hierarchies, this shouldn't make any functional difference. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jianyu Zhan Acked-by: Li Zefan --- kernel/cgroup.c | 59 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1c98c527b2d..a1a20e8c973a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -100,8 +100,8 @@ static DECLARE_RWSEM(css_set_rwsem); #endif /* - * Protects cgroup_idr so that IDs can be released without grabbing - * cgroup_mutex. + * Protects cgroup_idr and css_idr so that IDs can be released without + * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); @@ -1089,12 +1089,6 @@ static void cgroup_put(struct cgroup *cgrp) if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) return; - /* - * XXX: cgrp->id is only used to look up css's. 
As cgroup and - * css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. - */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4104,8 +4098,11 @@ static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); + struct cgroup_subsys *ss = css->ss; + + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + cgroup_idr_remove(&ss->css_idr, css->id); - RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4195,9 +4192,17 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_free_css; + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); + if (err < 0) + goto err_free_percpu_ref; + css->id = err; + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) - goto err_free_percpu_ref; + goto err_free_id; + + /* @css is ready to be brought online now, make it visible */ + cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) @@ -4216,6 +4221,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) err_clear_dir: cgroup_clear_dir(css->cgroup, 1 << css->ss->id); +err_free_id: + cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); err_free_css: @@ -4642,7 +4649,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .rename = cgroup_rename, }; -static void __init cgroup_init_subsys(struct cgroup_subsys *ss) +static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; @@ -4651,6 +4658,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); + idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ @@ -4659,6 +4667,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + if (early) { + /* idr_alloc() can't be called safely during early init */ + css->id = 1; + } else { + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); + BUG_ON(css->id < 0); + } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4709,7 +4724,7 @@ int __init cgroup_init_early(void) ss->name = cgroup_subsys_name[i]; if (ss->early_init) - cgroup_init_subsys(ss); + cgroup_init_subsys(ss, true); } return 0; } @@ -4741,8 +4756,16 @@ int __init cgroup_init(void) mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { - if (!ss->early_init) - cgroup_init_subsys(ss); + if (ss->early_init) { + struct cgroup_subsys_state *css = + init_css_set.subsys[ss->id]; + + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, + GFP_KERNEL); + BUG_ON(css->id < 0); + } else { + cgroup_init_subsys(ss, false); + } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); @@ -5196,14 +5219,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { - struct cgroup *cgrp; - WARN_ON_ONCE(!rcu_read_lock_held()); - - cgrp = idr_find(&ss->root->cgroup_idr, id); - if (cgrp) - return cgroup_css(cgrp, ss); - return NULL; + return idr_find(&ss->css_idr, id); } #ifdef CONFIG_CGROUP_DEBUG -- cgit 
v1.2.3 From 60106946ca7f63396680130b25511ccf6b7d5bff Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 5 May 2014 20:08:13 +0200 Subject: kernel/cgroup.c: fix 2 kernel-doc warnings Fix typo and variable name. tj: Updated @cgrp argument description in cgroup_destroy_css_killed() Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a1a20e8c973a..07815ef7b1f6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1888,7 +1888,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) /** * cgroup_task_migrate - move a task from one cgroup to another. - * @old_cgrp; the cgroup @tsk is being migrated from + * @old_cgrp: the cgroup @tsk is being migrated from * @tsk: the task being migrated * @new_cset: the new css_set @tsk is being attached to * @@ -4586,7 +4586,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /** * cgroup_destroy_css_killed - the second step of cgroup destruction - * @work: cgroup->destroy_free_work + * @cgrp: the cgroup whose csses have just finished offlining * * This function is invoked from a work item for a cgroup which is being * destroyed after all css's are offlined and performs the rest of -- cgit v1.2.3 From bdffd893a0e9c431304142d12d9a0a21d365c502 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 29 Apr 2014 14:17:40 -0500 Subject: tracing: Replace __get_cpu_var uses with this_cpu_ptr Replace uses of &__get_cpu_var for address calculation with this_cpu_ptr. Link: http://lkml.kernel.org/p/alpine.DEB.2.10.1404291415560.18364@gentwo.org Acked-by: Masami Hiramatsu Signed-off-by: Christoph Lameter Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9eb1aa03a18d..38e5cf73b9ae 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -822,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; @@ -853,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4c392c8238bf..05431696b10c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1726,7 +1726,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ barrier(); if (use_stack == 1) { - trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; + trace.entries = this_cpu_ptr(ftrace_stack.calls); trace.max_entries = FTRACE_STACK_MAX_ENTRIES; if (regs) -- cgit v1.2.3 From fc34ac1dc52f02a7cf1862960c24771e4883e39b Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 5 May 2014 19:46:55 +0200 Subject: kernel/cpuset.c: kernel-doc fixes This patch also converts seq_printf to seq_puts Cc: Andrew Morton Signed-off-by: Fabian Frederick Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 
3d54c418bd06..1d8c0472bfb0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -890,6 +890,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider + * @trialcs: trial cpuset * @buf: buffer of cpu numbers written to this cpuset */ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, @@ -2536,7 +2537,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, /** * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed - * @task: pointer to task_struct of some task. + * @tsk: pointer to task_struct of some task. * * Description: Prints @task's name, cpuset name, and cached copy of its * mems_allowed to the kernel log. @@ -2646,10 +2647,10 @@ out: /* Display task mems_allowed in /proc//status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Mems_allowed:\t"); + seq_puts(m, "Mems_allowed:\t"); seq_nodemask(m, &task->mems_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Mems_allowed_list:\t"); + seq_puts(m, "\n"); + seq_puts(m, "Mems_allowed_list:\t"); seq_nodemask_list(m, &task->mems_allowed); - seq_printf(m, "\n"); + seq_puts(m, "\n"); } -- cgit v1.2.3 From 12d3089c192c3a762fed098988d99f4b8ff33266 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 5 May 2014 19:49:00 +0200 Subject: kernel/cpuset.c: convert printk to pr_foo() Cc: Andrew Morton Signed-off-by: Fabian Frederick Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1d8c0472bfb0..7c0e8da59e26 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -696,11 +696,8 @@ restart: if (nslot == ndoms) { static int warnings = 10; if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); + pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", + nslot, ndoms, csn, i, apn); warnings--; } continue; @@ -2018,7 +2015,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) parent = parent_cs(parent); if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); + pr_err("cpuset: failed to transfer tasks out of empty cpuset "); pr_cont_cgroup_name(cs->css.cgroup); pr_cont("\n"); } @@ -2555,7 +2552,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) cgrp = task_cs(tsk)->css.cgroup; nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); - printk(KERN_INFO "%s cpuset=", tsk->comm); + pr_info("%s cpuset=", tsk->comm); pr_cont_cgroup_name(cgrp); pr_cont(" mems_allowed=%s\n", cpuset_nodelist); -- cgit v1.2.3 From 2d916033a318d7e763eb099c69600d5dcd1ccb6b Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 12 May 2014 13:59:35 -0400 Subject: kernel/workqueue.c: pr_warning/pr_warn & printk/pr_info tj: Refreshed on top of wq/for-3.16. 
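For readers unfamiliar with the pr_*() helpers these cleanups keep converting to: they are thin wrappers around printk() with the log level folded in, and they honour a per-file pr_fmt() prefix, which is what makes them preferable to open-coded KERN_* strings. A small usage sketch follows; the prefix and message text are made up.

/* Hypothetical per-file prefix; conventionally defined before the includes. */
#define pr_fmt(fmt) "exampledrv: " fmt

#include <linux/printk.h>

static void example_report(int nslot, int ndoms)
{
        /* equivalent to printk(KERN_WARNING pr_fmt(...)), just shorter */
        pr_warn("confused: nslot %d, ndoms %d\n", nslot, ndoms);

        /* pr_cont() continues the previous line; no level or prefix added */
        pr_info("state:");
        pr_cont(" ok\n");
}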
Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c3f076f0f8fd..c8411085466f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4117,8 +4117,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { - pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", - wq->name); + pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + wq->name); mutex_lock(&wq->mutex); goto use_dfl_pwq; } @@ -4568,7 +4568,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + pr_info("%sWorkqueue: %s %pf", log_lvl, name, fn); if (desc[0]) pr_cont(" (%s)", desc); pr_cont("\n"); -- cgit v1.2.3 From 0cee8b7786467907e12d1d8f872e6dc73bc95204 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: fix offlining child waiting in cgroup_subtree_control_write() cgroup_subtree_control_write() waits for offline to complete child-by-child before enabling a controller; however, it has a couple bugs. * It doesn't initialize the wait_queue_t. This can lead to infinite hang on the following schedule() among other things. * It forgets to pin the child before releasing cgroup_tree_mutex and performing schedule(). The child may already be gone by the time it wakes up and invokes finish_wait(). Pin the child being waited on. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9db1a9629a5c..95fc66b16091 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2594,16 +2594,18 @@ retry: * cases, wait till it's gone using offline_waitq. */ cgroup_for_each_live_child(child, cgrp) { - wait_queue_t wait; + DEFINE_WAIT(wait); if (!cgroup_css(child, ss)) continue; + cgroup_get(child); prepare_to_wait(&child->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&cgroup_tree_mutex); schedule(); finish_wait(&child->offline_waitq, &wait); + cgroup_put(child); goto retry; } -- cgit v1.2.3 From 54504e977ceee0bea6fbe8b632eceea771b18c6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: cgroup_idr_lock should be bh cgroup_idr_remove() can be invoked from bh leading to lockdep detecting possible AA deadlock (IN_BH/ON_BH). Make the lock bh-safe. 
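The underlying rule is the standard one for a lock that can also be taken from bottom-half (softirq) context: every process-context acquisition must disable bottom halves, otherwise a softirq arriving on the same CPU while the lock is held re-enters the lock and deadlocks, which is the IN_BH/ON_BH scenario lockdep reports. A generic sketch of the pattern, with a hypothetical lock and list:

#include <linux/spinlock.h>
#include <linux/list.h>

static DEFINE_SPINLOCK(example_lock);   /* hypothetical lock shared with BH */
static LIST_HEAD(example_list);

/* process context: the _bh variants keep softirqs from preempting the holder */
static void example_add(struct list_head *node)
{
        spin_lock_bh(&example_lock);
        list_add_tail(node, &example_list);
        spin_unlock_bh(&example_lock);
}

/* softirq context: already in BH, so a plain spin_lock is sufficient */
static void example_remove_from_bh(struct list_head *node)
{
        spin_lock(&example_lock);
        list_del(node);
        spin_unlock(&example_lock);
}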
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95fc66b16091..e2ff925e6ee8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -203,9 +203,9 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, int ret; idr_preload(gfp_mask); - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } @@ -214,17 +214,17 @@ static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); } /** -- cgit v1.2.3 From 0ab7a60dea71c285dfbb65e344d895b9c4f7bcb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: css_release() shouldn't clear cgroup->subsys[] c1a71504e971 ("cgroup: don't recycle cgroup id until all csses' have been destroyed") made cgroup ID persist until a cgroup is released and add cgroup->subsys[] clearing to css_release() so that css_from_id() doesn't return a css which has already been released which happens before cgroup release; however, the right change here was updating offline_css() to clear cgroup->subsys[] which was done by e32978031016 ("cgroup: cgroup->subsys[] should be cleared after the css is offlined") instead of clearing it from css_release(). We're now clearing cgroup->subsys[] twice. This is okay for traditional hierarchies as a css's lifetime is the same as its cgroup's; however, this confuses unified hierarchy and turning on and off a controller repeatedly using "cgroup.subtree_control" can lead to an oops like the following which happens because cgroup->subsys[] is incorrectly cleared asynchronously by css_release(). 
BUG: unable to handle kernel NULL pointer dereference at 00000000000000 08 IP: [] kill_css+0x21/0x1c0 PGD 1170d067 PUD f0ab067 PMD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 2 PID: 459 Comm: bash Not tainted 3.15.0-rc2-work+ #5 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff880009296710 ti: ffff88000e198000 task.ti: ffff88000e198000 RIP: 0010:[] [] kill_css+0x21/0x1c0 RSP: 0018:ffff88000e199dc8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 0000000000000001 RSI: ffffffff8238a968 RDI: ffff880009296f98 RBP: ffff88000e199de0 R08: 0000000000000001 R09: 02b0000000000000 R10: 0000000000000000 R11: ffff880009296fc0 R12: 0000000000000001 R13: ffff88000db6fc58 R14: 0000000000000001 R15: ffff8800139dcc00 FS: 00007ff9160c5740(0000) GS:ffff88001fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 0000000013947000 CR4: 00000000000006e0 Stack: ffff88000e199de0 ffffffff82389160 0000000000000001 ffff88000e199e80 ffffffff8113537f 0000000000000007 ffff88000e74af00 ffff88000e199e48 ffff880009296710 ffff88000db6fc00 ffffffff8239c100 0000000000000002 Call Trace: [] cgroup_subtree_control_write+0x85f/0xa00 [] cgroup_file_write+0x38/0x1d0 [] kernfs_fop_write+0xe7/0x170 [] vfs_write+0xb6/0x1c0 [] SyS_write+0x4d/0xc0 [] system_call_fastpath+0x16/0x1b Code: 5c 41 5d 41 5e 41 5f 5d c3 90 0f 1f 44 00 00 55 48 89 e5 41 54 53 48 89 fb 48 83 ec 08 8b 05 37 ad 29 01 85 c0 0f 85 df 00 00 00 <48> 8b 43 08 48 8b 3b be 01 00 00 00 8b 48 5c d3 e6 e8 49 ff ff RIP [] kill_css+0x21/0x1c0 RSP CR2: 0000000000000008 ---[ end trace e7aae1f877c4e1b4 ]--- Remove the unnecessary cgroup->subsys[] clearing from css_release(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2ff925e6ee8..35daf892b6e6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4102,7 +4102,6 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); struct cgroup_subsys *ss = css->ss; - RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); cgroup_idr_remove(&ss->css_idr, css->id); call_rcu(&css->rcu_head, css_free_rcu_fn); -- cgit v1.2.3 From d37167ab7b3d67d53519585a44c47416e6758ed2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: update and fix parsing of "cgroup.subtree_control" I was confused that strsep() was equivalent to strtok_r() in skipping over consecutive delimiters. strsep() just splits at the first occurrence of one of the delimiters which makes the parsing very inflexible, which makes allowing multiple whitespace chars as delimters kinda moot. Let's just be consistently strict and require list of tokens separated by spaces. This is what Documentation/cgroups/unified-hierarchy.txt describes too. Also, parsing may access beyond the end of the string if the string ends with spaces or is zero-length. Make sure it skips zero-length tokens. Note that this also ensures that the parser doesn't puke on multiple consecutive spaces. v2: Add zero-length token skipping. v3: Added missing space after "==". Spotted by Li. 
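The behaviour being worked around is worth spelling out: strsep() returns a (possibly empty) token for every delimiter it meets and never collapses runs of delimiters the way strtok_r() does, so for the input "one  two " it hands back "one", "", "two" and "". A parser that does not skip zero-length tokens ends up interpreting those empty strings. A small illustrative loop, with made-up names:

#include <linux/string.h>
#include <linux/printk.h>

/* Illustrative only: @buf must be writable, it is modified in place. */
static void example_tokenize(char *buf)
{
        char *tok;

        while ((tok = strsep(&buf, " "))) {
                if (tok[0] == '\0')
                        continue;       /* consecutive or trailing spaces */
                pr_info("token: '%s'\n", tok);
        }
}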
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 35daf892b6e6..250def0694b4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2542,11 +2542,13 @@ static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, int ssid, ret; /* - * Parse input - white space separated list of subsystem names - * prefixed with either + or -. + * Parse input - space separated list of subsystem names prefixed + * with either + or -. */ p = buffer; - while ((tok = strsep(&p, " \t\n"))) { + while ((tok = strsep(&p, " "))) { + if (tok[0] == '\0') + continue; for_each_subsys(ss, ssid) { if (ss->disabled || strcmp(tok + 1, ss->name)) continue; -- cgit v1.2.3 From 7d331fa985d3a39d5b8cb60caf016d3e53e57c91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:00 -0400 Subject: cgroup: use restart_syscall() for retries after offline waits in cgroup_subtree_control_write() After waiting for a child to finish offline, cgroup_subtree_control_write() jumps up to retry from after the input parsing and active protection breaking. This retry makes the scheduled locking update - removal of cgroup_tree_mutex - more difficult. Let's simplify it by returning with restart_syscall() for retries. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 250def0694b4..3251cc9070fa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2535,7 +2535,7 @@ out_finish: static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, struct cftype *cft, char *buffer) { - unsigned int enable_req = 0, disable_req = 0, enable, disable; + unsigned int enable = 0, disable = 0; struct cgroup *cgrp = dummy_css->cgroup, *child; struct cgroup_subsys *ss; char *tok, *p; @@ -2554,11 +2554,11 @@ static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, continue; if (*tok == '+') { - enable_req |= 1 << ssid; - disable_req &= ~(1 << ssid); + enable |= 1 << ssid; + disable &= ~(1 << ssid); } else if (*tok == '-') { - disable_req |= 1 << ssid; - enable_req &= ~(1 << ssid); + disable |= 1 << ssid; + enable &= ~(1 << ssid); } else { return -EINVAL; } @@ -2576,9 +2576,6 @@ static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, */ cgroup_get(cgrp); kernfs_break_active_protection(cgrp->control_kn); -retry: - enable = enable_req; - disable = disable_req; mutex_lock(&cgroup_tree_mutex); @@ -2608,7 +2605,9 @@ retry: schedule(); finish_wait(&child->offline_waitq, &wait); cgroup_put(child); - goto retry; + + ret = restart_syscall(); + goto out_unbreak; } /* unavailable or not enabled on the parent? */ @@ -2692,6 +2691,7 @@ out_unlock: mutex_unlock(&cgroup_mutex); out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); +out_unbreak: kernfs_unbreak_active_protection(cgrp->control_kn); cgroup_put(cgrp); return ret; -- cgit v1.2.3 From 46cfeb043b04f5878154bea36714709d46028495 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:00 -0400 Subject: cgroup: use release_agent_path_lock in cgroup_release_agent_show() release_path is now protected by release_agent_path_lock to allow accessing it without grabbing cgroup_mutex; however, cgroup_release_agent_show() was still grabbing cgroup_mutex. 
Let's convert it to release_agent_path_lock so that we don't have to worry about this one for the planned locking updates. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3251cc9070fa..7633703e9baf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2373,11 +2373,10 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); - mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.2.3 From ec903c0c858e4963a9e0724bdcadfa837253341c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:01 -0400 Subject: cgroup: rename css_tryget*() to css_tryget_online*() Unlike the more usual refcnting, what css_tryget() provides is the distinction between online and offline csses instead of protection against upping a refcnt which already reached zero. cgroup is planning to provide actual tryget which fails if the refcnt already reached zero. Let's rename the existing trygets so that they clearly indicate that they're onliness. I thought about keeping the existing names as-are and introducing new names for the planned actual tryget; however, given that each controller participates in the synchronization of the online state, it seems worthwhile to make it explicit that these functions are about on/offline state. Rename css_tryget() to css_tryget_online() and css_tryget_from_dir() to css_tryget_online_from_dir(). This is pure rename. v2: cgroup_freezer grew new usages of css_tryget(). Update accordingly. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Vivek Goyal Cc: Jens Axboe Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/cgroup.c | 40 ++++++++++++++++++++-------------------- kernel/cgroup_freezer.c | 4 ++-- kernel/cpuset.c | 6 +++--- kernel/events/core.c | 3 ++- 4 files changed, 27 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7633703e9baf..671d8a6dae37 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3771,7 +3771,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) /* * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_from_dir(), + * @kn->priv's validity. For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); @@ -4060,9 +4060,9 @@ err: * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs - * and thus css_tryget() is guaranteed to fail, the css can be offlined - * by invoking offline_css(). After offlining, the base ref is put. - * Implemented in css_killed_work_fn(). + * and thus css_tryget_online() is guaranteed to fail, the css can be + * offlined by invoking offline_css(). After offlining, the base ref is + * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. 
css_release() schedules the @@ -4386,7 +4386,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. */ static void css_killed_work_fn(struct work_struct *work) { @@ -4398,8 +4398,8 @@ static void css_killed_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. + * css_tryget_online() is guaranteed to fail now. Tell subsystems + * to initate destruction. */ offline_css(css); @@ -4440,8 +4440,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref) * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. + * asynchronously once css_tryget_online() is guaranteed to fail and when + * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { @@ -4462,7 +4462,7 @@ static void kill_css(struct cgroup_subsys_state *css) /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via - * css_tryget(). We can't simply call percpu_ref_kill() and + * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * @@ -4478,9 +4478,9 @@ static void kill_css(struct cgroup_subsys_state *css) * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget() won't succeed by the time ->css_offline() is - * invoked. To satisfy all the requirements, destruction is implemented in - * the following two steps. + * guarantee that css_tryget_online() won't succeed by the time + * ->css_offline() is invoked. To satisfy all the requirements, + * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of @@ -4574,9 +4574,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * There are two control paths which try to determine cgroup from * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_from_dir(). Those are supported by RCU protecting - * clearing of cgrp->kn->priv backpointer, which should happen - * after all files under it have been removed. + * css_tryget_online_from_dir(). Those are supported by RCU + * protecting clearing of cgrp->kn->priv backpointer, which should + * happen after all files under it have been removed. */ kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); @@ -5173,7 +5173,7 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * @@ -5181,8 +5181,8 @@ __setup("cgroup_disable=", cgroup_disable); * to get the corresponding css and return it. 
If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup_subsys_state *css = NULL; @@ -5204,7 +5204,7 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, if (cgrp) css = cgroup_css(cgrp, ss); - if (!css || !css_tryget(css)) + if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 345628c78b5b..0398f7e9ac81 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -304,7 +304,7 @@ static int freezer_read(struct seq_file *m, void *v) /* update states bottom-up */ css_for_each_descendant_post(pos, css) { - if (!css_tryget(pos)) + if (!css_tryget_online(pos)) continue; rcu_read_unlock(); @@ -404,7 +404,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - if (!css_tryget(pos)) + if (!css_tryget_online(pos)) continue; rcu_read_unlock(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7c0e8da59e26..37ca0a5c226d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -872,7 +872,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) continue; } } - if (!css_tryget(&cp->css)) + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); @@ -1108,7 +1108,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) continue; } } - if (!css_tryget(&cp->css)) + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); @@ -2153,7 +2153,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (cs == &top_cpuset || !css_tryget(&cs->css)) + if (cs == &top_cpuset || !css_tryget_online(&cs->css)) continue; rcu_read_unlock(); diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..077968d19b8a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -607,7 +607,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); + css = css_tryget_online_from_dir(f.file->f_dentry, + &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; -- cgit v1.2.3 From b41686401e501430ffe93b575ef7959d2ecc6f2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: implement cftype->write() During the recent conversion to kernfs, cftype's seq_file operations are updated so that they are directly mapped to kernfs operations and thus can fully access the associated kernfs and cgroup contexts; however, write path hasn't seen similar updates and none of the existing write operations has access to, for example, the associated kernfs_open_file. Let's introduce a new operation cftype->write() which maps directly to the kernfs write operation and has access to all the arguments and contexts. This will replace ->write_string() and ->trigger() and ease manipulation of kernfs active protection from cgroup file operations. 
Two accessors - of_cft() and of_css() - are introduced to enable accessing the associated cgroup context from cftype->write() which only takes kernfs_open_file for the context information. The accessors for seq_file operations - seq_cft() and seq_css() - are rewritten to wrap the of_ accessors. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 671d8a6dae37..a16f91d12f4e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -283,11 +283,10 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } -struct cgroup_subsys_state *seq_css(struct seq_file *seq) +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { - struct kernfs_open_file *of = seq->private; struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = seq_cft(seq); + struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). @@ -302,7 +301,7 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq) else return &cgrp->dummy_css; } -EXPORT_SYMBOL_GPL(seq_css); +EXPORT_SYMBOL_GPL(of_css); /** * cgroup_is_descendant - test ancestry @@ -1035,8 +1034,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write_string || - cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write || + cft->write_string || cft->trigger) mode |= S_IWUSR; return mode; @@ -2726,6 +2725,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, struct cgroup_subsys_state *css; int ret; + if (cft->write) + return cft->write(of, buf, nbytes, off); + /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and -- cgit v1.2.3 From 451af504df0c62f695a69b83c250486e77c66378 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: replace cftype->write_string() with cftype->write() Convert all cftype->write_string() users to the new cftype->write() which maps directly to kernfs write operation and has full access to kernfs and cgroup contexts. The conversions are mostly mechanical. * @css and @cft are accessed using of_css() and of_cft() accessors respectively instead of being specified as arguments. * Should return @nbytes on success instead of 0. * @buf is not trimmed automatically. Trim if necessary. Note that blkcg and netprio don't need this as the parsers already handle whitespaces. cftype->write_string() has no user left after the conversions and removed. While at it, remove unnecessary local variable @p in cgroup_subtree_control_write() and stale comment about CGROUP_LOCAL_BUFFER_SIZE in cgroup_freezer.c. This patch doesn't introduce any visible behavior changes. v2: netprio was missing from conversion. Converted. Signed-off-by: Tejun Heo Acked-by: Aristeu Rozanski Acked-by: Vivek Goyal Acked-by: Li Zefan Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Neil Horman Cc: "David S. 
Miller" --- kernel/cgroup.c | 38 +++++++++++++++++++------------------- kernel/cgroup_freezer.c | 20 +++++++++----------- kernel/cpuset.c | 16 +++++++++------- 3 files changed, 37 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a16f91d12f4e..2a88ce7b24b6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1035,7 +1035,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write || - cft->write_string || cft->trigger) + cft->trigger) mode |= S_IWUSR; return mode; @@ -2352,20 +2352,21 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, return attach_task_by_pid(css->cgroup, tgid, true); } -static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cgroup_root *root = css->cgroup->root; + struct cgroup *cgrp = of_css(of)->cgroup; + struct cgroup_root *root = cgrp->root; BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); - if (!cgroup_lock_live_group(css->cgroup)) + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(root->release_agent_path, buffer, + strlcpy(root->release_agent_path, strstrip(buf), sizeof(root->release_agent_path)); spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_mutex); - return 0; + return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) @@ -2530,21 +2531,22 @@ out_finish: } /* change the enabled child controllers for a cgroup in the default hierarchy */ -static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { unsigned int enable = 0, disable = 0; - struct cgroup *cgrp = dummy_css->cgroup, *child; + struct cgroup *cgrp = of_css(of)->cgroup, *child; struct cgroup_subsys *ss; - char *tok, *p; + char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. 
*/ - p = buffer; - while ((tok = strsep(&p, " "))) { + buf = strstrip(buf); + while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; for_each_subsys(ss, ssid) { @@ -2692,7 +2694,7 @@ out_unlock_tree: out_unbreak: kernfs_unbreak_active_protection(cgrp->control_kn); cgroup_put(cgrp); - return ret; + return ret ?: nbytes; err_undo_css: cgrp->child_subsys_mask &= ~enable; @@ -2738,9 +2740,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); - if (cft->write_string) { - ret = cft->write_string(css, cft, strstrip(buf)); - } else if (cft->write_u64) { + if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) @@ -3984,7 +3984,7 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.subtree_control", .flags = CFTYPE_ONLY_ON_DFL, .seq_show = cgroup_subtree_control_show, - .write_string = cgroup_subtree_control_write, + .write = cgroup_subtree_control_write, }, { .name = "cgroup.populated", @@ -4018,7 +4018,7 @@ static struct cftype cgroup_base_files[] = { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, + .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 0398f7e9ac81..6b4e60e33a9a 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -73,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task) return ret; } -/* - * cgroups_write_string() limits the size of freezer state strings to - * CGROUP_LOCAL_BUFFER_SIZE - */ static const char *freezer_state_strs(unsigned int state) { if (state & CGROUP_FROZEN) @@ -423,20 +419,22 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) mutex_unlock(&freezer_mutex); } -static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { bool freeze; - if (strcmp(buffer, freezer_state_strs(0)) == 0) + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) freeze = true; else return -EINVAL; - freezer_change_state(css_freezer(css), freeze); - return 0; + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; } static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, @@ -460,7 +458,7 @@ static struct cftype files[] = { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = freezer_read, - .write_string = freezer_write, + .write = freezer_write, }, { .name = "self_freezing", diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 37ca0a5c226d..2f4b08b8db24 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1603,13 +1603,15 @@ out_unlock: /* * Common handling for a write to a "cpus" or "mems" file. 
*/ -static int cpuset_write_resmask(struct cgroup_subsys_state *css, - struct cftype *cft, char *buf) +static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cpuset *cs = css_cs(css); + struct cpuset *cs = css_cs(of_css(of)); struct cpuset *trialcs; int retval = -ENODEV; + buf = strstrip(buf); + /* * CPU or memory hotunplug may leave @cs w/o any execution * resources, in which case the hotplug code asynchronously updates @@ -1633,7 +1635,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, goto out_unlock; } - switch (cft->private) { + switch (of_cft(of)->private) { case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; @@ -1648,7 +1650,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); - return retval; + return retval ?: nbytes; } /* @@ -1750,7 +1752,7 @@ static struct cftype files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, - .write_string = cpuset_write_resmask, + .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, }, @@ -1758,7 +1760,7 @@ static struct cftype files[] = { { .name = "mems", .seq_show = cpuset_common_seq_show, - .write_string = cpuset_write_resmask, + .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, }, -- cgit v1.2.3 From 6770c64e5c8da4705d1f0973bdeb5c2bf4f3a404 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: replace cftype->trigger() with cftype->write() cftype->trigger() is pointless. It's trivial to ignore the input buffer from a regular ->write() operation. Convert all ->trigger() users to ->write() and remove ->trigger(). This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko --- kernel/cgroup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a88ce7b24b6..2f16aab03493 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1034,8 +1034,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write || - cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write) mode |= S_IWUSR; return mode; @@ -2750,8 +2749,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); - } else if (cft->trigger) { - ret = cft->trigger(css, (unsigned int)cft->private); } else { ret = -EINVAL; } -- cgit v1.2.3 From acbef755f40e204b8a6503fa79958d51a898762a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:22 -0400 Subject: cgroup: convert "tasks" and "cgroup.procs" handle to use cftype->write() cgroup_tasks_write() and cgroup_procs_write() are currently using cftype->write_u64(). This patch converts them to use cftype->write() instead. This allows access to the associated kernfs_open_file which will be necessary to implement the planned kernfs active protection manipulation for these files. This shifts buffer parsing to attach_task_by_pid() and makes it return @nbytes on success. Let's rename it to __cgroup_procs_write() to clearly indicate that this is a write handler implementation. This patch doesn't introduce any visible behavior changes. 
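The shape these handlers converge on is the kernfs write convention: the handler receives the raw, writable buffer, does its own trimming and parsing, and returns the number of bytes consumed on success or a negative errno, which is where the recurring "ret ?: nbytes" idiom comes from. A hypothetical handler of that shape, modeled loosely on the conversions in this series (example_apply_pid() is made up):

#include <linux/kernfs.h>
#include <linux/cgroup.h>
#include <linux/string.h>
#include <linux/kernel.h>

static int example_apply_pid(struct cgroup *cgrp, pid_t pid);   /* hypothetical */

static ssize_t example_pid_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
{
        pid_t pid;
        int ret;

        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
                return -EINVAL;

        ret = example_apply_pid(of_css(of)->cgroup, pid);

        /* 0 from the helper means success: report the whole write as consumed */
        return ret ?: nbytes;
}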
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2f16aab03493..9a48c117ebf1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2232,12 +2232,18 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, * function to attach either it or all tasks in its threadgroup. Will lock * cgroup_mutex and threadgroup. */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; + struct cgroup *cgrp = of_css(of)->cgroup; + pid_t pid; int ret; + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2305,7 +2311,7 @@ retry_find_task: put_task_struct(tsk); out_unlock_cgroup: mutex_unlock(&cgroup_mutex); - return ret; + return ret ?: nbytes; } /** @@ -2339,16 +2345,16 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 pid) +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, pid, false); + return __cgroup_procs_write(of, buf, nbytes, off, false); } -static int cgroup_procs_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 tgid) +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, tgid, true); + return __cgroup_procs_write(of, buf, nbytes, off, true); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, @@ -3953,7 +3959,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, - .write_u64 = cgroup_procs_write, + .write = cgroup_procs_write, .mode = S_IRUGO | S_IWUSR, }, { @@ -4002,7 +4008,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, - .write_u64 = cgroup_tasks_write, + .write = cgroup_tasks_write, .mode = S_IRUGO | S_IWUSR, }, { -- cgit v1.2.3 From b7fc5ad235936379fae67a9f7b50bb53487a1a3a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:22 -0400 Subject: cgroup: remove cgroup->control_kn Now that cgroup_subtree_control_write() has access to the associated kernfs_open_file and thus the kernfs_node, there's no need to cache it in cgroup->control_kn on creation. Remove cgroup->control_kn and use @of->kn directly. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9a48c117ebf1..94d259bcd2b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2580,7 +2580,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * active_ref protection. 
*/ cgroup_get(cgrp); - kernfs_break_active_protection(cgrp->control_kn); + kernfs_break_active_protection(of->kn); mutex_lock(&cgroup_tree_mutex); @@ -2697,7 +2697,7 @@ out_unlock: out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); out_unbreak: - kernfs_unbreak_active_protection(cgrp->control_kn); + kernfs_unbreak_active_protection(of->kn); cgroup_put(cgrp); return ret ?: nbytes; @@ -2887,9 +2887,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return ret; } - if (cft->seq_show == cgroup_subtree_control_show) - cgrp->control_kn = kn; - else if (cft->seq_show == cgroup_populated_show) + if (cft->seq_show == cgroup_populated_show) cgrp->populated_kn = kn; return 0; } -- cgit v1.2.3 From ba0f4d761503bd9ce4c7458b56bfd7c3fdb51e86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: reorganize cgroup_create() Reorganize cgroup_create() so that all paths share unlock out path. * All err_* labels are renamed to out_* as they're now shared by both success and failure paths. * @err renamed to @ret for the similar reason as above and so that it's more consistent with other functions. * cgroup memory allocation moved after locking so that freeing failed cgroup happens before unlocking. While this moves more code inside critical section, memory allocations inside cgroup locking are already pretty common and this is unlikely to make any noticeable difference. * While at it, replace a stray @parent->root dereference with @root. This reorganization will help simplifying locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 69 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 94d259bcd2b9..1d6106c3fb4e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4246,15 +4246,10 @@ static long cgroup_create(struct cgroup *parent, const char *name, { struct cgroup *cgrp; struct cgroup_root *root = parent->root; - int ssid, err; + int ssid, ret; struct cgroup_subsys *ss; struct kernfs_node *kn; - /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); - if (!cgrp) - return -ENOMEM; - mutex_lock(&cgroup_tree_mutex); /* @@ -4265,8 +4260,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, * don't get nasty surprises if we ever grow another caller. 
*/ if (!cgroup_lock_live_group(parent)) { - err = -ENODEV; - goto err_unlock_tree; + ret = -ENODEV; + goto out_unlock_tree; + } + + /* allocate the cgroup and its ID, 0 is reserved for the root */ + cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + if (!cgrp) { + ret = -ENOMEM; + goto out_unlock; } /* @@ -4275,15 +4277,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { - err = -ENOMEM; - goto err_unlock; + ret = -ENOMEM; + goto out_free_cgrp; } init_cgroup_housekeeping(cgrp); cgrp->parent = parent; cgrp->dummy_css.parent = &parent->dummy_css; - cgrp->root = parent->root; + cgrp->root = root; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4294,8 +4296,8 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { - err = PTR_ERR(kn); - goto err_free_id; + ret = PTR_ERR(kn); + goto out_free_id; } cgrp->kn = kn; @@ -4318,20 +4320,20 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = cgroup_kn_set_ugid(kn); - if (err) - goto err_destroy; + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; - err = cgroup_addrm_files(cgrp, cgroup_base_files, true); - if (err) - goto err_destroy; + ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (ret) + goto out_destroy; /* let's create and online css's */ for_each_subsys(ss, ssid) { if (parent->child_subsys_mask & (1 << ssid)) { - err = create_css(cgrp, ss); - if (err) - goto err_destroy; + ret = create_css(cgrp, ss); + if (ret) + goto out_destroy; } } @@ -4344,25 +4346,22 @@ static long cgroup_create(struct cgroup *parent, const char *name, kernfs_activate(kn); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - - return 0; + ret = 0; + goto out_unlock; -err_free_id: +out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); -err_unlock: +out_free_cgrp: + kfree(cgrp); +out_unlock: mutex_unlock(&cgroup_mutex); -err_unlock_tree: +out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); - kfree(cgrp); - return err; + return ret; -err_destroy: +out_destroy: cgroup_destroy_locked(cgrp); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - return err; + goto out_unlock; } static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, -- cgit v1.2.3 From b3bfd983ca94cf1393accc11e90123c83909babb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: collapse cgroup_create() into croup_mkdir() cgroup_mkdir() is the sole user of cgroup_create(). Let's collapse the latter into the former. This will help simplifying locking. While at it, remove now stale comment about inode locking. This patch doesn't introduce any functional changes. 
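The reorganized function is an instance of the usual kernel error-handling shape: take locks and resources in order, then let both the success and the failure paths fall through a single chain of out_* labels so there is exactly one unlock sequence to keep correct. A generic sketch of that shape, with all names hypothetical:

#include <linux/mutex.h>
#include <linux/slab.h>

struct example_obj { int id; };                         /* hypothetical */
static int example_register(struct example_obj *obj);   /* hypothetical */

static DEFINE_MUTEX(example_mutex);

static int example_create(void)
{
        struct example_obj *obj;
        int ret;

        mutex_lock(&example_mutex);

        obj = kzalloc(sizeof(*obj), GFP_KERNEL);
        if (!obj) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        ret = example_register(obj);
        if (ret)
                goto out_free;

        /* obj would be published/linked here */
        ret = 0;                        /* success joins the common unwind */
        goto out_unlock;

out_free:
        kfree(obj);
out_unlock:
        mutex_unlock(&example_mutex);
        return ret;
}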
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 52 +++++++++++++--------------------------------------- 1 file changed, 13 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1d6106c3fb4e..580d3484f97a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4235,30 +4235,24 @@ err_free_css: return err; } -/** - * cgroup_create - create a cgroup - * @parent: cgroup that will be parent of the new cgroup - * @name: name of the new cgroup - * @mode: mode to set on new cgroup - */ -static long cgroup_create(struct cgroup *parent, const char *name, - umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *cgrp; + struct cgroup *parent = parent_kn->priv, *cgrp; struct cgroup_root *root = parent->root; - int ssid, ret; struct cgroup_subsys *ss; struct kernfs_node *kn; - - mutex_lock(&cgroup_tree_mutex); + int ssid, ret; /* - * Only live parents can have children. Note that the liveliness - * check isn't strictly necessary because cgroup_mkdir() and - * cgroup_rmdir() are fully synchronized by i_mutex; however, do it - * anyway so that locking is contained inside cgroup proper and we - * don't get nasty surprises if we ever grow another caller. + * cgroup_mkdir() grabs cgroup_tree_mutex which nests outside + * kernfs active_ref and cgroup_create() already synchronizes + * properly against removal through cgroup_lock_live_group(). + * Break it before calling cgroup_create(). */ + cgroup_get(parent); + kernfs_break_active_protection(parent_kn); + mutex_lock(&cgroup_tree_mutex); if (!cgroup_lock_live_group(parent)) { ret = -ENODEV; goto out_unlock_tree; @@ -4357,6 +4351,8 @@ out_unlock: mutex_unlock(&cgroup_mutex); out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(parent_kn); + cgroup_put(parent); return ret; out_destroy: @@ -4364,28 +4360,6 @@ out_destroy: goto out_unlock; } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - struct cgroup *parent = parent_kn->priv; - int ret; - - /* - * cgroup_create() grabs cgroup_tree_mutex which nests outside - * kernfs active_ref and cgroup_create() already synchronizes - * properly against removal through cgroup_lock_live_group(). - * Break it before calling cgroup_create(). - */ - cgroup_get(parent); - kernfs_break_active_protection(parent_kn); - - ret = cgroup_create(parent, name, mode); - - kernfs_unbreak_active_protection(parent_kn); - cgroup_put(parent); - return ret; -} - /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. -- cgit v1.2.3 From ddab2b6e0e516098efe6d3b69585bb25b1408d61 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: grab cgroup_mutex earlier in cgroup_subtree_control_write() Move cgroup_lock_live_group() invocation upwards to right below cgroup_tree_mutex in cgroup_subtree_control_write(). This is to help the planned locking simplification. This doesn't make any userland-visible behavioral changes. 
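The reshuffle is about keeping the two mutexes' critical sections congruent: once the inner lock is taken immediately after the outer one, every bail-out point already holds both and can share a single unwind path, which helps the planned locking simplification the log mentions. A generic sketch of the ordering; the locks and the liveness check are hypothetical:

#include <linux/mutex.h>

struct example_obj;                                     /* hypothetical */
static bool example_is_live(struct example_obj *obj);   /* hypothetical */

static DEFINE_MUTEX(outer_mutex);       /* plays the role of a tree-level lock */
static DEFINE_MUTEX(inner_mutex);       /* plays the role of an object-level lock */

static int example_locked_op(struct example_obj *obj)
{
        int ret = 0;

        mutex_lock(&outer_mutex);
        mutex_lock(&inner_mutex);       /* taken right after the outer lock */

        if (!example_is_live(obj)) {
                ret = -ENODEV;
                goto out_unlock;
        }

        /* ... work that needs both locks ... */

out_unlock:
        mutex_unlock(&inner_mutex);
        mutex_unlock(&outer_mutex);
        return ret;
}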
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 580d3484f97a..8afddb1a1c6c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2583,6 +2583,10 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); mutex_lock(&cgroup_tree_mutex); + if (!cgroup_lock_live_group(cgrp)) { + ret = -ENODEV; + goto out_unlock_tree; + } for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { @@ -2606,6 +2610,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgroup_get(child); prepare_to_wait(&child->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); schedule(); finish_wait(&child->offline_waitq, &wait); @@ -2620,7 +2625,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, (cgrp->parent && !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { ret = -ENOENT; - goto out_unlock_tree; + goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->child_subsys_mask & (1 << ssid))) { @@ -2632,7 +2637,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgroup_for_each_live_child(child, cgrp) { if (child->child_subsys_mask & (1 << ssid)) { ret = -EBUSY; - goto out_unlock_tree; + goto out_unlock; } } } @@ -2640,12 +2645,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (!enable && !disable) { ret = 0; - goto out_unlock_tree; - } - - if (!cgroup_lock_live_group(cgrp)) { - ret = -ENODEV; - goto out_unlock_tree; + goto out_unlock; } /* -- cgit v1.2.3 From cfc79d5bec04cdf26cd207d3e73d8bd59fd780a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: move cgroup->kn->priv clearing to cgroup_rmdir() The ->priv field of a cgroup directory kernfs_node points back to the cgroup. This field is RCU cleared in cgroup_destroy_locked() for non-kernfs accesses from css_tryget_from_dir() and cgroupstats_build(). As these are only applicable to cgroups which finished creation successfully and fully initialized cgroups are always removed by cgroup_rmdir(), this can be safely moved to the end of cgroup_rmdir(). This will help simplifying cgroup locking and shouldn't introduce any behavior difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8afddb1a1c6c..b49e63d5386b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4546,17 +4546,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* remove @cgrp directory along with the base files */ mutex_unlock(&cgroup_mutex); - - /* - * There are two control paths which try to determine cgroup from - * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_online_from_dir(). Those are supported by RCU - * protecting clearing of cgrp->kn->priv backpointer, which should - * happen after all files under it have been removed. 
- */ kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - mutex_lock(&cgroup_mutex); return 0; @@ -4615,6 +4605,17 @@ static int cgroup_rmdir(struct kernfs_node *kn) mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); + + /* + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_online_from_dir(). Those are supported by RCU + * protecting clearing of cgrp->kn->priv backpointer, which should + * happen after all files under it have been removed. + */ + if (!ret) + RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); + cgroup_put(cgrp); return ret; } @@ -5174,7 +5175,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See destroy_locked() for details. + * protected for this access. See cgroup_rmdir() for details. */ cgrp = rcu_dereference(kn->priv); if (cgrp) -- cgit v1.2.3 From a9746d8da786bc79b3b4ae1baa0fbbc4b795c1b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: factor out cgroup_kn_lock_live() and cgroup_kn_unlock() cgroup_mkdir(), cgroup_rmdir() and cgroup_subtree_control_write() share the logic to break active protection so that they can grab cgroup_tree_mutex which nests above active protection and/or remove self. Factor out this logic into cgroup_kn_lock_live() and cgroup_kn_unlock(). This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 157 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 90 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b49e63d5386b..21739e481006 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1093,6 +1093,75 @@ static void cgroup_put(struct cgroup *cgrp) call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } +/** + * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper undoes cgroup_kn_lock_live() and should be invoked before + * the method finishes if locking succeeded. Note that once this function + * returns the cgroup returned by cgroup_kn_lock_live() may become + * inaccessible any time. If the caller intends to continue to access the + * cgroup, it should pin it before invoking this function. + */ +static void cgroup_kn_unlock(struct kernfs_node *kn) +{ + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); +} + +/** + * cgroup_kn_lock_live - locking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper is to be used by a cgroup kernfs method currently servicing + * @kn. It breaks the active protection, performs cgroup locking and + * verifies that the associated cgroup is alive. Returns the cgroup if + * alive; otherwise, %NULL. A successful return should be undone by a + * matching cgroup_kn_unlock() invocation. + * + * Any cgroup kernfs method implementation which requires locking the + * associated cgroup should use this helper. 
It avoids nesting cgroup + * locking under kernfs active protection and allows all kernfs operations + * including self-removal. + */ +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. cgroup liveliness check alone provides enough + * protection against removal. Ensure @cgrp stays accessible and + * break the active_ref protection. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + if (!cgroup_is_dead(cgrp)) + return cgrp; + + cgroup_kn_unlock(kn); + return NULL; +} + static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; @@ -2541,7 +2610,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; - struct cgroup *cgrp = of_css(of)->cgroup, *child; + struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret; @@ -2573,20 +2642,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } - /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs - * active_ref. cgroup_lock_live_group() already provides enough - * protection. Ensure @cgrp stays accessible and break the - * active_ref protection. - */ - cgroup_get(cgrp); - kernfs_break_active_protection(of->kn); - - mutex_lock(&cgroup_tree_mutex); - if (!cgroup_lock_live_group(cgrp)) { - ret = -ENODEV; - goto out_unlock_tree; - } + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) + return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { @@ -2610,14 +2668,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgroup_get(child); prepare_to_wait(&child->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); + cgroup_kn_unlock(of->kn); schedule(); finish_wait(&child->offline_waitq, &wait); cgroup_put(child); - ret = restart_syscall(); - goto out_unbreak; + return restart_syscall(); } /* unavailable or not enabled on the parent? */ @@ -2693,12 +2749,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, kernfs_activate(cgrp->kn); ret = 0; out_unlock: - mutex_unlock(&cgroup_mutex); -out_unlock_tree: - mutex_unlock(&cgroup_tree_mutex); -out_unbreak: - kernfs_unbreak_active_protection(of->kn); - cgroup_put(cgrp); + cgroup_kn_unlock(of->kn); return ret ?: nbytes; err_undo_css: @@ -4238,25 +4289,16 @@ err_free_css: static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { - struct cgroup *parent = parent_kn->priv, *cgrp; - struct cgroup_root *root = parent->root; + struct cgroup *parent, *cgrp; + struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; int ssid, ret; - /* - * cgroup_mkdir() grabs cgroup_tree_mutex which nests outside - * kernfs active_ref and cgroup_create() already synchronizes - * properly against removal through cgroup_lock_live_group(). - * Break it before calling cgroup_create(). 
- */ - cgroup_get(parent); - kernfs_break_active_protection(parent_kn); - mutex_lock(&cgroup_tree_mutex); - if (!cgroup_lock_live_group(parent)) { - ret = -ENODEV; - goto out_unlock_tree; - } + parent = cgroup_kn_lock_live(parent_kn); + if (!parent) + return -ENODEV; + root = parent->root; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); @@ -4348,11 +4390,7 @@ out_free_id: out_free_cgrp: kfree(cgrp); out_unlock: - mutex_unlock(&cgroup_mutex); -out_unlock_tree: - mutex_unlock(&cgroup_tree_mutex); - kernfs_unbreak_active_protection(parent_kn); - cgroup_put(parent); + cgroup_kn_unlock(parent_kn); return ret; out_destroy: @@ -4579,32 +4617,17 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) static int cgroup_rmdir(struct kernfs_node *kn) { - struct cgroup *cgrp = kn->priv; + struct cgroup *cgrp; int ret = 0; - /* - * This is self-destruction but @kn can't be removed while this - * callback is in progress. Let's break active protection. Once - * the protection is broken, @cgrp can be destroyed at any point. - * Pin it so that it stays accessible. - */ - cgroup_get(cgrp); - kernfs_break_active_protection(kn); - - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - - /* - * @cgrp might already have been destroyed while we're trying to - * grab the mutexes. - */ - if (!cgroup_is_dead(cgrp)) - ret = cgroup_destroy_locked(cgrp); + cgrp = cgroup_kn_lock_live(kn); + if (!cgrp) + return 0; + cgroup_get(cgrp); /* for @kn->priv clearing */ - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); + ret = cgroup_destroy_locked(cgrp); - kernfs_unbreak_active_protection(kn); + cgroup_kn_unlock(kn); /* * There are two control paths which try to determine cgroup from -- cgit v1.2.3 From e76ecaeef65c497153ceacf59c2e21c070d43f64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:23 -0400 Subject: cgroup: use cgroup_kn_lock_live() in other cgroup kernfs methods Make __cgroup_procs_write() and cgroup_release_agent_write() use cgroup_kn_lock_live() and cgroup_kn_unlock() instead of cgroup_lock_live_group(). This puts the operations under both cgroup_tree_mutex and cgroup_mutex protection without circular dependency from kernfs active protection. Also, this means that cgroup_mutex is no longer nested below kernfs active protection. There is no longer any place where the two locks interact. This leaves cgroup_lock_live_group() without any user. Removed. This will help simplifying cgroup locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 21739e481006..b7cd80845f6a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -386,23 +386,6 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the mutex should be later unlocked. On - * failure returns false with no lock held. - */ -static bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_dead(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} - /* the list of cgroups eligible for automatic release. 
Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -2306,14 +2289,15 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; - struct cgroup *cgrp = of_css(of)->cgroup; + struct cgroup *cgrp; pid_t pid; int ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; - if (!cgroup_lock_live_group(cgrp)) + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; retry_find_task: @@ -2379,7 +2363,7 @@ retry_find_task: put_task_struct(tsk); out_unlock_cgroup: - mutex_unlock(&cgroup_mutex); + cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -2429,17 +2413,18 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup *cgrp = of_css(of)->cgroup; - struct cgroup_root *root = cgrp->root; + struct cgroup *cgrp; - BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); - if (!cgroup_lock_live_group(cgrp)) + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(root->release_agent_path, strstrip(buf), - sizeof(root->release_agent_path)); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); - mutex_unlock(&cgroup_mutex); + cgroup_kn_unlock(of->kn); return nbytes; } -- cgit v1.2.3 From 01f6474ce04fffd6282b569ac0a31f4b98d4c82a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:23 -0400 Subject: cgroup: nest kernfs active protection under cgroup_mutex After the recent cgroup_kn_lock_live() changes, cgroup_mutex is no longer nested below kernfs active protection. The two don't have any relationship now. This patch nests kernfs active protection under cgroup_mutex. All cftype operations now require both cgroup_tree_mutex and cgroup_mutex, temporary cgroup_mutex releases over kernfs operations are removed, and cgroup_add/rm_cftypes() grab both mutexes. This makes cgroup_tree_mutex redundant, which will be removed by the next patch. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b7cd80845f6a..bf1d7ce250ac 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1127,7 +1127,7 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) cgrp = kn->parent->priv; /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. cgroup liveliness check alone provides enough * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. @@ -1150,6 +1150,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1216,11 +1217,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. 
*/ - mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) if (ss_mask & (1 << ssid)) cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - mutex_lock(&cgroup_mutex); for_each_subsys(ss, ssid) { struct cgroup_root *src_root; @@ -2946,6 +2945,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], int ret; lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2981,6 +2981,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) int ret = 0; lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { @@ -3049,6 +3050,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) static int cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); if (!cfts || !cfts[0].ss) return -ENOENT; @@ -3075,7 +3077,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) int ret; mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -3106,12 +3110,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return ret; mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -4445,6 +4451,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* * This must happen before css is disassociated with its cgroup. @@ -4544,13 +4551,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Initiate massacre of all css's. cgroup_destroy_css_killed() * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. This - * involves removing the subsystem's files, drop cgroup_mutex. + * percpu refs of all css's are confirmed to be killed. */ - mutex_unlock(&cgroup_mutex); for_each_css(css, ssid, cgrp) kill_css(css); - mutex_lock(&cgroup_mutex); /* CGRP_DEAD is set, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); @@ -4567,10 +4571,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (!cgrp->nr_css) cgroup_destroy_css_killed(cgrp); - /* remove @cgrp directory along with the base files */ - mutex_unlock(&cgroup_mutex); - kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ - mutex_lock(&cgroup_mutex); + /* + * Remove @cgrp directory along with the base files. @cgrp has an + * extra ref on its kn. 
+ */ + kernfs_remove(cgrp->kn); return 0; }; -- cgit v1.2.3 From 8353da1f91f12a3079ecc849226f371242d2807c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:23 -0400 Subject: cgroup: remove cgroup_tree_mutex cgroup_tree_mutex was introduced to work around the circular dependency between cgroup_mutex and kernfs active protection - some kernfs file and directory operations needed cgroup_mutex putting cgroup_mutex under active protection but cgroup also needs to be able to access cgroup hierarchies and cftypes to determine which kernfs_nodes need to be removed. cgroup_tree_mutex nested above both cgroup_mutex and kernfs active protection and used to protect the hierarchy and cftypes. While this worked, it added a lot of double lockings and was generally cumbersome. kernfs provides a mechanism to opt out of active protection and cgroup was already using it for removal and subtree_control. There's no reason to mix both methods of avoiding circular locking dependency and the preceding cgroup_kn_lock_live() changes applied it to all relevant cgroup kernfs operations making it unnecessary to nest cgroup_mutex under kernfs active protection. The previous patch reversed the original lock ordering and put cgroup_mutex above kernfs active protection. After these changes, all cgroup_tree_mutex usages are now accompanied by cgroup_mutex making the former completely redundant. This patch removes cgroup_tree_mutex and all its usages. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 64 ++++++++------------------------------------------------- 1 file changed, 9 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bf1d7ce250ac..457e52705f56 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -70,15 +70,6 @@ #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) -/* - * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file - * creation/removal and hierarchy changing operations including cgroup - * creation, removal, css association and controller rebinding. This outer - * lock is needed mainly to resolve the circular dependency between kernfs - * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. - */ -static DEFINE_MUTEX(cgroup_tree_mutex); - /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. 
@@ -111,11 +102,10 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); -#define cgroup_assert_mutexes_or_rcu_locked() \ +#define cgroup_assert_mutex_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex), \ - "cgroup_[tree_]mutex or RCU read lock required"); + "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot @@ -243,7 +233,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, { if (ss) return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_tree_mutex) || lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; @@ -347,7 +336,6 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ - lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex)))) { } \ else @@ -381,7 +369,7 @@ static int notify_on_release(const struct cgroup *cgrp) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->children, sibling) \ - if (({ lockdep_assert_held(&cgroup_tree_mutex); \ + if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else @@ -869,7 +857,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); BUG_ON(atomic_read(&root->nr_cgrps)); @@ -899,7 +886,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); @@ -1096,7 +1082,6 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) cgrp = kn->parent->priv; mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); @@ -1135,7 +1120,6 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) cgroup_get(cgrp); kernfs_break_active_protection(kn); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); if (!cgroup_is_dead(cgrp)) @@ -1149,7 +1133,6 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1179,7 +1162,6 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) struct cgroup_subsys *ss; int ssid, i, ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); for_each_subsys(ss, ssid) { @@ -1457,7 +1439,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* See what subsystems are wanted */ @@ -1503,7 +1484,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) kfree(opts.release_agent); kfree(opts.name); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -1606,7 +1586,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) struct css_set *cset; int i, ret; - lockdep_assert_held(&cgroup_tree_mutex); 
lockdep_assert_held(&cgroup_mutex); ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); @@ -1696,7 +1675,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ @@ -1761,9 +1739,7 @@ retry: */ if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); msleep(10); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); goto retry; } @@ -1796,7 +1772,6 @@ retry: out_unlock: mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kfree(opts.release_agent); kfree(opts.name); @@ -2507,7 +2482,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct css_set *src_cset; int ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* look up all csses currently attached to @cgrp's subtree */ @@ -2866,20 +2840,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return -EPERM; /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_tree_mutex. + * protection. Break them before grabbing cgroup_mutex. */ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); @@ -2944,7 +2916,6 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { @@ -2980,7 +2951,6 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) struct cgroup_subsys_state *css; int ret = 0; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ @@ -3049,7 +3019,6 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) static int cgroup_rm_cftypes_locked(struct cftype *cfts) { - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!cfts || !cfts[0].ss) @@ -3076,11 +3045,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) { int ret; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -3109,7 +3076,6 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (ret) return ret; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); list_add_tail(&cfts->node, &ss->cfts); @@ -3118,7 +3084,6 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) cgroup_rm_cftypes_locked(cfts); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -3158,7 +3123,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been removed. 
Once a cgroup is removed, @@ -3224,7 +3189,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -3264,7 +3229,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); do { last = pos; @@ -3311,7 +3276,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -4178,7 +4143,6 @@ static int online_css(struct cgroup_subsys_state *css) struct cgroup_subsys *ss = css->ss; int ret = 0; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (ss->css_online) @@ -4196,7 +4160,6 @@ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) @@ -4399,7 +4362,6 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* @@ -4417,7 +4379,6 @@ static void css_killed_work_fn(struct work_struct *work) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); /* * Put the css refs from kill_css(). Each css holds an extra @@ -4450,7 +4411,6 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* @@ -4510,7 +4470,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) bool empty; int ssid; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* @@ -4593,7 +4552,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ @@ -4647,7 +4605,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); idr_init(&ss->css_idr); @@ -4685,7 +4642,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) cgrp_dfl_root.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); } /** @@ -4735,7 +4691,6 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ @@ -4745,7 +4700,6 @@ int __init cgroup_init(void) BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { if (ss->early_init) { -- cgit v1.2.3 From 4982223e51e8ea9d09bb33c8323b5ec1877b2b51 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 14 May 2014 10:54:19 +0930 Subject: module: set nx before marking module MODULE_STATE_COMING. 
We currently set RO & NX on modules very late: after we move them from MODULE_STATE_UNFORMED to MODULE_STATE_COMING, and after we call parse_args() (which can exec code in the module). Much better is to do it in complete_formation() and then call the notifier. This means that the notifiers will be called on a module which is already RO & NX, so that may cause problems (ftrace already changed so they're unaffected). Signed-off-by: Rusty Russell --- kernel/module.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 66e4e0d260a9..f944c72d3c8a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3023,21 +3023,6 @@ static int do_init_module(struct module *mod) */ current->flags &= ~PF_USED_ASYNC; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); - - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3168,9 +3153,26 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ mod->state = MODULE_STATE_COMING; + mutex_unlock(&module_mutex); + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + return 0; out: mutex_unlock(&module_mutex); -- cgit v1.2.3 From 29dedee0e693aa113164c820395ce51446a71ace Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 May 2014 16:38:18 +0200 Subject: uprobes: Add mem_cgroup_charge_anon() into uprobe_write_opcode() Hugh says: The one I noticed was that it forgets all about memcg (because it was copied from KSM, and there the replacement page has already been charged to a memcg). See how mm/memory.c do_anonymous_page() does a mem_cgroup_charge_anon(). Hopefully not a big problem, uprobes is a system-wide thing and only root can insert the probes. But I agree, should be fixed anyway. Add mem_cgroup_{un,}charge_anon() into uprobe_write_opcode(). To simplify the error handling (and avoid the new "uncharge" label) the patch also moves anon_vma_prepare() up before we alloc/charge the new page. While at it fix the comment about ->mmap_sem, it is held for write. Suggested-by: Hugh Dickins Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7716c40f2c50..a13251e8bfa4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -279,18 +279,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. 
- */ - -/* + * * uprobe_write_opcode - write the opcode at a given virtual address. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_sem held (for read and with a reference to - * mm). - * - * For mm @mm, write the opcode at @vaddr. + * Called with mm->mmap_sem held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, @@ -310,21 +305,25 @@ retry: if (ret <= 0) goto put_old; + ret = anon_vma_prepare(vma); + if (ret) + goto put_old; + ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) goto put_old; - __SetPageUptodate(new_page); + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) + goto put_new; + __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); - ret = anon_vma_prepare(vma); - if (ret) - goto put_new; - ret = __replace_page(vma, vaddr, old_page, new_page); + if (ret) + mem_cgroup_uncharge_page(new_page); put_new: page_cache_release(new_page); -- cgit v1.2.3 From b02ef20a9fba08948e643d3eec0efadf1da01a44 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 12 May 2014 18:24:45 +0200 Subject: uprobes/x86: Fix the wrong ->si_addr when xol triggers a trap If the probed insn triggers a trap, ->si_addr = regs->ip is technically correct, but this is not what the signal handler wants; we need to pass the address of the probed insn, not the address of xol slot. Add the new arch-agnostic helper, uprobe_get_trap_addr(), and change fill_trap_info() and math_error() to use it. !CONFIG_UPROBES case in uprobes.h uses a macro to avoid include hell and ensure that it can be compiled even if an architecture doesn't define instruction_pointer(). Test-case: #include #include #include extern void probe_div(void); void sigh(int sig, siginfo_t *info, void *c) { int passed = (info->si_addr == probe_div); printf(passed ? "PASS\n" : "FAIL\n"); _exit(!passed); } int main(void) { struct sigaction sa = { .sa_sigaction = sigh, .sa_flags = SA_SIGINFO, }; sigaction(SIGFPE, &sa, NULL); asm ( "xor %ecx,%ecx\n" ".globl probe_div; probe_div:\n" "idiv %ecx\n" ); return 0; } it fails if probe_div() is probed. Note: show_unhandled_signals users should probably use this helper too, but we need to cleanup them first. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu --- kernel/events/uprobes.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a13251e8bfa4..3b02c72938a8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1351,6 +1351,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } +unsigned long uprobe_get_trap_addr(struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (unlikely(utask && utask->active_uprobe)) + return utask->vaddr; + + return instruction_pointer(regs); +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. -- cgit v1.2.3 From a015edd26e28afe225cdd04f25794bd2b3bbe2da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:00 -0400 Subject: cgroup: use restart_syscall() for mount retries cgroup_mount() uses dumb delay-and-retry logic to wait for cgroup_root which is being destroyed. The retry currently loops inside cgroup_mount() proper. 
This patch makes it return with restart_syscall() instead so that retry travels out to userland boundary. This slightly simplifies the logic and more importantly makes the retry logic behave better when the wait for some reason becomes lengthy or infinite by allowing the operation to be suspended or terminated from userland. v2: The original patch forgot to free memory allocated for @opts. Fixed. Caught by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 457e52705f56..f36fd9c15b3a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1681,7 +1681,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, ret = parse_cgroupfs_options(data, &opts); if (ret) goto out_unlock; -retry: + /* look for a matching existing root */ if (!opts.subsys_mask && !opts.none && !opts.name) { cgrp_dfl_root_visible = true; @@ -1740,8 +1740,8 @@ retry: if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { mutex_unlock(&cgroup_mutex); msleep(10); - mutex_lock(&cgroup_mutex); - goto retry; + ret = restart_syscall(); + goto out_free; } ret = 0; @@ -1772,7 +1772,7 @@ retry: out_unlock: mutex_unlock(&cgroup_mutex); - +out_free: kfree(opts.release_agent); kfree(opts.name); -- cgit v1.2.3 From 9d800df12d31734a6853915e9d2deb5d6747985f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:00 -0400 Subject: cgroup: rename cgroup->dummy_css to ->self and move it to the top cgroup->dummy_css is used as the placeholder css when performing css oriended operations on the cgroup. We're gonna shift more cgroup management to this css. Let's rename it to ->self and move it to the top. This is pure rename and field relocation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f36fd9c15b3a..b57a949ae4bc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -220,7 +220,7 @@ static void cgroup_idr_remove(struct idr *idr, int id) /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. 
This * function must be called either under cgroup_mutex or rcu_read_lock() and @@ -235,13 +235,13 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else - return &cgrp->dummy_css; + return &cgrp->self; } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effctive css, which is defined * as the matching css of the nearest ancestor including self which has @ss @@ -254,7 +254,7 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, lockdep_assert_held(&cgroup_mutex); if (!ss) - return &cgrp->dummy_css; + return &cgrp->self; if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; @@ -288,7 +288,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) if (cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else - return &cgrp->dummy_css; + return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); @@ -1551,7 +1551,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); - cgrp->dummy_css.cgroup = cgrp; + cgrp->self.cgroup = cgrp; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -3454,7 +3454,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. */ do { - css_task_iter_start(&from->dummy_css, &it); + css_task_iter_start(&from->self, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); @@ -3719,7 +3719,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -3793,7 +3793,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: @@ -4274,7 +4274,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, init_cgroup_housekeeping(cgrp); cgrp->parent = parent; - cgrp->dummy_css.parent = &parent->dummy_css; + cgrp->self.parent = &parent->self; cgrp->root = root; if (notify_on_release(parent)) -- cgit v1.2.3 From cbc125efada6d8c2555dd35e938694eb9b7cd791 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: separate out cgroup_has_live_children() from cgroup_destroy_locked() We're expecting another user. 
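The helper is an RCU-protected walk of ->children that skips cgroups already marked dead, so callers no longer open-code the loop. Intended usage is simply the check below (the -EBUSY case is the existing caller in cgroup_destroy_locked(); the next patch adds check_for_release() as the second user):

        if (cgroup_has_live_children(cgrp))
                return -EBUSY;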
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b57a949ae4bc..3f5f48130b6b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3295,6 +3295,21 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return css_parent(pos); } +static bool cgroup_has_live_children(struct cgroup *cgrp) +{ + struct cgroup *child; + + rcu_read_lock(); + list_for_each_entry_rcu(child, &cgrp->children, sibling) { + if (!cgroup_is_dead(child)) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + return false; +} + /** * css_advance_task_iter - advance a task itererator to the next css_set * @it: the iterator to advance @@ -4465,7 +4480,6 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *child; struct cgroup_subsys_state *css; bool empty; int ssid; @@ -4487,15 +4501,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * emptiness as dead children linger on it while being destroyed; * otherwise, "rmdir parent/child parent" may fail with -EBUSY. */ - empty = true; - rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->children, sibling) { - empty = cgroup_is_dead(child); - if (!empty) - break; - } - rcu_read_unlock(); - if (!empty) + if (cgroup_has_live_children(cgrp)) return -EBUSY; /* -- cgit v1.2.3 From 9e4173e1f24fa3bd562f13b92ee34c7dfb1db7c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: move check_for_release(parent) call to the end of cgroup_destroy_locked() Currently, check_for_release() on the parent of a destroyed cgroup is invoked from cgroup_destroy_css_killed(). This is because this is where the destroyed cgroup can be removed from the parent's children list. check_for_release() tests the emptiness of the list directly, so invoking it before removing the cgroup from the list makes it think that the parent still has children even when it no longer does. This patch updates check_for_release() to use cgroup_has_live_children() instead of directly testing ->children emptiness and moves check_for_release(parent) earlier to the end of cgroup_destroy_locked(). As cgroup_has_live_children() ignores cgroups marked DEAD, check_for_release() functions correctly as long as it's called after asserting DEAD. This makes release notification slightly more timely and more importantly enables further simplification of cgroup destruction path. 
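Concretely, the tail of cgroup_destroy_locked() now reads as follows (from the diff below); because @cgrp already carries CGRP_DEAD at this point, cgroup_has_live_children() ignores it and the parent's check is safe to run before the child is unlinked:

        kernfs_remove(cgrp->kn);        /* @cgrp has an extra ref on its kn */

        set_bit(CGRP_RELEASABLE, &cgrp->parent->flags);
        check_for_release(cgrp->parent);

        return 0;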
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f5f48130b6b..061569fba245 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4542,6 +4542,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); + set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); + check_for_release(cgrp->parent); + return 0; }; @@ -4556,17 +4559,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { - struct cgroup *parent = cgrp->parent; - lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); cgroup_put(cgrp); - - set_bit(CGRP_RELEASABLE, &parent->flags); - check_for_release(parent); } static int cgroup_rmdir(struct kernfs_node *kn) @@ -5006,7 +5004,7 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { if (cgroup_is_releasable(cgrp) && - list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { + list_empty(&cgrp->cset_links) && !cgroup_has_live_children(cgrp)) { /* * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue -- cgit v1.2.3 From 4e4e28472365f8c7a7c55f6b5706f68bc40c5b13 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: move cgroup->sibling unlinking to cgroup_put() Move cgroup->sibling unlinking from cgroup_destroy_css_killed() to cgroup_put(). This is later but still before the RCU grace period, so it doesn't break css_next_child() although there now is a larger window in which a dead cgroup is visible during css iteration. As css iteration always could have included offline csses, this doesn't affect correctness; however, it does make css_next_child() fall back to reiterting mode more often. This also makes cgroup_put() directly take cgroup_mutex, which limits where it can be called from. These are not immediately problematic and will be dealt with later. This change enables simplification of cgroup destruction path. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 061569fba245..e9aa2a51ca68 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1056,6 +1056,11 @@ static void cgroup_put(struct cgroup *cgrp) if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) return; + /* delete this cgroup from parent->children */ + mutex_lock(&cgroup_mutex); + list_del_rcu(&cgrp->sibling); + mutex_unlock(&cgroup_mutex); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4561,9 +4566,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { lockdep_assert_held(&cgroup_mutex); - /* delete this cgroup from parent->children */ - list_del_rcu(&cgrp->sibling); - cgroup_put(cgrp); } -- cgit v1.2.3 From 249f3468a282dcbad53484c821bebb447f14ee03 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: remove cgroup_destory_css_killed() cgroup_destroy_css_killed() is cgroup destruction stage which happens after all csses are offlined. After the recent updates, it no longer does anything other than putting the base reference. This patch removes the function and makes cgroup_destroy_locked() put the base ref at the end isntead. 
This also makes cgroup->nr_css unnecessary. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 62 ++++++--------------------------------------------------- 1 file changed, 6 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9aa2a51ca68..4a94b0be598d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -178,7 +178,6 @@ static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); -static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); static void kill_css(struct cgroup_subsys_state *css); @@ -4169,7 +4168,6 @@ static int online_css(struct cgroup_subsys_state *css) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; - css->cgroup->nr_css++; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; @@ -4189,7 +4187,6 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; - css->cgroup->nr_css--; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); @@ -4374,39 +4371,18 @@ out_destroy: /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget_online() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. Tell the subsystem to + * initate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); - struct cgroup *cgrp = css->cgroup; mutex_lock(&cgroup_mutex); - - /* - * css_tryget_online() is guaranteed to fail now. Tell subsystems - * to initate destruction. - */ offline_css(css); - - /* - * If @cgrp is marked dead, it's waiting for refs of all css's to - * be disabled before proceeding to the second phase of cgroup - * destruction. If we are the last one, kick it off. - */ - if (!cgrp->nr_css && cgroup_is_dead(cgrp)) - cgroup_destroy_css_killed(cgrp); - mutex_unlock(&cgroup_mutex); - /* - * Put the css refs from kill_css(). Each css holds an extra - * reference to the cgroup's dentry and cgroup removal proceeds - * regardless of css refs. On the last put of each css, whenever - * that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ css_put(css); } @@ -4518,11 +4494,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ set_bit(CGRP_DEAD, &cgrp->flags); - /* - * Initiate massacre of all css's. cgroup_destroy_css_killed() - * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. - */ + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); @@ -4532,15 +4504,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - /* - * If @cgrp has css's attached, the second stage of cgroup - * destruction is kicked off from css_killed_work_fn() after the - * refs of all attached css's are killed. If @cgrp doesn't have - * any css, we kick it off here. - */ - if (!cgrp->nr_css) - cgroup_destroy_css_killed(cgrp); - /* * Remove @cgrp directory along with the base files. @cgrp has an * extra ref on its kn. 
@@ -4550,25 +4513,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); check_for_release(cgrp->parent); + /* put the base reference */ + cgroup_put(cgrp); + return 0; }; -/** - * cgroup_destroy_css_killed - the second step of cgroup destruction - * @cgrp: the cgroup whose csses have just finished offlining - * - * This function is invoked from a work item for a cgroup which is being - * destroyed after all css's are offlined and performs the rest of - * destruction. This is the second step of destruction described in the - * comment above cgroup_destroy_locked(). - */ -static void cgroup_destroy_css_killed(struct cgroup *cgrp) -{ - lockdep_assert_held(&cgroup_mutex); - - cgroup_put(cgrp); -} - static int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; -- cgit v1.2.3 From 25e15d835036a70a53dcc993beaa036f8919a373 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: bounce css release through css->destroy_work css release is planned to do more and would require process context. Bounce it through css->destroy_work. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4a94b0be598d..e694f4153edb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4126,10 +4126,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void css_release(struct percpu_ref *ref) +static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = - container_of(ref, struct cgroup_subsys_state, refcnt); + container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; cgroup_idr_remove(&ss->css_idr, css->id); @@ -4137,6 +4137,15 @@ static void css_release(struct percpu_ref *ref) call_rcu(&css->rcu_head, css_free_rcu_fn); } +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + INIT_WORK(&css->destroy_work, css_release_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); +} + static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { -- cgit v1.2.3 From 9395a4500404e05173eda9a2d198b6fa500e90c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: enable refcnting for root csses Currently, css_get(), css_tryget() and css_tryget_online() are noops for root csses as an optimization; however, we're planning to use css refcnts to track of cgroup lifetime too and root cgroups also need to be reference counted. Since css has been converted to percpu_refcnt, the overhead of refcnting is miniscule and this optimization isn't too meaningful anymore. Furthermore, controllers which optimize the root cgroup often never even invoke these functions in their hot paths. This patch enables refcnting for root csses too. This makes CSS_ROOT flag unused and removes it. 
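For reference, the short-circuit being dropped lives in the css refcnt helpers in include/linux/cgroup.h, outside this kernel/ diff; before this change css_get() looks roughly like the sketch below, and afterwards the CSS_ROOT test is gone and the percpu ref is taken unconditionally:

        static inline void css_get(struct cgroup_subsys_state *css)
        {
                /* root csses were skipped here; after this patch the
                 * percpu_ref_get() runs for every css, root included */
                if (!(css->flags & CSS_ROOT))
                        percpu_ref_get(&css->refcnt);
        }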
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e694f4153edb..cb5864e36f99 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4158,8 +4158,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, if (cgrp->parent) { css->parent = cgroup_css(cgrp->parent, ss); css_get(css->parent); - } else { - css->flags |= CSS_ROOT; } BUG_ON(cgroup_css(cgrp, ss)); @@ -4582,9 +4580,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); if (early) { - /* idr_alloc() can't be called safely during early init */ + /* allocation can't be done safely during early init */ css->id = 1; } else { + BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } @@ -4671,6 +4670,7 @@ int __init cgroup_init(void) struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; + BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); -- cgit v1.2.3 From 9d755d33f0db8c9b49438f71b38a56e375b34360 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: use cgroup->self.refcnt for cgroup refcnting Currently cgroup implements refcnting separately using atomic_t cgroup->refcnt. The destruction paths of cgroup and css are rather complex and bear a lot of similiarities including the use of RCU and bouncing to a work item. This patch makes cgroup use the refcnt of self css for refcnting instead of using its own. This makes cgroup refcnting use css's percpu refcnt and share the destruction mechanism. * css_release_work_fn() and css_free_work_fn() are updated to handle both csses and cgroups. This is a bit messy but should do until we can make cgroup->self a full css, which currently can't be done thanks to multiple hierarchies. * cgroup_destroy_locked() now performs percpu_ref_kill(&cgrp->self.refcnt) instead of cgroup_put(cgrp). * Negative refcnt sanity check in cgroup_get() is no longer necessary as percpu_ref already handles it. * Similarly, as a cgroup which hasn't been killed will never be released regardless of its refcnt value and percpu_ref has sanity check on kill, cgroup_is_dead() sanity check in cgroup_put() is no longer necessary. * As whether a refcnt reached zero or not can only be decided after the reference count is killed, cgroup_root->cgrp's refcnting can no longer be used to decide whether to kill the root or not. Let's make cgroup_kill_sb() explicitly initiate destruction if the root doesn't have any children. This makes sense anyway as unmounted cgroup hierarchy without any children should be destroyed. While this is a bit messy, this will allow pushing more bookkeeping towards cgroup->self and thus handling cgroups and csses in more uniform way. In the very long term, it should be possible to introduce a base subsystem and convert the self css to a proper one making things whole lot simpler and unified. 
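A minimal illustration of the sharing described above, with made-up toy types rather than the kernel's structures: a single release function can serve both csses and cgroups by checking whether the state belongs to a subsystem (css->ss set) or is a cgroup's self css (ss == NULL), which is the branch the patch below adds to the work functions.

#include <stdio.h>
#include <stddef.h>

struct toy_ss;                          /* stands in for cgroup_subsys */

struct toy_css {
        struct toy_ss *ss;              /* NULL for a cgroup's self css */
};

/* one callback handles both object kinds, as css_release_work_fn() does */
void toy_release(struct toy_css *css)
{
        if (css->ss)
                printf("css path: drop the subsystem's state\n");
        else
                printf("cgroup path: unlink the cgroup, free its id\n");
}

int main(void)
{
        struct toy_css self_css = { .ss = NULL };

        toy_release(&self_css);         /* takes the cgroup path */
        return 0;
}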
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 146 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 80 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb5864e36f99..c01e8e8dfad0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -176,10 +176,12 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); +static bool cgroup_has_live_children(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -1008,62 +1010,15 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static void cgroup_free_fn(struct work_struct *work) -{ - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - - atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); - - if (cgrp->parent) { - /* - * We get a ref to the parent, and put the ref when this - * cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. - */ - cgroup_put(cgrp->parent); - kernfs_put(cgrp->kn); - kfree(cgrp); - } else { - /* - * This is root cgroup's refcnt reaching zero, which - * indicates that the root should be released. - */ - cgroup_destroy_root(cgrp->root); - } -} - -static void cgroup_free_rcu(struct rcu_head *head) -{ - struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - - INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - queue_work(cgroup_destroy_wq, &cgrp->destroy_work); -} - static void cgroup_get(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); - WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); - atomic_inc(&cgrp->refcnt); + css_get(&cgrp->self); } static void cgroup_put(struct cgroup *cgrp) { - if (!atomic_dec_and_test(&cgrp->refcnt)) - return; - if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) - return; - - /* delete this cgroup from parent->children */ - mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->sibling); - mutex_unlock(&cgroup_mutex); - - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); + css_put(&cgrp->self); } /** @@ -1548,7 +1503,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) struct cgroup_subsys *ss; int ssid; - atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->cset_links); @@ -1597,6 +1551,10 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + if (ret) + goto out; + /* * We're accessing css_set_count without locking css_set_rwsem here, * but that's OK - it can only be increased by someone holding @@ -1605,11 +1563,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto out; + goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) - goto out; + goto cancel_ref; root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED, @@ -1657,6 +1615,8 @@ 
destroy_root: root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); +cancel_ref: + percpu_ref_cancel_init(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; @@ -1735,13 +1695,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } /* - * A root's lifetime is governed by its root cgroup. Zero - * ref indicate that the root is being destroyed. Wait for - * destruction to complete so that the subsystems are free. - * We can use wait_queue for the wait but this path is - * super cold. Let's just sleep for a bit and retry. + * A root's lifetime is governed by its root cgroup. + * tryget_live failure indicate that the root is being + * destroyed. Wait for destruction to complete so that the + * subsystems are free. We can use wait_queue for the wait + * but this path is super cold. Let's just sleep for a bit + * and retry. */ - if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { + if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); msleep(10); ret = restart_syscall(); @@ -1794,7 +1755,16 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - cgroup_put(&root->cgrp); + /* + * If @root doesn't have any mounts or children, start killing it. + * This prevents new mounts by disabling percpu_ref_tryget_live(). + * cgroup_mount() may wait for @root's release. + */ + if (cgroup_has_live_children(&root->cgrp)) + cgroup_put(&root->cgrp); + else + percpu_ref_kill(&root->cgrp.self.refcnt); + kernfs_kill_sb(sb); } @@ -4110,11 +4080,37 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - if (css->parent) - css_put(css->parent); + if (css->ss) { + /* css free path */ + if (css->parent) + css_put(css->parent); - css->ss->css_free(css); - cgroup_put(cgrp); + css->ss->css_free(css); + cgroup_put(cgrp); + } else { + /* cgroup free path */ + atomic_dec(&cgrp->root->nr_cgrps); + cgroup_pidlist_destroy_all(cgrp); + + if (cgrp->parent) { + /* + * We get a ref to the parent, and put the ref when + * this cgroup is being freed, so it's guaranteed + * that the parent won't be destroyed before its + * children. + */ + cgroup_put(cgrp->parent); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is root cgroup's refcnt reaching zero, + * which indicates that the root should be + * released. + */ + cgroup_destroy_root(cgrp->root); + } + } } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4131,8 +4127,20 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; - cgroup_idr_remove(&ss->css_idr, css->id); + if (ss) { + /* css release path */ + cgroup_idr_remove(&ss->css_idr, css->id); + } else { + /* cgroup release path */ + mutex_lock(&cgroup_mutex); + list_del_rcu(&cgrp->sibling); + mutex_unlock(&cgroup_mutex); + + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + } call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4285,6 +4293,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } + ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + if (ret) + goto out_free_cgrp; + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. 
@@ -4292,7 +4304,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_free_cgrp; + goto out_cancel_ref; } init_cgroup_housekeeping(cgrp); @@ -4365,6 +4377,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_cancel_ref: + percpu_ref_cancel_init(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); out_unlock: @@ -4521,7 +4535,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) check_for_release(cgrp->parent); /* put the base reference */ - cgroup_put(cgrp); + percpu_ref_kill(&cgrp->self.refcnt); return 0; }; -- cgit v1.2.3 From 66209a5bd4825e8890bfb65d48efa8a47c647fea Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 6 May 2014 21:57:49 -0400 Subject: ftrace: Remove boolean of hash_enable and hash_disable Commit 4104d326b670 "ftrace: Remove global function list and call function directly" cleaned up the global_ops filtering and made the code simpler, but it left a variable "hash_enable" that was used to know if the hash functions should be updated or not. It was updated if the global_ops did not override them. As the global_ops are now no different than any other ftrace_ops, the hash always gets updated and there's no reason to use the hash_enable boolean. The same goes for hash_disable used in ftrace_shutdown(). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 38e5cf73b9ae..2c99d1f7caf1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2042,7 +2042,6 @@ static void ftrace_startup_enable(int command) static int ftrace_startup(struct ftrace_ops *ops, int command) { - bool hash_enable = true; int ret; if (unlikely(ftrace_disabled)) @@ -2056,8 +2055,8 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) command |= FTRACE_UPDATE_CALLS; ops->flags |= FTRACE_OPS_FL_ENABLED; - if (hash_enable) - ftrace_hash_rec_enable(ops, 1); + + ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); @@ -2066,7 +2065,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) static int ftrace_shutdown(struct ftrace_ops *ops, int command) { - bool hash_disable = true; int ret; if (unlikely(ftrace_disabled)) @@ -2084,8 +2082,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); - if (hash_disable) - ftrace_hash_rec_disable(ops, 1); + ftrace_hash_rec_disable(ops, 1); if (!global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; -- cgit v1.2.3 From 19eab4a472cfe4a3ae51cff1711d795e3f9bb564 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 7 May 2014 15:06:14 -0400 Subject: ftrace: Write in missing comment from a very old commit Back in 2011 Commit ed926f9b35cda "ftrace: Use counters to enable functions to trace" changed the way ftrace accounts for enabled and disabled traced functions. There was a comment started as: /* * */ But never finished. Well, that's rather useless. I probably forgot to save the file before committing it. And it passed review from all this time. Anyway, better late than never. I updated the comment to express what is happening in that somewhat complex code. 
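The comment being filled in (see the hunk below) describes a two-hash rule; for readers skimming the log, here is a compact restatement in plain C. It is an editor's paraphrase of that comment and ignores the empty-hash special cases __ftrace_hash_rec_update() also handles; the names are invented for the sketch.

#include <stdbool.h>

/*
 * filter_hash == true:  the record matches if it is in the hash but not
 *                       in the other hash.
 * filter_hash == false: we are decrementing; the record matches if it is
 *                       in the hash and also in the other hash, i.e. it
 *                       must be turned off because this hash disables it.
 */
bool toy_rec_matches(bool filter_hash, bool in_hash, bool in_other_hash)
{
        if (filter_hash)
                return in_hash && !in_other_hash;
        return in_hash && in_other_hash;
}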
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2c99d1f7caf1..61f39f8b62e1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1552,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* + * If filter_hash is set, we want to match all functions + * that are in the hash but not in the other hash. * + * If filter_hash is not set, then we are decrementing. + * That means we match anything that is in the hash + * and also in the other_hash. That is, we need to turn + * off functions in the other hash because they are disabled + * by this hash. */ if (filter_hash && in_hash && !in_other_hash) match = 1; -- cgit v1.2.3 From 68f40969f0173c02ddc22a40df865c81c29070e4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 1 May 2014 12:44:50 -0400 Subject: ftrace: Always inline ftrace_hash_empty() helper function The ftrace_hash_empty() function is a simple test: return !hash || !hash->count; But gcc seems to want to make it a call. As this is in an extreme hot path of the function tracer, there's no reason it needs to be a call. I only wrote it to be a helper function anyway, otherwise it would have been inlined manually. Force gcc to inline it, as it could have also been a macro. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 61f39f8b62e1..98fa931b6864 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1105,7 +1105,7 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static bool ftrace_hash_empty(struct ftrace_hash *hash) +static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) { return !hash || !hash->count; } -- cgit v1.2.3 From 7413af1fb70e7efa6dbc7f27663e7a5126b3aa33 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 6 May 2014 21:34:14 -0400 Subject: ftrace: Make get_ftrace_addr() and get_ftrace_addr_old() global Move and rename get_ftrace_addr() and get_ftrace_addr_old() to ftrace_get_addr_new() and ftrace_get_addr_curr() respectively. This moves these two helper functions in the generic code out from the arch specific code, and renames them to have a better generic name. This will allow other archs to use them as well as makes it a bit easier to work on getting separate trampolines for different functions. ftrace_get_addr_new() returns the trampoline address that the mcount call address will be converted to. ftrace_get_addr_curr() returns the trampoline address of what the mcount call address currently jumps to. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 98fa931b6864..e825fded435d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1755,6 +1755,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) return ftrace_check_record(rec, enable, 0); } +/** + * ftrace_get_addr_new - Get the call address to set to + * @rec: The ftrace record descriptor + * + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. 
If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + * + * Returns the address of the trampoline to set to + */ +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/** + * ftrace_get_addr_curr - Get the call address that is already there + * @rec: The ftrace record descriptor + * + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + * + * Returns the address of the trampoline that is currently being called + */ +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { -- cgit v1.2.3 From 7c0868e03b7a7c50fa10957d8dddaebb09c72044 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 8 May 2014 07:01:21 -0400 Subject: ftrace: Use the ftrace_addr helper functions to find the ftrace_addr With the moving of the functions that determine what the mcount call site should be replaced with into the generic code, there is a few places in the generic code that can use them instead of hard coding it as it does. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e825fded435d..52c2b53b7953 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1798,12 +1798,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ftrace_addr; int ret; - ret = ftrace_update_record(rec, enable); + ftrace_addr = ftrace_get_addr_new(rec); - if (rec->flags & FTRACE_FL_REGS) - ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; - else - ftrace_addr = (unsigned long)FTRACE_ADDR; + /* This needs to be done before we call ftrace_update_record */ + ftrace_old_addr = ftrace_get_addr_curr(rec); + + ret = ftrace_update_record(rec, enable); switch (ret) { case FTRACE_UPDATE_IGNORE: @@ -1817,11 +1817,6 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: - if (rec->flags & FTRACE_FL_REGS) - ftrace_old_addr = (unsigned long)FTRACE_ADDR; - else - ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; - return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } -- cgit v1.2.3 From f1b2f2bd5821c6ab7feed2e133343dd54b212ed9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 7 May 2014 16:09:49 -0400 Subject: ftrace: Remove FTRACE_UPDATE_MODIFY_CALL_REGS flag As the decision to what needs to be done (converting a call to the ftrace_caller to ftrace_caller_regs or to convert from ftrace_caller_regs to ftrace_caller) can easily be determined from the rec->flags of FTRACE_FL_REGS and FTRACE_FL_REGS_EN, there's no need to have the ftrace_check_record() return either a UPDATE_MODIFY_CALL_REGS or a UPDATE_MODIFY_CALL. Just he latter is enough. This added flag causes more complexity than is required. Remove it. 
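A hedged sketch of why a single MODIFY_CALL verdict suffices once the two address helpers exist: the current and wanted trampoline addresses, both derived from rec->flags, already encode the direction of the conversion. The constants below are stand-ins for FTRACE_ADDR and FTRACE_REGS_ADDR; this is illustrative C, not the kernel or arch code.

#define TOY_FL_REGS     (1u << 0)       /* wants the regs-saving trampoline */
#define TOY_FL_REGS_EN  (1u << 1)       /* currently uses the regs-saving one */

#define TOY_ADDR        0x1000UL        /* stand-in for FTRACE_ADDR */
#define TOY_REGS_ADDR   0x2000UL        /* stand-in for FTRACE_REGS_ADDR */

unsigned long toy_addr_new(unsigned int flags)
{
        return (flags & TOY_FL_REGS) ? TOY_REGS_ADDR : TOY_ADDR;
}

unsigned long toy_addr_curr(unsigned int flags)
{
        return (flags & TOY_FL_REGS_EN) ? TOY_REGS_ADDR : TOY_ADDR;
}

/*
 * A caller handling MODIFY_CALL just rewrites the call site from
 * toy_addr_curr() to toy_addr_new(); no separate _REGS variant of the
 * return value is needed to tell the two directions apart.
 */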
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 52c2b53b7953..cc07b7fc4372 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1701,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) /* * If this record is being updated from a nop, then * return UPDATE_MAKE_CALL. - * Otherwise, if the EN flag is set, then return - * UPDATE_MODIFY_CALL_REGS to tell the caller to convert - * from the non-save regs, to a save regs function. * Otherwise, * return UPDATE_MODIFY_CALL to tell the caller to convert - * from the save regs, to a non-save regs function. + * from the save regs, to a non-save regs function or + * vice versa. */ if (flag & FTRACE_FL_ENABLED) return FTRACE_UPDATE_MAKE_CALL; - else if (rec->flags & FTRACE_FL_REGS_EN) - return FTRACE_UPDATE_MODIFY_CALL_REGS; - else - return FTRACE_UPDATE_MODIFY_CALL; + + return FTRACE_UPDATE_MODIFY_CALL; } if (update) { @@ -1815,7 +1811,6 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MAKE_NOP: return ftrace_make_nop(NULL, rec, ftrace_addr); - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } -- cgit v1.2.3 From 122ff243f5f104194750ecbc76d5946dd1eec934 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 12 May 2014 16:04:53 -0700 Subject: ipv4: make ip_local_reserved_ports per netns ip_local_port_range is already per netns, so should ip_local_reserved_ports be. And since it is none by default we don't actually need it when we don't enable CONFIG_SYSCTL. By the way, rename inet_is_reserved_local_port() to inet_is_local_reserved_port() Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..e36ae4b15726 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2501,11 +2501,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bool first = 1; size_t left = *lenp; unsigned long bitmap_len = table->maxlen; - unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *bitmap = *(unsigned long **) table->data; unsigned long *tmp_bitmap = NULL; char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - if (!bitmap_len || !left || (*ppos && !write)) { + if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { *lenp = 0; return 0; } -- cgit v1.2.3 From 4449bf927b61bdb4389393c6fea6837214d1ace7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 6 May 2014 13:10:24 -0400 Subject: tracing: Add __bitmask() macro to trace events to cpumasks and other bitmasks Being able to show a cpumask of events can be useful as some events may affect only some CPUs. There is no standard way to record the cpumask and converting it to a string is rather expensive during the trace as traces happen in hotpaths. It would be better to record the raw event mask and be able to parse it at print time. 
The following macros were added for use with the TRACE_EVENT() macro: __bitmask() __assign_bitmask() __get_bitmask() To test this, I added this to the sched_migrate_task event, which looked like this: TRACE_EVENT(sched_migrate_task, TP_PROTO(struct task_struct *p, int dest_cpu, const struct cpumask *cpus), TP_ARGS(p, dest_cpu, cpus), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, orig_cpu ) __field( int, dest_cpu ) __bitmask( cpumask, num_possible_cpus() ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; __assign_bitmask(cpumask, cpumask_bits(cpus), num_possible_cpus()); ), TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d cpumask=%s", __entry->comm, __entry->pid, __entry->prio, __entry->orig_cpu, __entry->dest_cpu, __get_bitmask(cpumask)) ); With the output of: ksmtuned-3613 [003] d..2 485.220508: sched_migrate_task: comm=ksmtuned pid=3615 prio=120 orig_cpu=3 dest_cpu=2 cpumask=00000000,0000000f migration/1-13 [001] d..5 485.221202: sched_migrate_task: comm=ksmtuned pid=3614 prio=120 orig_cpu=1 dest_cpu=0 cpumask=00000000,0000000f awk-3615 [002] d.H5 485.221747: sched_migrate_task: comm=rcu_preempt pid=7 prio=120 orig_cpu=0 dest_cpu=1 cpumask=00000000,000000ff migration/2-18 [002] d..5 485.222062: sched_migrate_task: comm=ksmtuned pid=3615 prio=120 orig_cpu=2 dest_cpu=3 cpumask=00000000,0000000f Link: http://lkml.kernel.org/r/1399377998-14870-6-git-send-email-javi.merino@arm.com Link: http://lkml.kernel.org/r/20140506132238.22e136d1@gandalf.local.home Suggested-by: Javi Merino Tested-by: Javi Merino Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a436de18aa99..f3dad80c20b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -125,6 +125,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_seq_printf); +/** + * trace_seq_bitmask - put a list of longs as a bitmask print output + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * + * Writes a ASCII representation of a bitmask string into @s. 
+ */ +int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (s->full || !len) + return 0; + + ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + /** * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor @@ -398,6 +426,19 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); #endif +const char * +ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) +{ + const char *ret = p->buffer + p->len; + + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); + const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { -- cgit v1.2.3 From 8f577cadf7181243d336be9aba40c1bcc02c4c54 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 13 May 2014 19:50:47 -0700 Subject: seccomp: JIT compile seccomp filter Take advantage of internal BPF JIT 05-sim-long_jumps.c of libseccomp was used as micro-benchmark: seccomp_rule_add_exact(ctx,... seccomp_rule_add_exact(ctx,... rc = seccomp_load(ctx); for (i = 0; i < 10000000; i++) syscall(...); $ sudo sysctl net.core.bpf_jit_enable=1 $ time ./bench real 0m2.769s user 0m1.136s sys 0m1.624s $ sudo sysctl net.core.bpf_jit_enable=0 $ time ./bench real 0m5.825s user 0m1.268s sys 0m4.548s Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/seccomp.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b35c21503a36..7e02d624cc50 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -54,8 +54,7 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; - unsigned short len; /* Instruction count */ - struct sock_filter_int insnsi[]; + struct sk_filter *prog; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -189,7 +188,8 @@ static u32 seccomp_run_filters(int syscall) * value always takes priority (ignoring the DATA). 
*/ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); + u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } @@ -215,7 +215,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) return -EINVAL; for (filter = current->seccomp.filter; filter; filter = filter->prev) - total_insns += filter->len + 4; /* include a 4 instr penalty */ + total_insns += filter->prog->len + 4; /* include a 4 instr penalty */ if (total_insns > MAX_INSNS_PER_PATH) return -ENOMEM; @@ -256,19 +256,27 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) /* Allocate a new seccomp_filter */ ret = -ENOMEM; - filter = kzalloc(sizeof(struct seccomp_filter) + - sizeof(struct sock_filter_int) * new_len, + filter = kzalloc(sizeof(struct seccomp_filter), GFP_KERNEL|__GFP_NOWARN); if (!filter) goto free_prog; - ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); - if (ret) + filter->prog = kzalloc(sk_filter_size(new_len), + GFP_KERNEL|__GFP_NOWARN); + if (!filter->prog) goto free_filter; + + ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); + if (ret) + goto free_filter_prog; kfree(fp); atomic_set(&filter->usage, 1); - filter->len = new_len; + filter->prog->len = new_len; + filter->prog->bpf_func = (void *)sk_run_filter_int_seccomp; + + /* JIT internal BPF into native HW instructions */ + bpf_int_jit_compile(filter->prog); /* * If there is an existing filter, make it the prev and don't drop its @@ -278,6 +286,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) current->seccomp.filter = filter; return 0; +free_filter_prog: + kfree(filter->prog); free_filter: kfree(filter); free_prog: @@ -330,6 +340,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; + bpf_jit_free(freeme->prog); kfree(freeme); } } -- cgit v1.2.3 From 3b514d24e200fcdcde0a57c354a51d3677a86743 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:47 -0400 Subject: cgroup: skip refcnting on normal root csses and cgrp_dfl_root self css 9395a4500404 ("cgroup: enable refcnting for root csses") enabled reference counting for root csses (cgroup_subsys_states) so that cgroup's self csses can be used to manage the lifetime of the containing cgroups. Unfortunately, this change was incorrect. During early init, cgrp_dfl_root self css refcnt is used. percpu_ref can't initialized during early init and its initialization is deferred till cgroup_init() time. This means that cpu was using percpu_ref which wasn't properly initialized. Due to the way percpu variables are laid out on x86, this didn't blow up immediately on x86 but ended up incrementing and decrementing the percpu variable at offset zero, whatever it may be; however, on other archs, this caused fault and early boot failure. As cgroup self csses for root cgroups of non-dfl hierarchies need working refcounting, we can't revert 9395a4500404. This patch adds CSS_NO_REF which explicitly inhibits reference counting on the css and sets it on all normal (non-self) csses and cgroup_dfl_root self css. v2: cgrp_dfl_root.self is the offending one. Set the flag on it. 
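A minimal sketch of what the new flag buys, using toy names rather than the kernel's css_get()/css_put(): objects marked "no ref" skip the counter entirely, so they can be used before the refcount machinery (percpu_ref in the kernel) has been initialized, which is exactly the early-init situation described above.

#include <stdatomic.h>

#define TOY_CSS_NO_REF  (1u << 0)

struct toy_css {
        _Atomic long refcnt;
        unsigned int flags;
};

void toy_css_get(struct toy_css *css)
{
        if (!(css->flags & TOY_CSS_NO_REF))
                atomic_fetch_add(&css->refcnt, 1);
}

void toy_css_put(struct toy_css *css)
{
        if (!(css->flags & TOY_CSS_NO_REF))
                atomic_fetch_sub(&css->refcnt, 1);
}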
Signed-off-by: Tejun Heo Reported-by: Stephen Warren Tested-by: Stephen Warren Fixes: 9395a4500404 ("cgroup: enable refcnting for root csses") --- kernel/cgroup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c01e8e8dfad0..0343d7ee6d62 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4593,11 +4593,17 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + + /* + * Root csses are never destroyed and we can't initialize + * percpu_ref during early init. Disable refcnting. + */ + css->flags |= CSS_NO_REF; + if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { - BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } @@ -4636,6 +4642,8 @@ int __init cgroup_init_early(void) int i; init_cgroup_root(&cgrp_dfl_root, &opts); + cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; + RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { @@ -4684,7 +4692,6 @@ int __init cgroup_init(void) struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; - BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); -- cgit v1.2.3 From 5c9d535b893f30266ea29fe377cb9b002fcd76aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: remove css_parent() cgroup in general is moving towards using cgroup_subsys_state as the fundamental structural component and css_parent() was introduced to convert from using cgroup->parent to css->parent. It was quite some time ago and we're moving forward with making css more prominent. This patch drops the trivial wrapper css_parent() and let the users dereference css->parent. While at it, explicitly mark fields of css which are public and immutable. v2: New usage from device_cgroup.c converted. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Neil Horman Acked-by: "David S. 
Miller" Acked-by: Li Zefan Cc: Vivek Goyal Cc: Jens Axboe Cc: Peter Zijlstra Cc: Johannes Weiner --- kernel/cgroup.c | 8 ++++---- kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/cpuacct.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0343d7ee6d62..929bbbc539e9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3176,10 +3176,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return next; - pos = css_parent(pos); + pos = pos->parent; } return NULL; @@ -3261,12 +3261,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - return css_parent(pos); + return pos->parent; } static bool cgroup_has_live_children(struct cgroup *cgrp) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 6b4e60e33a9a..a79e40f9d700 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -59,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task) static struct freezer *parent_freezer(struct freezer *freezer) { - return css_freezer(css_parent(&freezer->css)); + return css_freezer(freezer->css.parent); } bool cgroup_freezing(struct task_struct *task) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2f4b08b8db24..5b2a31082f4f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -124,7 +124,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) static inline struct cpuset *parent_cs(struct cpuset *cs) { - return css_cs(css_parent(&cs->css)); + return css_cs(cs->css.parent); } #ifdef CONFIG_NUMA diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..ac61ad1a5f9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7586,7 +7586,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css_parent(css)); + struct task_group *parent = css_tg(css->parent); if (parent) sched_online_group(tg, parent); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index c143ee380e3a..9cf350c94ec4 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - return css_ca(css_parent(&ca->css)); + return css_ca(ca->css.parent); } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); -- cgit v1.2.3 From d51f39b05ce0008118c45945e681b20484990571 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: remove cgroup->parent cgroup->parent is redundant as cgroup->self.parent can also be used to determine the parent cgroup and we're moving towards using cgroup_subsys_states as the fundamental structural blocks. This patch introduces cgroup_parent() which follows cgroup->self.parent and removes cgroup->parent. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 52 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 929bbbc539e9..8c67a739aea4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -218,6 +218,15 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } +static struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct cgroup, self); + return NULL; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -260,9 +269,9 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; - while (cgrp->parent && - !(cgrp->parent->child_subsys_mask & (1 << ss->id))) - cgrp = cgrp->parent; + while (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + cgrp = cgroup_parent(cgrp); return cgroup_css(cgrp, ss); } @@ -307,7 +316,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) while (cgrp) { if (cgrp == ancestor) return true; - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } return false; } @@ -454,7 +463,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (cgrp->populated_kn) kernfs_notify(cgrp->populated_kn); - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -2018,7 +2027,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, * Except for the root, child_subsys_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent && + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && dst_cgrp->child_subsys_mask) return -EBUSY; @@ -2427,7 +2436,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask); + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); return 0; } @@ -2610,8 +2619,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, /* unavailable or not enabled on the parent? */ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgrp->parent && - !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { ret = -ENOENT; goto out_unlock; } @@ -2640,7 +2649,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, child_subsys_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. 
*/ - if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) { + if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { ret = -EBUSY; goto out_unlock; } @@ -2898,9 +2907,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if (is_add) { @@ -4092,14 +4101,14 @@ static void css_free_work_fn(struct work_struct *work) atomic_dec(&cgrp->root->nr_cgrps); cgroup_pidlist_destroy_all(cgrp); - if (cgrp->parent) { + if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children. */ - cgroup_put(cgrp->parent); + cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); kfree(cgrp); } else { @@ -4163,8 +4172,8 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; css->flags = 0; - if (cgrp->parent) { - css->parent = cgroup_css(cgrp->parent, ss); + if (cgroup_parent(cgrp)) { + css->parent = cgroup_css(cgroup_parent(cgrp), ss); css_get(css->parent); } @@ -4218,7 +4227,7 @@ static void offline_css(struct cgroup_subsys_state *css) */ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - struct cgroup *parent = cgrp->parent; + struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; int err; @@ -4251,7 +4260,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_clear_dir; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { + cgroup_parent(parent)) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) @@ -4309,7 +4318,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, init_cgroup_housekeeping(cgrp); - cgrp->parent = parent; cgrp->self.parent = &parent->self; cgrp->root = root; @@ -4336,7 +4344,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); + list_add_tail_rcu(&cgrp->sibling, &cgroup_parent(cgrp)->children); atomic_inc(&root->nr_cgrps); cgroup_get(parent); @@ -4531,8 +4539,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); - check_for_release(cgrp->parent); + set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); + check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); -- cgit v1.2.3 From d5c419b68e368fdd9f1857bf8d4bb4480edb9b80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: move cgroup->sibling and ->children into cgroup_subsys_state We're moving towards using cgroup_subsys_states as the fundamental structural blocks. Let's move cgroup->sibling and ->children into cgroup_subsys_state. This is pure move without functional change and only cgroup->self's fields are actually used. 
Other csses will make use of the fields later. While at it, update init_and_link_css() so that it zeroes the whole css before initializing it and remove explicit zeroing of ->flags. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c67a739aea4..5385839e727b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -378,7 +378,7 @@ static int notify_on_release(const struct cgroup *cgrp) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->children, sibling) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ @@ -870,7 +870,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) mutex_lock(&cgroup_mutex); BUG_ON(atomic_read(&root->nr_cgrps)); - BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); @@ -1432,7 +1432,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.children)) { + if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } @@ -1512,8 +1512,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) struct cgroup_subsys *ss; int ssid; - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->self.sibling); + INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); @@ -1612,7 +1612,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) link_css_set(&tmp_links, cset, root_cgrp); up_write(&css_set_rwsem); - BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); @@ -3128,11 +3128,11 @@ css_next_child(struct cgroup_subsys_state *pos_css, * cgroup is removed or iteration and removal race. */ if (!pos) { - next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); } else if (likely(!cgroup_is_dead(pos))) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { - list_for_each_entry_rcu(next, &cgrp->children, sibling) + list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) if (next->serial_nr > pos->serial_nr) break; } @@ -3142,12 +3142,12 @@ css_next_child(struct cgroup_subsys_state *pos_css, * the next sibling; however, it might have @ss disabled. If so, * fast-forward to the next enabled one. 
*/ - while (&next->sibling != &cgrp->children) { + while (&next->self.sibling != &cgrp->self.children) { struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); if (next_css) return next_css; - next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(next->self.sibling.next, struct cgroup, self.sibling); } return NULL; } @@ -3283,7 +3283,7 @@ static bool cgroup_has_live_children(struct cgroup *cgrp) struct cgroup *child; rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->children, sibling) { + list_for_each_entry_rcu(child, &cgrp->self.children, self.sibling) { if (!cgroup_is_dead(child)) { rcu_read_unlock(); return true; @@ -4144,7 +4144,7 @@ static void css_release_work_fn(struct work_struct *work) } else { /* cgroup release path */ mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->sibling); + list_del_rcu(&cgrp->self.sibling); mutex_unlock(&cgroup_mutex); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); @@ -4168,9 +4168,11 @@ static void init_and_link_css(struct cgroup_subsys_state *css, { cgroup_get(cgrp); + memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; - css->flags = 0; + INIT_LIST_HEAD(&css->sibling); + INIT_LIST_HEAD(&css->children); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4344,7 +4346,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgroup_parent(cgrp)->children); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get(parent); @@ -4507,9 +4509,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Make sure there's no live children. We can't test ->children - * emptiness as dead children linger on it while being destroyed; - * otherwise, "rmdir parent/child parent" may fail with -EBUSY. + * Make sure there's no live children. We can't test emptiness of + * ->self.children as dead children linger on it while being + * drained; otherwise, "rmdir parent/child parent" may fail. */ if (cgroup_has_live_children(cgrp)) return -EBUSY; -- cgit v1.2.3 From 1fed1b2e36ba1aa0257004a97e75bbdb70f216b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: link all cgroup_subsys_states in their sibling lists Currently, while all csses have ->children and ->sibling, only the self csses of cgroups make use of them. This patch makes all other csses to link themselves on the sibling lists too. This will be used to update css iteration. 
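For readers less familiar with the intrusive-list layout being extended here, a toy version of "each css hangs off its parent's ->children list through its own ->sibling node". The list helpers mimic the kernel's list_add_tail(); everything else is invented for the sketch.

#include <stddef.h>

struct toy_list {
        struct toy_list *prev, *next;
};

void toy_list_init(struct toy_list *l)
{
        l->prev = l->next = l;
}

void toy_list_add_tail(struct toy_list *n, struct toy_list *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

struct toy_css {
        struct toy_list sibling;        /* node on the parent's children list */
        struct toy_list children;       /* head of this css's own child list */
};

/* appending keeps siblings in creation order, which later patches rely on */
void toy_link_css(struct toy_css *css, struct toy_css *parent)
{
        toy_list_init(&css->children);
        toy_list_add_tail(&css->sibling, &parent->children);
}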
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5385839e727b..dcb06e181ce4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4138,19 +4138,21 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; + mutex_lock(&cgroup_mutex); + + list_del_rcu(&css->sibling); + if (ss) { /* css release path */ cgroup_idr_remove(&ss->css_idr, css->id); } else { /* cgroup release path */ - mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->self.sibling); - mutex_unlock(&cgroup_mutex); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; } + mutex_unlock(&cgroup_mutex); + call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4230,12 +4232,13 @@ static void offline_css(struct cgroup_subsys_state *css) static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); - css = ss->css_alloc(cgroup_css(parent, ss)); + css = ss->css_alloc(parent_css); if (IS_ERR(css)) return PTR_ERR(css); @@ -4255,11 +4258,12 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_free_id; /* @css is ready to be brought online now, make it visible */ + list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) - goto err_clear_dir; + goto err_list_del; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && cgroup_parent(parent)) { @@ -4272,7 +4276,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) return 0; -err_clear_dir: +err_list_del: + list_del_rcu(&css->sibling); cgroup_clear_dir(css->cgroup, 1 << css->ss->id); err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); -- cgit v1.2.3 From 0cb51d71c1fa9234afe4213089844be76ec1765a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: move cgroup->serial_nr into cgroup_subsys_state We're moving towards using cgroup_subsys_states as the fundamental structural blocks. All csses including the cgroup->self and actual ones now form trees through css->children and ->sibling which follow the same rules as what cgroup->children and ->sibling followed. This patch moves cgroup->serial_nr which is used to implement css iteration into css. Note that all csses, regardless of their types, allocate their serial numbers from the same monotonically increasing counter. This doesn't affect the ordering needed by css iteration or cause any other material behavior changes. This will be used to update css iteration. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dcb06e181ce4..d5af128ec1ec 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -157,14 +157,13 @@ static int cgroup_root_count; static DEFINE_IDR(cgroup_hierarchy_idr); /* - * Assign a monotonically increasing serial number to cgroups. It - * guarantees cgroups with bigger numbers are newer than those with smaller - * numbers. 
Also, as cgroups are always appended to the parent's - * ->children list, it guarantees that sibling cgroups are always sorted in - * the ascending serial number order on the list. Protected by - * cgroup_mutex. + * Assign a monotonically increasing serial number to csses. It guarantees + * cgroups with bigger numbers are newer than those with smaller numbers. + * Also, as csses are always appended to the parent's ->children list, it + * guarantees that sibling csses are always sorted in the ascending serial + * number order on the list. Protected by cgroup_mutex. */ -static u64 cgroup_serial_nr_next = 1; +static u64 css_serial_nr_next = 1; /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. This avoids us having to do @@ -3133,7 +3132,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) - if (next->serial_nr > pos->serial_nr) + if (next->self.serial_nr > pos->self.serial_nr) break; } @@ -4168,6 +4167,8 @@ static void css_release(struct percpu_ref *ref) static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { + lockdep_assert_held(&cgroup_mutex); + cgroup_get(cgrp); memset(css, 0, sizeof(*css)); @@ -4175,6 +4176,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4348,7 +4350,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ kernfs_get(kn); - cgrp->serial_nr = cgroup_serial_nr_next++; + cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); -- cgit v1.2.3 From de3f034182ecbf0efbcef7ab8b253c6c3049a592 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: introduce CSS_RELEASED and reduce css iteration fallback window css iterations allow the caller to drop RCU read lock. As long as the caller keeps the current position accessible, it can simply re-grab RCU read lock later and continue iteration. This is achieved by using CGRP_DEAD to detect whether the current positions next pointer is safe to dereference and if not re-iterate from the beginning to the next position using ->serial_nr. CGRP_DEAD is used as the marker to invalidate the next pointer and the only requirement is that the marker is set before the next sibling starts its RCU grace period. Because CGRP_DEAD is set at the end of cgroup_destroy_locked() but the cgroup is unlinked when the reference count reaches zero, we currently have a rather large window where this fallback re-iteration logic can be triggered. This patch introduces CSS_RELEASED which is set when a css is unlinked from its sibling list. This still keeps the re-iteration logic working while drastically reducing the window of its activation. While at it, rewrite the comment in css_next_child() to reflect the new flag and better explain the synchronization. This will also enable iterating csses directly instead of through cgroups. v2: CSS_RELEASED now assigned to 1 << 2 as 1 << 0 is used by CSS_NO_REF. 
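A sketch of the fallback rule the changelog above describes, with toy types, single-threaded and without the RCU details: follow ->next while the position is still linked; once it carries the released flag, rescan the parent's children (kept in ascending serial-number order) for the first entry newer than the position.

#include <stdbool.h>
#include <stddef.h>

struct toy_css {
        unsigned long serial_nr;
        bool released;                  /* CSS_RELEASED stand-in */
        struct toy_css *next;           /* next sibling seen when linked */
};

/* @children: the parent's current children, ascending by serial_nr */
struct toy_css *toy_next_child(struct toy_css *pos,
                               struct toy_css **children, size_t n)
{
        size_t i;

        if (!pos)
                return n ? children[0] : NULL;
        if (!pos->released)
                return pos->next;       /* fast path: pointer still valid */
        /* slow path: @pos was unlinked, resume after its serial number */
        for (i = 0; i < n; i++)
                if (children[i]->serial_nr > pos->serial_nr)
                        return children[i];
        return NULL;
}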
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d5af128ec1ec..5544e685f2da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3108,27 +3108,28 @@ css_next_child(struct cgroup_subsys_state *pos_css, cgroup_assert_mutex_or_rcu_locked(); /* - * @pos could already have been removed. Once a cgroup is removed, - * its ->sibling.next is no longer updated when its next sibling - * changes. As CGRP_DEAD assertion is serialized and happens - * before the cgroup is taken off the ->sibling list, if we see it - * unasserted, it's guaranteed that the next sibling hasn't - * finished its grace period even if it's already removed, and thus - * safe to dereference from this RCU critical section. If - * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed - * to be visible as %true here. + * @pos could already have been unlinked from the sibling list. + * Once a cgroup is removed, its ->sibling.next is no longer + * updated when its next sibling changes. CSS_RELEASED is set when + * @pos is taken off list, at which time its next pointer is valid, + * and, as releases are serialized, the one pointed to by the next + * pointer is guaranteed to not have started release yet. This + * implies that if we observe !CSS_RELEASED on @pos in this RCU + * critical section, the one pointed to by its next pointer is + * guaranteed to not have finished its RCU grace period even if we + * have dropped rcu_read_lock() inbetween iterations. * - * If @pos is dead, its next pointer can't be dereferenced; - * however, as each cgroup is given a monotonically increasing - * unique serial number and always appended to the sibling list, - * the next one can be found by walking the parent's children until - * we see a cgroup with higher serial number than @pos's. While - * this path can be slower, it's taken only when either the current - * cgroup is removed or iteration and removal race. + * If @pos has CSS_RELEASED set, its next pointer can't be + * dereferenced; however, as each css is given a monotonically + * increasing unique serial number and always appended to the + * sibling list, the next one can be found by walking the parent's + * children until the first css with higher serial number than + * @pos's. While this path can be slower, it happens iff iteration + * races against release and the race window is very small. */ if (!pos) { next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); - } else if (likely(!cgroup_is_dead(pos))) { + } else if (likely(!(pos->self.flags & CSS_RELEASED))) { next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) @@ -4139,6 +4140,7 @@ static void css_release_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); + css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (ss) { @@ -4525,10 +4527,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by css_next_child() to - * resume iteration after dropping RCU read lock. See - * css_next_child() for details. + * creation by disabling cgroup_lock_live_group(). 
*/ set_bit(CGRP_DEAD, &cgrp->flags); -- cgit v1.2.3 From c2931b70a32c705b9bd5762f5044f9eac8a52bb3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:51 -0400 Subject: cgroup: iterate cgroup_subsys_states directly Currently, css_next_child() is implemented as finding the next child cgroup which has the css enabled, which used to be the only way to do it as only cgroups participated in sibling lists and thus could be iteratd. This works as long as what's required during iteration is not missing online csses; however, it turns out that there are use cases where offlined but not yet released csses need to be iterated. This is difficult to implement through cgroup iteration the unified hierarchy as there may be multiple dying csses for the same subsystem associated with single cgroup. After the recent changes, the cgroup self and regular csses behave identically in how they're linked and unlinked from the sibling lists including assertion of CSS_RELEASED and css_next_child() can simply switch to iterating csses directly. This both simplifies the logic and ensures that all visible non-released csses are included in the iteration whether there are multiple dying csses for a subsystem or not. As all other iterators depend on css_next_child() for sibling iteration, this changes behaviors of all css iterators. Add and update explanations on the css states which are included in traversal to all iterators. As css iteration could always contain offlined csses, this shouldn't break any of the current users and new usages which need iteration of all on and offline csses can make use of the new semantics. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 62 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5544e685f2da..097a1fc1e1e8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3089,21 +3089,25 @@ static int cgroup_task_count(const struct cgroup *cgrp) /** * css_next_child - find the next child of a given css - * @pos_css: the current position (%NULL to initiate traversal) - * @parent_css: css whose children to walk + * @pos: the current position (%NULL to initiate traversal) + * @parent: css whose children to walk * - * This function returns the next child of @parent_css and should be called + * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is - * that @parent_css and @pos_css are accessible. The next sibling is - * guaranteed to be returned regardless of their states. + * that @parent and @pos are accessible. The next sibling is guaranteed to + * be returned regardless of their states. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ -struct cgroup_subsys_state * -css_next_child(struct cgroup_subsys_state *pos_css, - struct cgroup_subsys_state *parent_css) +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent) { - struct cgroup *pos = pos_css ? 
pos_css->cgroup : NULL; - struct cgroup *cgrp = parent_css->cgroup; - struct cgroup *next; + struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); @@ -3128,27 +3132,21 @@ css_next_child(struct cgroup_subsys_state *pos_css, * races against release and the race window is very small. */ if (!pos) { - next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); - } else if (likely(!(pos->self.flags & CSS_RELEASED))) { - next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); + next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); + } else if (likely(!(pos->flags & CSS_RELEASED))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { - list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) - if (next->self.serial_nr > pos->self.serial_nr) + list_for_each_entry_rcu(next, &parent->children, sibling) + if (next->serial_nr > pos->serial_nr) break; } /* * @next, if not pointing to the head, can be dereferenced and is - * the next sibling; however, it might have @ss disabled. If so, - * fast-forward to the next enabled one. + * the next sibling. */ - while (&next->self.sibling != &cgrp->self.children) { - struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); - - if (next_css) - return next_css; - next = list_entry_rcu(next->self.sibling.next, struct cgroup, self.sibling); - } + if (&next->sibling != &parent->children) + return next; return NULL; } @@ -3165,6 +3163,13 @@ css_next_child(struct cgroup_subsys_state *pos_css, * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @root are accessible and @pos is a descendant of @root. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -3252,6 +3257,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * section. This function will return the correct next descendant as long * as both @pos and @cgroup are accessible and @pos is a descendant of * @cgroup. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, -- cgit v1.2.3 From 184faf32328c65c9d86b19577b8d8b90bdd2cd2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:51 -0400 Subject: cgroup: use CSS_ONLINE instead of CGRP_DEAD Use CSS_ONLINE on the self css to indicate whether a cgroup has been killed instead of CGRP_DEAD. This will allow re-using css online test for cgroup liveliness test. This doesn't introduce any functional change. 
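The new liveliness test can be pictured with a short userspace sketch. CSS_NO_REF (1 << 0) and CSS_RELEASED (1 << 2) are as stated earlier; CSS_ONLINE's bit position here is an assumption, and the structs are trimmed stand-ins. The point is only that a cgroup is dead exactly when its self css has had CSS_ONLINE cleared.

#include <stdio.h>

enum {
	CSS_NO_REF   = 1 << 0,		/* value taken from the description above */
	CSS_ONLINE   = 1 << 1,		/* assumed bit position, illustrative */
	CSS_RELEASED = 1 << 2,		/* value taken from the description above */
};

struct css  { unsigned int flags; };
struct cgrp { struct css self; };

static int cgroup_is_dead(const struct cgrp *cgrp)
{
	return !(cgrp->self.flags & CSS_ONLINE);
}

int main(void)
{
	struct cgrp c = { .self.flags = CSS_ONLINE };	/* set at init time */

	printf("dead? %d\n", cgroup_is_dead(&c));
	c.self.flags &= ~CSS_ONLINE;			/* what destroy does */
	printf("dead? %d\n", cgroup_is_dead(&c));
	return 0;
}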
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 097a1fc1e1e8..004004fd0ded 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -278,7 +278,7 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - return test_bit(CGRP_DEAD, &cgrp->flags); + return !(cgrp->self.flags & CSS_ONLINE); } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) @@ -1518,6 +1518,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; + cgrp->self.flags |= CSS_ONLINE; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -4541,13 +4542,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). */ - set_bit(CGRP_DEAD, &cgrp->flags); + cgrp->self.flags &= ~CSS_ONLINE; /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); - /* CGRP_DEAD is set, remove from ->release_list for the last time */ + /* CSS_ONLINE is clear, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); -- cgit v1.2.3 From f3d4650015301d1c880df4523f7e7ef320a38aab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:52 -0400 Subject: cgroup: convert cgroup_has_live_children() into css_has_online_children() Now that cgroup liveliness and css onliness are the same state, convert cgroup_has_live_children() into css_has_online_children() so that it can be used for actual csses too. The function now uses css_for_each_child() for iteration and is published. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 004004fd0ded..082bb842b11a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -175,7 +175,6 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); -static bool cgroup_has_live_children(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); @@ -1769,7 +1768,7 @@ static void cgroup_kill_sb(struct super_block *sb) * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. */ - if (cgroup_has_live_children(&root->cgrp)) + if (css_has_online_children(&root->cgrp.self)) cgroup_put(&root->cgrp); else percpu_ref_kill(&root->cgrp.self.refcnt); @@ -3291,19 +3290,28 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return pos->parent; } -static bool cgroup_has_live_children(struct cgroup *cgrp) +/** + * css_has_online_children - does a css have online children + * @css: the target css + * + * Returns %true if @css has any online children; otherwise, %false. This + * function can be called from any context but the caller is responsible + * for synchronizing against on/offlining as necessary. 
+ */ +bool css_has_online_children(struct cgroup_subsys_state *css) { - struct cgroup *child; + struct cgroup_subsys_state *child; + bool ret = false; rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->self.children, self.sibling) { - if (!cgroup_is_dead(child)) { - rcu_read_unlock(); - return true; + css_for_each_child(child, css) { + if (css->flags & CSS_ONLINE) { + ret = true; + break; } } rcu_read_unlock(); - return false; + return ret; } /** @@ -4535,7 +4543,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail. */ - if (cgroup_has_live_children(cgrp)) + if (css_has_online_children(&cgrp->self)) return -EBUSY; /* @@ -5014,8 +5022,8 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { - if (cgroup_is_releasable(cgrp) && - list_empty(&cgrp->cset_links) && !cgroup_has_live_children(cgrp)) { + if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && + !css_has_online_children(&cgrp->self)) { /* * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue -- cgit v1.2.3 From 8c7260c0d9a6fa327546af724fb48375df493326 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Fri, 16 May 2014 23:26:02 +0200 Subject: sparc64: fix sparse warning in tsb.c Fix following warning: tsb.c:290:5: warning: symbol 'sysctl_tsb_ratio' was not declared. Should it be static? Add extern declaration in asm/setup.h and remove local declaration in kernel/sysctl.c Signed-off-by: Sam Ravnborg Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..3c202d00f6e8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -152,10 +152,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_SPARC #endif -#ifdef CONFIG_SPARC64 -extern int sysctl_tsb_ratio; -#endif - #ifdef __hppa__ extern int pwrsw_enabled; #endif -- cgit v1.2.3 From 5533e0114425dcdb878f11b291f2727af8667a7c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 19:33:07 -0400 Subject: cgroup: disallow debug controller on the default hierarchy The debug controller, as its name suggests, exposes cgroup core internals to userland to aid debugging. Unfortunately, except for the name, there's no provision to prevent its usage in production configurations and the controller is widely enabled and mounted leaking internal details to userland. Like most other debug information, the information exposed by debug isn't interesting even for debugging itself once the related parts are working reliably. This controller has no reason for existing. This patch implements cgrp_dfl_root_inhibit_ss_mask which can suppress specific subsystems on the default hierarchy and adds the debug subsystem to it so that it can be gradually deprecated as usages move towards the unified hierarchy. 
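The inhibit mask mechanism is just a compile-time bitmask that gets ANDed away wherever the default hierarchy would otherwise expose the suppressed controllers. A small userspace sketch follows; the subsystem IDs and the CONFIG define are illustrative only.

#include <stdio.h>

enum { cpu_cgrp_id, memory_cgrp_id, debug_cgrp_id };	/* illustrative IDs */

#define CONFIG_CGROUP_DEBUG 1	/* pretend the debug controller is built in */

static const unsigned int dfl_inhibit_ss_mask = 0
#ifdef CONFIG_CGROUP_DEBUG
	| (1 << debug_cgrp_id)
#endif
	;

int main(void)
{
	unsigned int subsys_mask = (1 << cpu_cgrp_id) |
				   (1 << memory_cgrp_id) |
				   (1 << debug_cgrp_id);

	/* what the default hierarchy actually shows to userland */
	printf("visible mask: %#x\n", subsys_mask & ~dfl_inhibit_ss_mask);
	return 0;
}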
Signed-off-by: Tejun Heo --- kernel/cgroup.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 082bb842b11a..a5f75ac4e793 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -148,6 +148,13 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; +/* some controllers are not supported in the default hierarchy */ +static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 +#ifdef CONFIG_CGROUP_DEBUG + | (1 << debug_cgrp_id) +#endif + ; + /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -1126,6 +1133,7 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) { struct cgroup_subsys *ss; + unsigned int tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); @@ -1143,7 +1151,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) return -EBUSY; } - ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); + /* skip creating root files on dfl_root for inhibited subsystems */ + tmp_ss_mask = ss_mask; + if (dst_root == &cgrp_dfl_root) + tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; + + ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); if (ret) { if (dst_root != &cgrp_dfl_root) return ret; @@ -2426,7 +2439,8 @@ static int cgroup_root_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->root->subsys_mask); + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & + ~cgrp_dfl_root_inhibit_ss_mask); return 0; } @@ -2564,7 +2578,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (tok[0] == '\0') continue; for_each_subsys(ss, ssid) { - if (ss->disabled || strcmp(tok + 1, ss->name)) + if (ss->disabled || strcmp(tok + 1, ss->name) || + ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) continue; if (*tok == '+') { -- cgit v1.2.3 From 9625ab1727743f6a164df26b7b1eeeced7380b42 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:27 +0800 Subject: workqueue: use manager lock only to protect worker_idr worker_idr is highly bound to managers and is always/only accessed in manager lock context. So we don't need pool->lock for it. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c8411085466f..910d963f6b76 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -124,8 +124,7 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * MG: pool->manager_mutex and pool->lock protected. Writes require both - * locks. Reads can happen under either lock. + * M: pool->manager_mutex protected. * * PL: wq_pool_mutex protected. 
* @@ -164,7 +163,7 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ struct mutex manager_mutex; /* manager exclusion */ - struct idr worker_idr; /* MG: worker IDs and iteration */ + struct idr worker_idr; /* M: worker IDs and iteration */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ @@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, lockdep_is_held(&wq->mutex), \ "sched RCU or wq->mutex should be held") -#ifdef CONFIG_LOCKDEP -#define assert_manager_or_pool_lock(pool) \ - WARN_ONCE(debug_locks && \ - !lockdep_is_held(&(pool)->manager_mutex) && \ - !lockdep_is_held(&(pool)->lock), \ - "pool->manager_mutex or ->lock should be held") -#else -#define assert_manager_or_pool_lock(pool) do { } while (0) -#endif - #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -378,14 +367,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @wi: integer used for iteration * @pool: worker_pool to iterate workers of * - * This must be called with either @pool->manager_mutex or ->lock held. + * This must be called with @pool->manager_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool_worker(worker, wi, pool) \ idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ - if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ + if (({ lockdep_assert_held(&pool->manager_mutex); false; })) { } \ else /** @@ -1725,13 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool) * ID is needed to determine kthread name. Allocate ID first * without installing the pointer. */ - idr_preload(GFP_KERNEL); - spin_lock_irq(&pool->lock); - - id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); - - spin_unlock_irq(&pool->lock); - idr_preload_end(); + id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_KERNEL); if (id < 0) goto fail; @@ -1773,18 +1756,13 @@ static struct worker *create_worker(struct worker_pool *pool) worker->flags |= WORKER_UNBOUND; /* successful, commit the pointer to idr */ - spin_lock_irq(&pool->lock); idr_replace(&pool->worker_idr, worker, worker->id); - spin_unlock_irq(&pool->lock); return worker; fail: - if (id >= 0) { - spin_lock_irq(&pool->lock); + if (id >= 0) idr_remove(&pool->worker_idr, id); - spin_unlock_irq(&pool->lock); - } kfree(worker); return NULL; } -- cgit v1.2.3 From 73eb7fe73ae303996187fff38b1c162f1df0e9d1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:28 +0800 Subject: workqueue: destroy_worker() should destroy idle workers only We used to have the CPU online failure path where a worker is created and then destroyed without being started. A worker was created for the CPU coming online and if the online operation failed the created worker was shut down without being started. But this behavior was changed. The first worker is created and started at the same time for the CPU coming online. It means that we had already ensured in the code that destroy_worker() destroys only idle workers and we don't want to allow it to destroy any non-idle worker in the future. Otherwise, it may be buggy and it may be extremely hard to check. We should force destroy_worker() to destroy only idle workers explicitly. 
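The strengthened sanity check boils down to refusing teardown unless the worker is idle with nothing queued. A userspace sketch of that shape is below; the flag values match the ones visible in the hunk above, while the warn_on() helper and the counters are only stand-ins.

#include <stdio.h>

enum { WORKER_DIE = 1 << 1, WORKER_IDLE = 1 << 2 };

struct worker {
	unsigned int flags;
	int has_current_work;
	int scheduled_empty;
};

static int warn_on(int cond, const char *what)
{
	if (cond)
		fprintf(stderr, "WARN: %s\n", what);
	return cond;
}

static void destroy_worker(struct worker *w, int *nr_workers, int *nr_idle)
{
	if (warn_on(w->has_current_work, "busy worker") ||
	    warn_on(!w->scheduled_empty, "pending scheduled works") ||
	    warn_on(!(w->flags & WORKER_IDLE), "non-idle worker"))
		return;

	(*nr_workers)--;
	(*nr_idle)--;			/* idle is implied, so always adjust */
	w->flags |= WORKER_DIE;
}

int main(void)
{
	struct worker w = { .flags = WORKER_IDLE, .scheduled_empty = 1 };
	int nr_workers = 1, nr_idle = 1;

	destroy_worker(&w, &nr_workers, &nr_idle);
	printf("nr_workers=%d nr_idle=%d die=%d\n",
	       nr_workers, nr_idle, !!(w.flags & WORKER_DIE));
	return 0;
}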
Since destroy_worker() destroys only idle workers, this patch does not change any functionality. We just need to update the comments and the sanity check code. In the sanity check code, we will refuse to destroy the worker if !(worker->flags & WORKER_IDLE). If the worker entered idle which means it is already started, so we remove the check of "worker->flags & WORKER_STARTED", after this removal, WORKER_STARTED is totally unneeded, so we remove WORKER_STARTED too. In the comments for create_worker(), "Create a new worker which is bound..." is changed to "... which is attached..." due to we change the name of this behavior to attaching. tj: Minor description / comment updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 910d963f6b76..189d79be8091 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -73,7 +73,6 @@ enum { POOL_FREEZING = 1 << 3, /* freeze in progress */ /* worker flags */ - WORKER_STARTED = 1 << 0, /* started */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ @@ -1692,9 +1691,8 @@ static struct worker *alloc_worker(void) * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * - * Create a new worker which is bound to @pool. The returned worker - * can be started by calling start_worker() or destroyed using - * destroy_worker(). + * Create a new worker which is attached to @pool. The new worker must be + * started by start_worker(). * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. @@ -1778,7 +1776,6 @@ fail: */ static void start_worker(struct worker *worker) { - worker->flags |= WORKER_STARTED; worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); @@ -1814,7 +1811,8 @@ static int create_and_start_worker(struct worker_pool *pool) * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * - * Destroy @worker and adjust @pool stats accordingly. + * Destroy @worker and adjust @pool stats accordingly. The worker should + * be idle. * * CONTEXT: * spin_lock_irq(pool->lock) which is released and regrabbed. @@ -1828,13 +1826,12 @@ static void destroy_worker(struct worker *worker) /* sanity check frenzy */ if (WARN_ON(worker->current_work) || - WARN_ON(!list_empty(&worker->scheduled))) + WARN_ON(!list_empty(&worker->scheduled)) || + WARN_ON(!(worker->flags & WORKER_IDLE))) return; - if (worker->flags & WORKER_STARTED) - pool->nr_workers--; - if (worker->flags & WORKER_IDLE) - pool->nr_idle--; + pool->nr_workers--; + pool->nr_idle--; /* * Once WORKER_DIE is set, the kworker may destroy itself at any -- cgit v1.2.3 From 60f5a4bcf852b5dec698b08cd34efc302ea72f2b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:29 +0800 Subject: workqueue: async worker destruction worker destruction includes these parts of code: adjust pool's stats remove the worker from idle list detach the worker from the pool kthread_stop() to wait for the worker's task exit free the worker struct We can find out that there is no essential work to do after kthread_stop(), which means destroy_worker() doesn't need to wait for the worker's task exit, so we can remove kthread_stop() and free the worker struct in the worker exiting path. 
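As a rough userspace analogue (pthreads; every name below is a loose stand-in, not the kernel API), the shape of the change looks like this: nobody joins the dying worker, the worker frees its own struct on the exit path, and pool teardown waits on a completion-style hand-shake of the kind the next paragraph describes.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pool {
	pthread_mutex_t attach_mutex;
	int nr_attached;
	int draining;				/* teardown in progress */
	pthread_cond_t detach_completion;
};

struct worker { struct pool *pool; int id; };

static void worker_detach_from_pool(struct worker *w)
{
	struct pool *p = w->pool;

	pthread_mutex_lock(&p->attach_mutex);
	p->nr_attached--;
	if (p->draining && !p->nr_attached)
		pthread_cond_signal(&p->detach_completion);
	pthread_mutex_unlock(&p->attach_mutex);
}

static void *worker_thread(void *arg)
{
	struct worker *w = arg;

	/* ... process work items, then told to die ... */
	worker_detach_from_pool(w);
	free(w);				/* freed on the exit path, not by the pool */
	return NULL;
}

int main(void)
{
	struct pool p = {
		PTHREAD_MUTEX_INITIALIZER, 0, 0, PTHREAD_COND_INITIALIZER
	};
	pthread_t tid[3];

	for (int i = 0; i < 3; i++) {
		struct worker *w = malloc(sizeof(*w));

		w->pool = &p;
		w->id = i;
		pthread_mutex_lock(&p.attach_mutex);
		p.nr_attached++;
		pthread_mutex_unlock(&p.attach_mutex);
		pthread_create(&tid[i], NULL, worker_thread, w);
		pthread_detach(tid[i]);		/* no join, i.e. no kthread_stop() */
	}

	/* pool teardown: wait until every worker has detached itself */
	pthread_mutex_lock(&p.attach_mutex);
	p.draining = 1;
	while (p.nr_attached)
		pthread_cond_wait(&p.detach_completion, &p.attach_mutex);
	pthread_mutex_unlock(&p.attach_mutex);

	puts("all workers detached and freed themselves");
	return 0;
}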
However, put_unbound_pool() still needs to sync the all the workers' destruction before destroying the pool; otherwise, the workers may access to the invalid pool when they are exiting. So we also move the code of "detach the worker" to the exiting path and let put_unbound_pool() to sync with this code via detach_completion. The code of "detach the worker" is wrapped in a new function "worker_detach_from_pool()" although worker_detach_from_pool() is only called once (in worker_thread()) after this patch, but we need to wrap it for these reasons: 1) The code of "detach the worker" is not short enough to unfold them in worker_thread(). 2) the name of "worker_detach_from_pool()" is self-comment, and we add some comments above the function. 3) it will be shared by rescuer in later patch which allows rescuer and normal thread use the same attach/detach frameworks. The worker id is freed when detaching which happens before the worker is fully dead, but this id of the dying worker may be re-used for a new worker, so the dying worker's task name is changed to "worker/dying" to avoid two or several workers having the same name. Since "detach the worker" is moved out from destroy_worker(), destroy_worker() doesn't require manager_mutex, so the "lockdep_assert_held(&pool->manager_mutex)" in destroy_worker() is removed, and destroy_worker() is not protected by manager_mutex in put_unbound_pool(). tj: Minor description updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 62 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 189d79be8091..c458d73022bb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -163,6 +163,7 @@ struct worker_pool { struct mutex manager_arb; /* manager arbitration */ struct mutex manager_mutex; /* manager exclusion */ struct idr worker_idr; /* M: worker IDs and iteration */ + struct completion *detach_completion; /* all workers detached */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ @@ -1687,6 +1688,30 @@ static struct worker *alloc_worker(void) return worker; } +/** + * worker_detach_from_pool() - detach a worker from its pool + * @worker: worker which is attached to its pool + * @pool: the pool @worker is attached to + * + * Undo the attaching which had been done in create_worker(). The caller + * worker shouldn't access to the pool after detached except it has other + * reference to the pool. + */ +static void worker_detach_from_pool(struct worker *worker, + struct worker_pool *pool) +{ + struct completion *detach_completion = NULL; + + mutex_lock(&pool->manager_mutex); + idr_remove(&pool->worker_idr, worker->id); + if (idr_is_empty(&pool->worker_idr)) + detach_completion = pool->detach_completion; + mutex_unlock(&pool->manager_mutex); + + if (detach_completion) + complete(detach_completion); +} + /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to @@ -1815,13 +1840,12 @@ static int create_and_start_worker(struct worker_pool *pool) * be idle. * * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock). 
*/ static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; - lockdep_assert_held(&pool->manager_mutex); lockdep_assert_held(&pool->lock); /* sanity check frenzy */ @@ -1833,24 +1857,9 @@ static void destroy_worker(struct worker *worker) pool->nr_workers--; pool->nr_idle--; - /* - * Once WORKER_DIE is set, the kworker may destroy itself at any - * point. Pin to ensure the task stays until we're done with it. - */ - get_task_struct(worker->task); - list_del_init(&worker->entry); worker->flags |= WORKER_DIE; - - idr_remove(&pool->worker_idr, worker->id); - - spin_unlock_irq(&pool->lock); - - kthread_stop(worker->task); - put_task_struct(worker->task); - kfree(worker); - - spin_lock_irq(&pool->lock); + wake_up_process(worker->task); } static void idle_worker_timeout(unsigned long __pool) @@ -2289,6 +2298,10 @@ woke_up: spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); worker->task->flags &= ~PF_WQ_WORKER; + + set_task_comm(worker->task, "kworker/dying"); + worker_detach_from_pool(worker, pool); + kfree(worker); return 0; } @@ -3560,6 +3573,7 @@ static void rcu_free_pool(struct rcu_head *rcu) */ static void put_unbound_pool(struct worker_pool *pool) { + DECLARE_COMPLETION_ONSTACK(detach_completion); struct worker *worker; lockdep_assert_held(&wq_pool_mutex); @@ -3583,15 +3597,21 @@ static void put_unbound_pool(struct worker_pool *pool) * manager_mutex. */ mutex_lock(&pool->manager_arb); - mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); + spin_lock_irq(&pool->lock); while ((worker = first_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); + + mutex_lock(&pool->manager_mutex); + if (!idr_is_empty(&pool->worker_idr)) + pool->detach_completion = &detach_completion; mutex_unlock(&pool->manager_mutex); + + if (pool->detach_completion) + wait_for_completion(pool->detach_completion); + mutex_unlock(&pool->manager_arb); /* shut down the timers */ -- cgit v1.2.3 From 3347fc9f36e7e5d3ebe504fc4034745b5d8971d3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:30 +0800 Subject: workqueue: destroy worker directly in the idle timeout handler Since destroy_worker() doesn't need to sleep nor require manager_mutex, destroy_worker() can be directly called in the idle timeout handler, it helps us remove POOL_MANAGE_WORKERS and maybe_destroy_worker() and simplify the manage_workers() After POOL_MANAGE_WORKERS is removed, worker_thread() doesn't need to test whether it needs to manage after processed works. So we can remove the test branch. Signed-off-by: Lai Jiangshan --- kernel/workqueue.c | 67 ++++-------------------------------------------------- 1 file changed, 5 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c458d73022bb..938836bc6207 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,7 +68,6 @@ enum { * manager_mutex to avoid changing binding state while * create_worker() is in progress. */ - POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ POOL_FREEZING = 1 << 3, /* freeze in progress */ @@ -752,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool) return need_more_worker(pool) && !may_start_working(pool); } -/* Do I need to be the manager? 
*/ -static bool need_to_manage_workers(struct worker_pool *pool) -{ - return need_to_create_worker(pool) || - (pool->flags & POOL_MANAGE_WORKERS); -} - /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { @@ -1868,7 +1860,7 @@ static void idle_worker_timeout(unsigned long __pool) spin_lock_irq(&pool->lock); - if (too_many_workers(pool)) { + while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; @@ -1876,13 +1868,12 @@ static void idle_worker_timeout(unsigned long __pool) worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; - if (time_before(jiffies, expires)) + if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); - else { - /* it's been idle for too long, wake up manager */ - pool->flags |= POOL_MANAGE_WORKERS; - wake_up_worker(pool); + break; } + + destroy_worker(worker); } spin_unlock_irq(&pool->lock); @@ -2000,44 +1991,6 @@ restart: return true; } -/** - * maybe_destroy_worker - destroy workers which have been idle for a while - * @pool: pool to destroy workers for - * - * Destroy @pool workers which have been idle for longer than - * IDLE_WORKER_TIMEOUT. - * - * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed - * multiple times. Called only from manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. - */ -static bool maybe_destroy_workers(struct worker_pool *pool) -{ - bool ret = false; - - while (too_many_workers(pool)) { - struct worker *worker; - unsigned long expires; - - worker = list_entry(pool->idle_list.prev, struct worker, entry); - expires = worker->last_active + IDLE_WORKER_TIMEOUT; - - if (time_before(jiffies, expires)) { - mod_timer(&pool->idle_timer, expires); - break; - } - - destroy_worker(worker); - ret = true; - } - - return ret; -} - /** * manage_workers - manage worker pool * @worker: self @@ -2101,13 +2054,6 @@ static bool manage_workers(struct worker *worker) ret = true; } - pool->flags &= ~POOL_MANAGE_WORKERS; - - /* - * Destroy and then create so that may_start_working() is true - * on return. - */ - ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); mutex_unlock(&pool->manager_mutex); @@ -2349,9 +2295,6 @@ recheck: worker_set_flags(worker, WORKER_PREP, false); sleep: - if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) - goto recheck; - /* * pool->lock is held and there's no work to process and no need to * manage, sleep. Workers are woken up only while holding -- cgit v1.2.3 From da028469ba173e9c634b6ecf80bb0c69c7d1024d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:31 +0800 Subject: workqueue: separate iteration role from worker_idr worker_idr has the iteration (iterating for attached workers) and worker ID duties. These two duties don't have to be tied together. We can separate them and use a list for tracking attached workers and iteration. Before this separation, it wasn't possible to add rescuer workers to worker_idr due to rescuer workers couldn't allocate ID dynamically because ID-allocation depends on memory-allocation, which rescuer can't depend on. After separation, we can easily add the rescuer workers to the list for iteration without any memory-allocation. It is required when we attach the rescuer worker to the pool in later patch. tj: Minor description updates. 
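The split can be sketched in userspace with a minimal intrusive list standing in for the kernel's list.h (all names and the simplified macro are illustrative). The point is that iteration needs nothing but the list node embedded in each worker, so a rescuer can be added to the list without allocating anything.

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name)	{ &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

struct worker {
	int id;				/* still comes from an ID allocator */
	struct list_head node;		/* anchored at pool->workers */
};

struct pool { struct list_head workers; };

#define for_each_pool_worker(w, pool)					\
	for (struct list_head *_p = (pool)->workers.next;		\
	     _p != &(pool)->workers &&					\
		((w) = container_of(_p, struct worker, node));		\
	     _p = _p->next)

int main(void)
{
	struct pool pool = { .workers = LIST_HEAD_INIT(pool.workers) };
	struct worker a = { .id = 0 }, b = { .id = 1 }, *w;

	list_add_tail(&a.node, &pool.workers);	/* attach */
	list_add_tail(&b.node, &pool.workers);	/* attach (e.g. a rescuer) */

	for_each_pool_worker(w, &pool)
		printf("worker %d is attached\n", w->id);
	return 0;
}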
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 28 +++++++++++++++------------- kernel/workqueue_internal.h | 2 ++ 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 938836bc6207..a6c38b717957 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -161,7 +161,8 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ struct mutex manager_mutex; /* manager exclusion */ - struct idr worker_idr; /* M: worker IDs and iteration */ + struct idr worker_idr; /* M: worker IDs */ + struct list_head workers; /* M: attached workers */ struct completion *detach_completion; /* all workers detached */ struct workqueue_attrs *attrs; /* I: worker attributes */ @@ -363,7 +364,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, /** * for_each_pool_worker - iterate through all workers of a worker_pool * @worker: iteration cursor - * @wi: integer used for iteration * @pool: worker_pool to iterate workers of * * This must be called with @pool->manager_mutex. @@ -371,8 +371,8 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * The if/else clause exists only for the lockdep assertion and can be * ignored. */ -#define for_each_pool_worker(worker, wi, pool) \ - idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ +#define for_each_pool_worker(worker, pool) \ + list_for_each_entry((worker), &(pool)->workers, node) \ if (({ lockdep_assert_held(&pool->manager_mutex); false; })) { } \ else @@ -1674,6 +1674,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); + INIT_LIST_HEAD(&worker->node); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1696,7 +1697,8 @@ static void worker_detach_from_pool(struct worker *worker, mutex_lock(&pool->manager_mutex); idr_remove(&pool->worker_idr, worker->id); - if (idr_is_empty(&pool->worker_idr)) + list_del(&worker->node); + if (list_empty(&pool->workers)) detach_completion = pool->detach_completion; mutex_unlock(&pool->manager_mutex); @@ -1772,6 +1774,8 @@ static struct worker *create_worker(struct worker_pool *pool) /* successful, commit the pointer to idr */ idr_replace(&pool->worker_idr, worker, worker->id); + /* successful, attach the worker to the pool */ + list_add_tail(&worker->node, &pool->workers); return worker; @@ -3483,6 +3487,7 @@ static int init_worker_pool(struct worker_pool *pool) mutex_init(&pool->manager_arb); mutex_init(&pool->manager_mutex); idr_init(&pool->worker_idr); + INIT_LIST_HEAD(&pool->workers); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; @@ -3548,7 +3553,7 @@ static void put_unbound_pool(struct worker_pool *pool) spin_unlock_irq(&pool->lock); mutex_lock(&pool->manager_mutex); - if (!idr_is_empty(&pool->worker_idr)) + if (!list_empty(&pool->workers)) pool->detach_completion = &detach_completion; mutex_unlock(&pool->manager_mutex); @@ -4533,7 +4538,6 @@ static void wq_unbind_fn(struct work_struct *work) int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; - int wi; for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); @@ -4548,7 +4552,7 @@ static void wq_unbind_fn(struct work_struct *work) * before the last CPU down must be on the cpu. After * this, they may become diasporas. 
*/ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; @@ -4594,7 +4598,6 @@ static void wq_unbind_fn(struct work_struct *work) static void rebind_workers(struct worker_pool *pool) { struct worker *worker; - int wi; lockdep_assert_held(&pool->manager_mutex); @@ -4605,13 +4608,13 @@ static void rebind_workers(struct worker_pool *pool) * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); - for_each_pool_worker(worker, wi, pool) { + for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; /* @@ -4663,7 +4666,6 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) { static cpumask_t cpumask; struct worker *worker; - int wi; lockdep_assert_held(&pool->manager_mutex); @@ -4677,7 +4679,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) return; /* as we're called from CPU_ONLINE, the following shouldn't fail */ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 7e2204db0b1a..8888e0672442 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -37,6 +37,8 @@ struct worker { struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ /* L: for rescuers */ + struct list_head node; /* M: anchored at pool->workers */ + /* M: runs through worker->node */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ -- cgit v1.2.3 From 7cda9aae0596d871a8d7a6888d7b447c60e5ab30 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:32 +0800 Subject: workqueue: convert worker_idr to worker_ida We no longer iterate workers via worker_idr and worker_idr is used only for allocating/freeing ID, so we can convert it to worker_ida. By using ida_simple_get/remove(), worker_ida doesn't require external synchronization, so we don't need manager_mutex to protect it and the ID-removal code is allowed to be moved out from worker_detach_from_pool(). In a later patch, worker_detach_from_pool() will be used in rescuers which don't have IDs, so we move the ID-removal code out from worker_detach_from_pool() into worker_thread(). tj: Minor description updates. 
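A userspace caricature of why the conversion simplifies locking follows. The id_get()/id_put() helpers below only mimic the role of ida_simple_get()/ida_simple_remove() and are not the kernel API: the allocator synchronizes itself, the ID is used solely to build the task name, and the dying worker drops it on its own exit path.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t id_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long id_bits;	/* one bit per allocated ID */

static int id_get(void)
{
	int id = -1;

	pthread_mutex_lock(&id_lock);	/* internal lock: no pool lock needed */
	for (int i = 0; i < 64; i++) {
		if (!(id_bits & (1ULL << i))) {
			id_bits |= 1ULL << i;
			id = i;
			break;
		}
	}
	pthread_mutex_unlock(&id_lock);
	return id;
}

static void id_put(int id)
{
	pthread_mutex_lock(&id_lock);
	id_bits &= ~(1ULL << id);
	pthread_mutex_unlock(&id_lock);
}

int main(void)
{
	char name[32];
	int id = id_get();		/* only needed to name the worker */

	snprintf(name, sizeof(name), "kworker/%d", id);
	printf("created %s\n", name);

	id_put(id);			/* released on the worker's exit path */
	return 0;
}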
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a6c38b717957..092f2098746d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -161,10 +161,11 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ struct mutex manager_mutex; /* manager exclusion */ - struct idr worker_idr; /* M: worker IDs */ struct list_head workers; /* M: attached workers */ struct completion *detach_completion; /* all workers detached */ + struct ida worker_ida; /* worker IDs for task name */ + struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ @@ -1696,7 +1697,6 @@ static void worker_detach_from_pool(struct worker *worker, struct completion *detach_completion = NULL; mutex_lock(&pool->manager_mutex); - idr_remove(&pool->worker_idr, worker->id); list_del(&worker->node); if (list_empty(&pool->workers)) detach_completion = pool->detach_completion; @@ -1727,11 +1727,8 @@ static struct worker *create_worker(struct worker_pool *pool) lockdep_assert_held(&pool->manager_mutex); - /* - * ID is needed to determine kthread name. Allocate ID first - * without installing the pointer. - */ - id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_KERNEL); + /* ID is needed to determine kthread name */ + id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); if (id < 0) goto fail; @@ -1772,8 +1769,6 @@ static struct worker *create_worker(struct worker_pool *pool) if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; - /* successful, commit the pointer to idr */ - idr_replace(&pool->worker_idr, worker, worker->id); /* successful, attach the worker to the pool */ list_add_tail(&worker->node, &pool->workers); @@ -1781,7 +1776,7 @@ static struct worker *create_worker(struct worker_pool *pool) fail: if (id >= 0) - idr_remove(&pool->worker_idr, id); + ida_simple_remove(&pool->worker_ida, id); kfree(worker); return NULL; } @@ -2250,6 +2245,7 @@ woke_up: worker->task->flags &= ~PF_WQ_WORKER; set_task_comm(worker->task, "kworker/dying"); + ida_simple_remove(&pool->worker_ida, worker->id); worker_detach_from_pool(worker, pool); kfree(worker); return 0; @@ -3486,9 +3482,9 @@ static int init_worker_pool(struct worker_pool *pool) mutex_init(&pool->manager_arb); mutex_init(&pool->manager_mutex); - idr_init(&pool->worker_idr); INIT_LIST_HEAD(&pool->workers); + ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; @@ -3503,7 +3499,7 @@ static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); - idr_destroy(&pool->worker_idr); + ida_destroy(&pool->worker_ida); free_workqueue_attrs(pool->attrs); kfree(pool); } -- cgit v1.2.3 From 4d757c5c81edba2052aae10d5b36dfcb9902b141 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:33 +0800 Subject: workqueue: narrow the protection range of manager_mutex In create_worker(), as pool->worker_ida now uses ida_simple_get()/ida_simple_put() and doesn't require external synchronization, it doesn't need manager_mutex. struct worker allocation and kthread allocation are not visible by any one before attached, so they don't need manager_mutex either. 
The above operations are before the attaching operation which attaches the worker to the pool. Between attaching and starting the worker, the worker is already attached to the pool, so the cpu hotplug will handle cpu-binding for the worker correctly and we don't need the manager_mutex after attaching. The conclusion is that only the attaching operation needs manager_mutex, so we narrow the protection section of manager_mutex in create_worker(). Some comments about manager_mutex are removed, because we will rename it to attach_mutex and add worker_attach_to_pool() later which will be self-explanatory. tj: Minor description updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 092f2098746d..d6b31ff60c52 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1725,8 +1725,6 @@ static struct worker *create_worker(struct worker_pool *pool) int id = -1; char id_buf[16]; - lockdep_assert_held(&pool->manager_mutex); - /* ID is needed to determine kthread name */ id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); if (id < 0) @@ -1755,6 +1753,8 @@ static struct worker *create_worker(struct worker_pool *pool) /* prevent userland from meddling with cpumask of workqueue workers */ worker->task->flags |= PF_NO_SETAFFINITY; + mutex_lock(&pool->manager_mutex); + /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any * online CPUs. It'll be re-applied when any of the CPUs come up. @@ -1762,7 +1762,7 @@ static struct worker *create_worker(struct worker_pool *pool) set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * The caller is responsible for ensuring %POOL_DISASSOCIATED + * The pool->manager_mutex ensures %POOL_DISASSOCIATED * remains stable across this function. See the comments above the * flag definition for details. */ @@ -1772,6 +1772,8 @@ static struct worker *create_worker(struct worker_pool *pool) /* successful, attach the worker to the pool */ list_add_tail(&worker->node, &pool->workers); + mutex_unlock(&pool->manager_mutex); + return worker; fail: @@ -1809,8 +1811,6 @@ static int create_and_start_worker(struct worker_pool *pool) { struct worker *worker; - mutex_lock(&pool->manager_mutex); - worker = create_worker(pool); if (worker) { spin_lock_irq(&pool->lock); @@ -1818,8 +1818,6 @@ static int create_and_start_worker(struct worker_pool *pool) spin_unlock_irq(&pool->lock); } - mutex_unlock(&pool->manager_mutex); - return worker ? 0 : -ENOMEM; } @@ -2019,8 +2017,6 @@ static bool manage_workers(struct worker *worker) bool ret = false; /* - * Managership is governed by two mutexes - manager_arb and - * manager_mutex. manager_arb handles arbitration of manager role. * Anyone who successfully grabs manager_arb wins the arbitration * and becomes the manager. mutex_trylock() on pool->manager_arb * failure while holding pool->lock reliably indicates that someone @@ -2029,33 +2025,12 @@ static bool manage_workers(struct worker *worker) * grabbing manager_arb is responsible for actually performing * manager duties. If manager_arb is grabbed and released without * actual management, the pool may stall indefinitely. - * - * manager_mutex is used for exclusion of actual management - * operations. The holder of manager_mutex can be sure that none - * of management operations, including creation and destruction of - * workers, won't take place until the mutex is released. 
Because - * manager_mutex doesn't interfere with manager role arbitration, - * it is guaranteed that the pool's management, while may be - * delayed, won't be disturbed by someone else grabbing - * manager_mutex. */ if (!mutex_trylock(&pool->manager_arb)) return ret; - /* - * With manager arbitration won, manager_mutex would be free in - * most cases. trylock first without dropping @pool->lock. - */ - if (unlikely(!mutex_trylock(&pool->manager_mutex))) { - spin_unlock_irq(&pool->lock); - mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); - ret = true; - } - ret |= maybe_create_worker(pool); - mutex_unlock(&pool->manager_mutex); mutex_unlock(&pool->manager_arb); return ret; } -- cgit v1.2.3 From 92f9c5c40cc67ffcc5ac7f55fdbd6ae8afc7e0b4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:34 +0800 Subject: workqueue: rename manager_mutex to attach_mutex manager_mutex is only used to protect the attaching for the pool and the pool->workers list. It protects the pool->workers and operations based on this list, such as: cpu-binding for the workers in the pool->workers the operations to set/clear WORKER_UNBOUND So let's rename manager_mutex to attach_mutex to better reflect its role. This patch is a pure rename. tj: Minor command and description updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 44 ++++++++++++++++++++++---------------------- kernel/workqueue_internal.h | 4 ++-- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d6b31ff60c52..38b9ea7c204c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -65,7 +65,7 @@ enum { * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED should be flipped only while holding - * manager_mutex to avoid changing binding state while + * attach_mutex to avoid changing binding state while * create_worker() is in progress. */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ @@ -122,7 +122,7 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * M: pool->manager_mutex protected. + * A: pool->attach_mutex protected. * * PL: wq_pool_mutex protected. * @@ -160,8 +160,8 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ - struct mutex manager_mutex; /* manager exclusion */ - struct list_head workers; /* M: attached workers */ + struct mutex attach_mutex; /* attach/detach exclusion */ + struct list_head workers; /* A: attached workers */ struct completion *detach_completion; /* all workers detached */ struct ida worker_ida; /* worker IDs for task name */ @@ -367,14 +367,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @worker: iteration cursor * @pool: worker_pool to iterate workers of * - * This must be called with @pool->manager_mutex. + * This must be called with @pool->attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. 
*/ #define for_each_pool_worker(worker, pool) \ list_for_each_entry((worker), &(pool)->workers, node) \ - if (({ lockdep_assert_held(&pool->manager_mutex); false; })) { } \ + if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \ else /** @@ -1696,11 +1696,11 @@ static void worker_detach_from_pool(struct worker *worker, { struct completion *detach_completion = NULL; - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); list_del(&worker->node); if (list_empty(&pool->workers)) detach_completion = pool->detach_completion; - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); if (detach_completion) complete(detach_completion); @@ -1753,7 +1753,7 @@ static struct worker *create_worker(struct worker_pool *pool) /* prevent userland from meddling with cpumask of workqueue workers */ worker->task->flags |= PF_NO_SETAFFINITY; - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any @@ -1762,7 +1762,7 @@ static struct worker *create_worker(struct worker_pool *pool) set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * The pool->manager_mutex ensures %POOL_DISASSOCIATED + * The pool->attach_mutex ensures %POOL_DISASSOCIATED * remains stable across this function. See the comments above the * flag definition for details. */ @@ -1772,7 +1772,7 @@ static struct worker *create_worker(struct worker_pool *pool) /* successful, attach the worker to the pool */ list_add_tail(&worker->node, &pool->workers); - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); return worker; @@ -3456,7 +3456,7 @@ static int init_worker_pool(struct worker_pool *pool) (unsigned long)pool); mutex_init(&pool->manager_arb); - mutex_init(&pool->manager_mutex); + mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); ida_init(&pool->worker_ida); @@ -3513,7 +3513,7 @@ static void put_unbound_pool(struct worker_pool *pool) /* * Become the manager and destroy all workers. Grabbing * manager_arb prevents @pool's workers from blocking on - * manager_mutex. + * attach_mutex. */ mutex_lock(&pool->manager_arb); @@ -3523,10 +3523,10 @@ static void put_unbound_pool(struct worker_pool *pool) WARN_ON(pool->nr_workers || pool->nr_idle); spin_unlock_irq(&pool->lock); - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); if (!list_empty(&pool->workers)) pool->detach_completion = &detach_completion; - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); if (pool->detach_completion) wait_for_completion(pool->detach_completion); @@ -4513,11 +4513,11 @@ static void wq_unbind_fn(struct work_struct *work) for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); spin_lock_irq(&pool->lock); /* - * We've blocked all manager operations. Make all workers + * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * except for the ones which are still executing works from * before the last CPU down must be on the cpu. 
After @@ -4529,7 +4529,7 @@ static void wq_unbind_fn(struct work_struct *work) pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); /* * Call schedule() so that we cross rq->lock and thus can @@ -4570,7 +4570,7 @@ static void rebind_workers(struct worker_pool *pool) { struct worker *worker; - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should @@ -4638,7 +4638,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) static cpumask_t cpumask; struct worker *worker; - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) @@ -4683,7 +4683,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); if (pool->cpu == cpu) { spin_lock_irq(&pool->lock); @@ -4695,7 +4695,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, restore_unbound_workers_cpumask(pool, cpu); } - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); } /* update NUMA affinity of unbound workqueues */ diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 8888e0672442..45215870ac6c 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -37,8 +37,8 @@ struct worker { struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ /* L: for rescuers */ - struct list_head node; /* M: anchored at pool->workers */ - /* M: runs through worker->node */ + struct list_head node; /* A: anchored at pool->workers */ + /* A: runs through worker->node */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ -- cgit v1.2.3 From 4736cbf7a4b2a6bbb50a809a6933fb7eb29dc38f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:35 +0800 Subject: workqueue: separate pool-attaching code out from create_worker() Currently, the code to attach a new worker to its pool is embedded in create_worker(). Separating this code out will make the codes clearer and will allow rescuers to share the code path later. tj: Description and comment updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 61 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 38b9ea7c204c..b1de6ac4a0e3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,7 +66,7 @@ enum { * * Note that DISASSOCIATED should be flipped only while holding * attach_mutex to avoid changing binding state while - * create_worker() is in progress. + * worker_attach_to_pool() is in progress. */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ POOL_FREEZING = 1 << 3, /* freeze in progress */ @@ -1682,14 +1682,47 @@ static struct worker *alloc_worker(void) return worker; } +/** + * worker_attach_to_pool() - attach a worker to a pool + * @worker: worker to be attached + * @pool: the target pool + * + * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and + * cpu-binding of @worker are kept coordinated with the pool across + * cpu-[un]hotplugs. 
+ */ +static void worker_attach_to_pool(struct worker *worker, + struct worker_pool *pool) +{ + mutex_lock(&pool->attach_mutex); + + /* + * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any + * online CPUs. It'll be re-applied when any of the CPUs come up. + */ + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + + /* + * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains + * stable across this function. See the comments above the + * flag definition for details. + */ + if (pool->flags & POOL_DISASSOCIATED) + worker->flags |= WORKER_UNBOUND; + + list_add_tail(&worker->node, &pool->workers); + + mutex_unlock(&pool->attach_mutex); +} + /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool * @pool: the pool @worker is attached to * - * Undo the attaching which had been done in create_worker(). The caller - * worker shouldn't access to the pool after detached except it has other - * reference to the pool. + * Undo the attaching which had been done in worker_attach_to_pool(). The + * caller worker shouldn't access to the pool after detached except it has + * other reference to the pool. */ static void worker_detach_from_pool(struct worker *worker, struct worker_pool *pool) @@ -1753,26 +1786,8 @@ static struct worker *create_worker(struct worker_pool *pool) /* prevent userland from meddling with cpumask of workqueue workers */ worker->task->flags |= PF_NO_SETAFFINITY; - mutex_lock(&pool->attach_mutex); - - /* - * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any - * online CPUs. It'll be re-applied when any of the CPUs come up. - */ - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - - /* - * The pool->attach_mutex ensures %POOL_DISASSOCIATED - * remains stable across this function. See the comments above the - * flag definition for details. - */ - if (pool->flags & POOL_DISASSOCIATED) - worker->flags |= WORKER_UNBOUND; - /* successful, attach the worker to the pool */ - list_add_tail(&worker->node, &pool->workers); - - mutex_unlock(&pool->attach_mutex); + worker_attach_to_pool(worker, pool); return worker; -- cgit v1.2.3 From 51697d393922eb643e78ac5db86e8fa5a45b469a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:36 +0800 Subject: workqueue: use generic attach/detach routine for rescuers There are several problems with the code that rescuers use to bind themselve to the target pool's cpumask. 1) It is very different from how the normal workers bind to cpumask, increasing code complexity and maintenance overhead. 2) The code of cpu-binding for rescuers is complicated. 3) If one or more cpu hotplugs happen while a rescuer is processing its scheduled work items, the rescuer may not stay bound to the cpumask of the pool. This is an allowed behavior, but is still hairy. It will be better if the cpumask of the rescuer is always kept synchronized with the pool across cpu hotplugs. Using generic attach/detach routine will solve the above problems and results in much simpler code. tj: Minor description updates. 
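The resulting rescuer loop body can be pictured with a small pthread sketch (illustrative names; the mayday plumbing and the actual cpumask handling are omitted): attach through the shared helper, drain the pool's pending work under its lock, then detach through the shared helper again.

#include <pthread.h>
#include <stdio.h>

struct pool {
	pthread_mutex_t attach_mutex;
	pthread_mutex_t lock;		/* models pool->lock */
	int nr_attached;
	int pending;			/* work items waiting */
};

static void attach(struct pool *p)
{
	pthread_mutex_lock(&p->attach_mutex);
	p->nr_attached++;		/* cpumask binding would happen here */
	pthread_mutex_unlock(&p->attach_mutex);
}

static void detach(struct pool *p)
{
	pthread_mutex_lock(&p->attach_mutex);
	p->nr_attached--;
	pthread_mutex_unlock(&p->attach_mutex);
}

static void rescuer_handle_mayday(struct pool *p)
{
	attach(p);			/* same path as a normal worker */

	pthread_mutex_lock(&p->lock);
	while (p->pending) {		/* stand-in for processing scheduled works */
		p->pending--;
		printf("rescued one work item\n");
	}
	pthread_mutex_unlock(&p->lock);

	detach(p);			/* and the same exit path */
}

int main(void)
{
	struct pool p = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 0, 2
	};

	rescuer_handle_mayday(&p);
	return 0;
}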
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 74 ++++++------------------------------------------------ 1 file changed, 8 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b1de6ac4a0e3..6603923b7ede 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1603,70 +1603,6 @@ static void worker_leave_idle(struct worker *worker) list_del_init(&worker->entry); } -/** - * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it - * @pool: target worker_pool - * - * Bind %current to the cpu of @pool if it is associated and lock @pool. - * - * Works which are scheduled while the cpu is online must at least be - * scheduled to a worker which is bound to the cpu so that if they are - * flushed from cpu callbacks while cpu is going down, they are - * guaranteed to execute on the cpu. - * - * This function is to be used by unbound workers and rescuers to bind - * themselves to the target cpu and may race with cpu going down or - * coming online. kthread_bind() can't be used because it may put the - * worker to already dead cpu and set_cpus_allowed_ptr() can't be used - * verbatim as it's best effort and blocking and pool may be - * [dis]associated in the meantime. - * - * This function tries set_cpus_allowed() and locks pool and verifies the - * binding against %POOL_DISASSOCIATED which is set during - * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker - * enters idle state or fetches works without dropping lock, it can - * guarantee the scheduling requirement described in the first paragraph. - * - * CONTEXT: - * Might sleep. Called without any lock but returns with pool->lock - * held. - * - * Return: - * %true if the associated pool is online (@worker is successfully - * bound), %false if offline. - */ -static bool worker_maybe_bind_and_lock(struct worker_pool *pool) -__acquires(&pool->lock) -{ - while (true) { - /* - * The following call may fail, succeed or succeed - * without actually migrating the task to the cpu if - * it races with cpu hotunplug operation. Verify - * against POOL_DISASSOCIATED. - */ - if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(current, pool->attrs->cpumask); - - spin_lock_irq(&pool->lock); - if (pool->flags & POOL_DISASSOCIATED) - return false; - if (task_cpu(current) == pool->cpu && - cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) - return true; - spin_unlock_irq(&pool->lock); - - /* - * We've raced with CPU hot[un]plug. Give it a breather - * and retry migration. cond_resched() is required here; - * otherwise, we might deadlock against cpu_stop trying to - * bring down the CPU on non-preemptive kernel. - */ - cpu_relax(); - cond_resched(); - } -} - static struct worker *alloc_worker(void) { struct worker *worker; @@ -2361,8 +2297,9 @@ repeat: spin_unlock_irq(&wq_mayday_lock); - /* migrate to the target cpu if possible */ - worker_maybe_bind_and_lock(pool); + worker_attach_to_pool(rescuer, pool); + + spin_lock_irq(&pool->lock); rescuer->pool = pool; /* @@ -2375,6 +2312,11 @@ repeat: move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); + spin_unlock_irq(&pool->lock); + + worker_detach_from_pool(rescuer, pool); + + spin_lock_irq(&pool->lock); /* * Put the reference grabbed by send_mayday(). 
@pool won't -- cgit v1.2.3 From ccdb594653b1c0d5b0f3e6f8bf764c544ba0606d Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 20 May 2014 17:10:36 -0500 Subject: tracing: Eliminate duplicate TRACE_GRAPH_PRINT_xx defines Eliminate duplicate TRACE_GRAPH_PRINT_xx defines in trace_functions_graph.c that are already in trace.h. Add TRACE_GRAPH_PRINT_IRQS to trace.h, which is the only one that is missing. Link: http://lkml.kernel.org/p/20140520221031.8359.24733.stgit@beardog.cce.hp.com Signed-off-by: Robert Elliott Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 9 --------- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3b3e09e61f33..68050633255e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -724,6 +724,7 @@ extern unsigned long trace_flags; #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b86dd4d8c6a6..af08dd531cb8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -38,15 +38,6 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 -/* Flag options */ -#define TRACE_GRAPH_PRINT_OVERRUN 0x1 -#define TRACE_GRAPH_PRINT_CPU 0x2 -#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 -#define TRACE_GRAPH_PRINT_PROC 0x8 -#define TRACE_GRAPH_PRINT_DURATION 0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_IRQS 0x40 - static unsigned int max_depth; static struct tracer_opt trace_opts[] = { -- cgit v1.2.3 From 607e3a29203633eaec7334f2f58f9653df788a06 Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 20 May 2014 17:10:51 -0500 Subject: tracing: Add funcgraph_tail option to print function name after closing braces In the function-graph tracer, add a funcgraph_tail option to print the function name on all } lines, not just functions whose first line is no longer in the trace buffer. If a function calls other traced functions, its total time appears on its } line. This change allows grep to be used to determine the function for which the line corresponds. Update Documentation/trace/ftrace.txt to describe this new option. 
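For illustration only (the function name and timings below are made up, not taken from this patch's testing): without funcgraph-tail a nested function's return is printed as a bare closing brace,

	 1)   1.837 us    |  }

while with funcgraph-tail enabled the same line carries the function name in a "} /* func */" style tail, so the function a brace belongs to can be found with a plain grep:

	 1)   1.837 us    |  } /* kmem_cache_free */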
Link: http://lkml.kernel.org/p/20140520221041.8359.6782.stgit@beardog.cce.hp.com Signed-off-by: Robert Elliott Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 68050633255e..217207ad60b3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -725,6 +725,7 @@ extern unsigned long trace_flags; #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_IRQS 0x40 +#define TRACE_GRAPH_PRINT_TAIL 0x80 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index af08dd531cb8..4de3e57f723c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -55,11 +55,13 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, /* Display interrupts */ { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, + /* Display function name after trailing } */ + { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - /* Don't display overruns and proc by default */ + /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts @@ -1167,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * If the return function does not have a matching entry, * then the entry was lost. Instead of just printing * the '}' and letting the user guess what function this - * belongs to, write out the function name. + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. */ - if (func_match) { + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { ret = trace_seq_puts(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3 From 5fe821a9dee241fa450703ab7015d970ee0cfb8d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 19 May 2014 14:56:14 -0700 Subject: net: filter: cleanup invocation of internal BPF Kernel API for classic BPF socket filters is: sk_unattached_filter_create() - validate classic BPF, convert, JIT SK_RUN_FILTER() - run it sk_unattached_filter_destroy() - destroy socket filter Cleanup internal BPF kernel API as following: sk_filter_select_runtime() - final step of internal BPF creation. Try to JIT internal BPF program, if JIT is not available select interpreter SK_RUN_FILTER() - run it sk_filter_free() - free internal BPF program Disallow direct calls to BPF interpreter. Execution of the BPF program should be done with SK_RUN_FILTER() macro. Example of internal BPF create, run, destroy: struct sk_filter *fp; fp = kzalloc(sk_filter_size(prog_len), GFP_KERNEL); memcpy(fp->insni, prog, prog_len * sizeof(fp->insni[0])); fp->len = prog_len; sk_filter_select_runtime(fp); SK_RUN_FILTER(fp, ctx); sk_filter_free(fp); Sockets, seccomp, testsuite, tracing are using different ways to populate sk_filter, so first steps of program creation are not common. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/seccomp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7e02d624cc50..1036b6f2fded 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -273,10 +273,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) atomic_set(&filter->usage, 1); filter->prog->len = new_len; - filter->prog->bpf_func = (void *)sk_run_filter_int_seccomp; - /* JIT internal BPF into native HW instructions */ - bpf_int_jit_compile(filter->prog); + sk_filter_select_runtime(filter->prog); /* * If there is an existing filter, make it the prev and don't drop its @@ -340,7 +338,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - bpf_jit_free(freeme->prog); + sk_filter_free(freeme->prog); kfree(freeme); } } -- cgit v1.2.3 From 1037de36edae30f1ddc0a2532decd50f92ac4901 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 22 May 2014 16:44:07 +0800 Subject: workqueue: rename first_worker() to first_idle_worker() first_worker() actually returns the first idle workers, the name first_idle_worker() which is self-commnet will be better. All the callers of first_worker() expect it returns an idle worker, the name first_idle_worker() with "idle" notation makes reviewers happier. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6603923b7ede..dd107b83f45f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -773,8 +773,8 @@ static bool too_many_workers(struct worker_pool *pool) * Wake up functions. */ -/* Return the first worker. Safe with preemption disabled */ -static struct worker *first_worker(struct worker_pool *pool) +/* Return the first idle worker. Safe with preemption disabled */ +static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) return NULL; @@ -793,7 +793,7 @@ static struct worker *first_worker(struct worker_pool *pool) */ static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_worker(pool); + struct worker *worker = first_idle_worker(pool); if (likely(worker)) wake_up_process(worker->task); @@ -867,7 +867,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) */ if (atomic_dec_and_test(&pool->nr_running) && !list_empty(&pool->worklist)) - to_wakeup = first_worker(pool); + to_wakeup = first_idle_worker(pool); return to_wakeup ? to_wakeup->task : NULL; } @@ -3475,7 +3475,7 @@ static void put_unbound_pool(struct worker_pool *pool) mutex_lock(&pool->manager_arb); spin_lock_irq(&pool->lock); - while ((worker = first_worker(pool))) + while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); spin_unlock_irq(&pool->lock); -- cgit v1.2.3 From 74b414ead1133972817d3ce7b934356150d03a7d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 22 May 2014 19:01:16 +0800 Subject: workqueue: remove the confusing POOL_FREEZING Currently, the global freezing state is propagated to worker_pools via POOL_FREEZING and then to each workqueue; however, the middle step - propagation through worker_pools - can be skipped as long as one or more max_active adjustments happens for each workqueue after the update to the global state is visible. 
The global workqueue freezing state and the max_active adjustments during workqueue creation and [un]freezing are serialized with wq_pool_mutex, so it's trivial to guarantee that max_actives stay in sync with global freezing state. POOL_FREEZING is unnecessary and makes the code more confusing and complicates freeze_workqueues_begin() and thaw_workqueues() by requiring them to walk through all pools. Remove POOL_FREEZING and use workqueue_freezing directly instead. tj: Description and comment updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dd107b83f45f..bc3c18892b7d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -69,7 +69,6 @@ enum { * worker_attach_to_pool() is in progress. */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ - POOL_FREEZING = 1 << 3, /* freeze in progress */ /* worker flags */ WORKER_DIE = 1 << 1, /* die die die */ @@ -3533,9 +3532,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; - if (workqueue_freezing) - pool->flags |= POOL_FREEZING; - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); @@ -3642,7 +3638,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) spin_lock_irq(&pwq->pool->lock); - if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { + /* + * During [un]freezing, the caller is responsible for ensuring that + * this function is called at least once after @workqueue_freezing + * is updated and visible. + */ + if (!freezable || !workqueue_freezing) { pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->delayed_works) && @@ -4751,24 +4752,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu); */ void freeze_workqueues_begin(void) { - struct worker_pool *pool; struct workqueue_struct *wq; struct pool_workqueue *pwq; - int pi; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; - /* set FREEZING */ - for_each_pool(pool, pi) { - spin_lock_irq(&pool->lock); - WARN_ON_ONCE(pool->flags & POOL_FREEZING); - pool->flags |= POOL_FREEZING; - spin_unlock_irq(&pool->lock); - } - list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) @@ -4838,21 +4829,13 @@ void thaw_workqueues(void) { struct workqueue_struct *wq; struct pool_workqueue *pwq; - struct worker_pool *pool; - int pi; mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; - /* clear FREEZING */ - for_each_pool(pool, pi) { - spin_lock_irq(&pool->lock); - WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); - pool->flags &= ~POOL_FREEZING; - spin_unlock_irq(&pool->lock); - } + workqueue_freezing = false; /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { @@ -4862,7 +4845,6 @@ void thaw_workqueues(void) mutex_unlock(&wq->mutex); } - workqueue_freezing = false; out_unlock: mutex_unlock(&wq_pool_mutex); } -- cgit v1.2.3 From 015af06e103fa47af29ada0f564301c81d4973b2 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Tue, 27 May 2014 14:28:59 -0400 Subject: kernel/workqueue.c: pr_warning/pr_warn & printk/pr_info This commit did an incorrect printk->pr_info conversion. If we were converting to pr_info() we should lose the log_level parameter. 
The problem is that this is called (indirectly) by show_regs_print_info(), which is called with various log_levels (from _INFO clear to _EMERG). So we leave it as a printk() call so the desired log_level is applied. Not a full revert, as the other half of the patch is correct. Signed-off-by: Valdis Kletnieks Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc3c18892b7d..90a0fa592b72 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4440,7 +4440,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - pr_info("%sWorkqueue: %s %pf", log_lvl, name, fn); + printk("%sWorkqueue: %s %pf", log_lvl, name, fn); if (desc[0]) pr_cont(" (%s)", desc); pr_cont("\n"); -- cgit v1.2.3 From 2184db46e425c2b84a783eeead0e2b24ce67cd7f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 28 May 2014 13:14:40 -0400 Subject: tracing: Print nasty banner when trace_printk() is in use trace_printk() is used to debug fast paths within the kernel. Places that gets called in any context (interrupt or NMI) or thousands of times a second. Something you do not want to do with a printk(). In order to make it completely lockless as it needs a temporary buffer to handle some of the string formatting, a page is created per cpu for every context (four per cpu; normal, softirq, irq, NMI). Since trace_printk() should only be used for debugging purposes, there's no reason to waste memory on these buffers on a production system. That means, trace_printk() should never be used unless a developer is debugging their kernel. There's macro magic to allocate the buffers if trace_printk() is used anywhere in the kernel. To help enforce that trace_printk() isn't used outside of development, when it is used, a nasty banner is displayed on bootup (or when a module is loaded that uses trace_printk() and the kernel core does not). Here's the banner: ********************************************************** ** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE ** ** ** ** trace_printk() being used. Allocating extra memory. ** ** ** ** This means that this is a DEBUG kernel and it is ** ** unsafe for produciton use. ** ** ** ** If you see this message and you are not debugging ** ** the kernel, report this immediately to your vendor! ** ** ** ** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE ** ********************************************************** That should hopefully keep developers from trying to sneak in a trace_printk() or two. Link: http://lkml.kernel.org/p/20140528131440.2283213c@gandalf.local.home Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 05431696b10c..eb228b9de170 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1975,7 +1975,21 @@ void trace_printk_init_buffers(void) if (alloc_percpu_trace_buffer()) return; - pr_info("ftrace: Allocated trace_printk buffers\n"); + /* trace_printk() is for debug use only. Don't use it in production. */ + + pr_warning("\n**********************************************************\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("** **\n"); + pr_warning("** trace_printk() being used. Allocating extra memory. 
**\n"); + pr_warning("** **\n"); + pr_warning("** This means that this is a DEBUG kernel and it is **\n"); + pr_warning("** unsafe for produciton use. **\n"); + pr_warning("** **\n"); + pr_warning("** If you see this message and you are not debugging **\n"); + pr_warning("** the kernel, report this immediately to your vendor! **\n"); + pr_warning("** **\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); -- cgit v1.2.3 From 81dc9f0ef21e40114cc895894c7acf3055f6d1fb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 29 May 2014 22:49:07 -0400 Subject: tracing: Add tracepoint benchmark tracepoint In order to help benchmark the time tracepoints take, a new config option is added called CONFIG_TRACEPOINT_BENCHMARK. When this option is set a tracepoint is created called "benchmark:benchmark_event". When the tracepoint is enabled, it kicks off a kernel thread that goes into an infinite loop (calling cond_sched() to let other tasks run), and calls the tracepoint. Each iteration will record the time it took to write to the tracepoint and the next iteration that data will be passed to the tracepoint itself. That is, the tracepoint will report the time it took to do the previous tracepoint. The string written to the tracepoint is a static string of 128 bytes to keep the time the same. The initial string is simply a write of "START". The second string records the cold cache time of the first write which is not added to the rest of the calculations. As it is a tight loop, it benchmarks as hot cache. That's fine because we care most about hot paths that are probably in cache already. An example of the output: START first=3672 [COLD CACHED] last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 30 +++++++ kernel/trace/Makefile | 3 + kernel/trace/trace_benchmark.c | 176 +++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_benchmark.h | 41 ++++++++++ 4 files changed, 250 insertions(+) create mode 100644 kernel/trace/trace_benchmark.c create mode 100644 kernel/trace/trace_benchmark.h (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8639819f6cef..d4409356f40d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -535,6 +535,36 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. +config TRACEPOINT_BENCHMARK + bool "Add tracepoint that benchmarks tracepoints" + help + This option creates the tracepoint "benchmark:benchmark_event". + When the tracepoint is enabled, it kicks off a kernel thread that + goes into an infinite loop (calling cond_sched() to let other tasks + run), and calls the tracepoint. Each iteration will record the time + it took to write to the tracepoint and the next iteration that + data will be passed to the tracepoint itself. That is, the tracepoint + will report the time it took to do the previous tracepoint. + The string written to the tracepoint is a static string of 128 bytes + to keep the time the same. 
The initial string is simply a write of + "START". The second string records the cold cache time of the first + write which is not added to the rest of the calculations. + + As it is a tight loop, it benchmarks as hot cache. That's fine because + we care most about hot paths that are probably in cache already. + + An example of the output: + + START + first=3672 [COLD CACHED] + last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 + last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 + last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 + last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 + last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 + last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 + + config RING_BUFFER_BENCHMARK tristate "Ring buffer benchmark stress tester" depends on RING_BUFFER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1378e84fbe39..2611613f14f1 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o @@ -62,4 +63,6 @@ endif obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o + libftrace-y := ftrace.o diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 000000000000..7dc1c42dfee2 --- /dev/null +++ b/kernel/trace/trace_benchmark.c @@ -0,0 +1,176 @@ +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "trace_benchmark.h" + +static struct task_struct *bm_event_thread; + +static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; + +static u64 bm_total; +static u64 bm_totalsq; +static u64 bm_last; +static u64 bm_max; +static u64 bm_min; +static u64 bm_first; +static s64 bm_cnt; + +/* + * This gets called in a loop recording the time it took to write + * the tracepoint. What it writes is the time statistics of the last + * tracepoint write. As there is nothing to write the first time + * it simply writes "START". As the first write is cold cache and + * the rest is hot, we save off that time in bm_first and it is + * reported as "first", which is shown in the second write to the + * tracepoint. The "first" field is writen within the statics from + * then on but never changes. + */ +static void trace_do_benchmark(void) +{ + u64 start; + u64 stop; + u64 delta; + s64 stddev; + u64 seed; + u64 seedsq; + u64 last_seed; + unsigned int avg; + unsigned int std = 0; + + /* Only run if the tracepoint is actually active */ + if (!trace_benchmark_event_enabled()) + return; + + local_irq_disable(); + start = trace_clock_local(); + trace_benchmark_event(bm_str); + stop = trace_clock_local(); + local_irq_enable(); + + bm_cnt++; + + delta = stop - start; + + /* + * The first read is cold cached, keep it separate from the + * other calculations. 
+ */ + if (bm_cnt == 1) { + bm_first = delta; + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "first=%llu [COLD CACHED]", bm_first); + return; + } + + bm_last = delta; + + bm_total += delta; + bm_totalsq += delta * delta; + + if (delta > bm_max) + bm_max = delta; + if (!bm_min || delta < bm_min) + bm_min = delta; + + if (bm_cnt > 1) { + /* + * Apply Welford's method to calculate standard deviation: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; + do_div(stddev, bm_cnt); + do_div(stddev, bm_cnt - 1); + } else + stddev = 0; + + delta = bm_total; + do_div(delta, bm_cnt); + avg = delta; + + if (stddev > 0) { + int i = 0; + /* + * stddev is the square of standard deviation but + * we want the actualy number. Use the average + * as our seed to find the std. + * + * The next try is: + * x = (x + N/x) / 2 + * + * Where N is the squared number to find the square + * root of. + */ + seed = avg; + do { + last_seed = seed; + seed = stddev; + if (!last_seed) + break; + do_div(seed, last_seed); + seed += last_seed; + do_div(seed, 2); + } while (i++ < 10 && last_seed != seed); + + std = seed; + } + + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, avg, std, stddev); +} + +static int benchmark_event_kthread(void *arg) +{ + /* sleep a bit to make sure the tracepoint gets activated */ + msleep(100); + + while (!kthread_should_stop()) { + + trace_do_benchmark(); + + /* + * We don't go to sleep, but let others + * run as well. + */ + cond_resched(); + } + + return 0; +} + +/* + * When the benchmark tracepoint is enabled, it calls this + * function and the thread that calls the tracepoint is created. + */ +void trace_benchmark_reg(void) +{ + bm_event_thread = kthread_run(benchmark_event_kthread, + NULL, "event_benchmark"); + WARN_ON(!bm_event_thread); +} + +/* + * When the benchmark tracepoint is disabled, it calls this + * function and the thread that calls the tracepoint is deleted + * and all the numbers are reset. + */ +void trace_benchmark_unreg(void) +{ + if (!bm_event_thread) + return; + + kthread_stop(bm_event_thread); + + strcpy(bm_str, "START"); + bm_total = 0; + bm_totalsq = 0; + bm_last = 0; + bm_max = 0; + bm_min = 0; + bm_cnt = 0; + /* bm_first doesn't need to be reset but reset it anyway */ + bm_first = 0; +} diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 000000000000..3c1df1df4e29 --- /dev/null +++ b/kernel/trace/trace_benchmark.h @@ -0,0 +1,41 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM benchmark + +#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BENCHMARK_H + +#include + +extern void trace_benchmark_reg(void); +extern void trace_benchmark_unreg(void); + +#define BENCHMARK_EVENT_STRLEN 128 + +TRACE_EVENT_FN(benchmark_event, + + TP_PROTO(const char *str), + + TP_ARGS(str), + + TP_STRUCT__entry( + __array( char, str, BENCHMARK_EVENT_STRLEN ) + ), + + TP_fast_assign( + memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + ), + + TP_printk("%s", __entry->str), + + trace_benchmark_reg, trace_benchmark_unreg +); + +#endif /* _TRACE_BENCHMARK_H */ + +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace_benchmark + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 42584c81c5adc1737a6fe0687facc5e62a5dc8c1 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Thu, 20 Feb 2014 17:44:31 +0900 Subject: tracing: Have saved_cmdlines use the seq_read infrastructure Current tracing_saved_cmdlines_read() implementation is naive; It allocates a large buffer, constructs output data to that buffer for each read operation, and then copies a portion of the buffer to the user space buffer. This has several issues such as slow memory allocation, high CPU usage, and even corruption of the output data. The seq_read infrastructure is made to handle this type of work. By converting it to use seq_read() the code becomes smaller, simplified, as well as correct. Link: http://lkml.kernel.org/p/20140220084431.3839.51793.stgit@yunodevel Signed-off-by: Hidehiro Kawai Cc: Namhyung Kim Cc: Masami Hiramatsu Signed-off-by: Yoshihiro YUNOMAE Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: linux-kernel@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 89 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eb228b9de170..1855dae73f34 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3699,55 +3699,74 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; -static ssize_t -tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { - char *buf_comm; - char *file_buf; - char *buf; - int len = 0; - int pid; - int i; + unsigned int *ptr = v; - file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); - if (!file_buf) - return -ENOMEM; + if (*pos || m->count) + ptr++; - buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); - if (!buf_comm) { - kfree(file_buf); - return -ENOMEM; - } + (*pos)++; + + for (; ptr < &map_cmdline_to_pid[SAVED_CMDLINES]; ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; - buf = file_buf; + return ptr; + } - for (i = 0; i < SAVED_CMDLINES; i++) { - int r; + return NULL; +} - pid = map_cmdline_to_pid[i]; - if (pid == -1 || pid == NO_CMDLINE_MAP) - continue; +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; - trace_find_cmdline(pid, buf_comm); - r = sprintf(buf, "%d %s\n", pid, buf_comm); - buf += r; - len += r; + v = &map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; } - len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); + return v; +} - kfree(file_buf); - kfree(buf_comm); +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ +} - return len; +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + unsigned int *pid = v; + + trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); } static const struct file_operations 
tracing_saved_cmdlines_fops = { - .open = tracing_open_generic, - .read = tracing_saved_cmdlines_read, - .llseek = generic_file_llseek, + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; static ssize_t -- cgit v1.2.3 From 379cfdac37923653c9d4242d10052378b7563005 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 30 May 2014 09:42:39 -0400 Subject: tracing: Try again for saved cmdline if failed due to locking In order to prevent the saved cmdline cache from being filled when tracing is not active, the comms are only recorded after a trace event is recorded. The problem is, a comm can fail to be recorded if the trace_cmdline_lock is held. That lock is taken via a trylock to allow it to happen from any context (including NMI). If the lock fails to be taken, the comm is skipped. No big deal, as we will try again later. But! Because of the code that was added to only record after an event, we may not try again later as the recording is made as a oneshot per event per CPU. Only disable the recording of the comm if the comm is actually recorded. Fixes: 7ffbd48d5cab "tracing: Cache comms only after an event occurred" Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1855dae73f34..22a902e2ded9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1441,12 +1441,12 @@ static void tracing_stop_tr(struct trace_array *tr) void trace_stop_cmdline_recording(void); -static void trace_save_cmdline(struct task_struct *tsk) +static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) - return; + return 0; /* * It's not the end of the world if we don't get @@ -1455,7 +1455,7 @@ static void trace_save_cmdline(struct task_struct *tsk) * so if we miss here, then better luck next time. */ if (!arch_spin_trylock(&trace_cmdline_lock)) - return; + return 0; idx = map_pid_to_cmdline[tsk->pid]; if (idx == NO_CMDLINE_MAP) { @@ -1480,6 +1480,8 @@ static void trace_save_cmdline(struct task_struct *tsk) memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); arch_spin_unlock(&trace_cmdline_lock); + + return 1; } void trace_find_cmdline(int pid, char comm[]) @@ -1521,9 +1523,8 @@ void tracing_record_cmdline(struct task_struct *tsk) if (!__this_cpu_read(trace_cmdline_save)) return; - __this_cpu_write(trace_cmdline_save, false); - - trace_save_cmdline(tsk); + if (trace_save_cmdline(tsk)) + __this_cpu_write(trace_cmdline_save, false); } void -- cgit v1.2.3 From 4c27e756bc019ec1c11232893af036fdae720a97 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 30 May 2014 10:49:46 -0400 Subject: tracing: Move locking of trace_cmdline_lock into start/stop seq calls With the conversion of the saved_cmdlines output to use seq_read, there is now a race between accessing the values of the saved_cmdlines and the writing to them. The trace_cmdline_lock needs to be taken at the start and stop of the seq calls. A new __trace_find_cmdline() call is created to allow for the look up to happen without taking the lock. 
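The shape of the fix is the standard way to hold a lock across a seq_file iteration: take it in ->start(), drop it in ->stop(), and let ->show() use lock-free helpers such as the new __trace_find_cmdline(). seq_read() pairs every ->start() with a ->stop() call, so the lock cannot leak on a short or failed read. Below is a minimal, self-contained sketch of that pattern with hypothetical example_* names; it uses an ordinary spinlock for clarity, whereas the real code in the diff below uses arch_spin_lock() with preemption disabled because trace_cmdline_lock is also taken from tracing context.

	#include <linux/kernel.h>
	#include <linux/seq_file.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(example_lock);
	static int example_records[4] = { 11, 22, 33, 44 };	/* toy data */

	static void *example_seq_start(struct seq_file *m, loff_t *pos)
	{
		spin_lock(&example_lock);		/* held for the whole read pass */
		if (*pos >= ARRAY_SIZE(example_records))
			return NULL;			/* NULL ends the iteration */
		return &example_records[*pos];
	}

	static void *example_seq_next(struct seq_file *m, void *v, loff_t *pos)
	{
		(*pos)++;				/* still under example_lock */
		if (*pos >= ARRAY_SIZE(example_records))
			return NULL;
		return &example_records[*pos];
	}

	static void example_seq_stop(struct seq_file *m, void *v)
	{
		spin_unlock(&example_lock);		/* always pairs with ->start() */
	}

	static int example_seq_show(struct seq_file *m, void *v)
	{
		/* the lock is already held, so lock-free lookups are safe here */
		seq_printf(m, "%d\n", *(int *)v);
		return 0;
	}

	static const struct seq_operations example_seq_ops = {
		.start = example_seq_start,
		.next  = example_seq_next,
		.stop  = example_seq_stop,
		.show  = example_seq_show,
	};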
Fixes: 42584c81c5ad tracing: Have saved_cmdlines use the seq_read infrastructure Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 22a902e2ded9..626dbfde5d56 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1484,7 +1484,7 @@ static int trace_save_cmdline(struct task_struct *tsk) return 1; } -void trace_find_cmdline(int pid, char comm[]) +static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; @@ -1503,13 +1503,19 @@ void trace_find_cmdline(int pid, char comm[]) return; } - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) strcpy(comm, saved_cmdlines[map]); else strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + __trace_find_cmdline(pid, comm); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); @@ -3724,6 +3730,9 @@ static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) void *v; loff_t l = 0; + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + v = &map_cmdline_to_pid[0]; while (l <= *pos) { v = saved_cmdlines_next(m, v, &l); @@ -3736,6 +3745,8 @@ static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) static void saved_cmdlines_stop(struct seq_file *m, void *v) { + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); } static int saved_cmdlines_show(struct seq_file *m, void *v) @@ -3743,7 +3754,7 @@ static int saved_cmdlines_show(struct seq_file *m, void *v) char buf[TASK_COMM_LEN]; unsigned int *pid = v; - trace_find_cmdline(*pid, buf); + __trace_find_cmdline(*pid, buf); seq_printf(m, "%d %s\n", *pid, buf); return 0; } -- cgit v1.2.3 From 3480593131e0b781287dae0139bf7ccee7cba7ff Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 May 2014 10:22:50 +0200 Subject: net: filter: get rid of BPF_S_* enum This patch finally allows us to get rid of the BPF_S_* enum. Currently, the code performs unnecessary encode and decode workarounds in seccomp and filter migration itself when a filter is being attached in order to overcome BPF_S_* encoding which is not used anymore by the new interpreter resp. JIT compilers. Keeping it around would mean that also in future we would need to extend and maintain this enum and related encoders/decoders. We can get rid of all that and save us these operations during filter attaching. Naturally, also JIT compilers need to be updated by this. Before JIT conversion is being done, each compiler checks if A is being loaded at startup to obtain information if it needs to emit instructions to clear A first. Since BPF extensions are a subset of BPF_LD | BPF_{W,H,B} | BPF_ABS variants, case statements for extensions can be removed at that point. To ease and minimalize code changes in the classic JITs, we have introduced bpf_anc_helper(). Tested with test_bpf on x86_64 (JIT, int), s390x (JIT, int), arm (JIT, int), i368 (int), ppc64 (JIT, int); for sparc we unfortunately didn't have access, but changes are analogous to the rest. Joint work with Alexei Starovoitov. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Mircea Gherzan Cc: Kees Cook Acked-by: Chema Gonzalez Signed-off-by: David S. 
Miller --- kernel/seccomp.c | 83 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1036b6f2fded..44e69483b383 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -103,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) u32 k = ftest->k; switch (code) { - case BPF_S_LD_W_ABS: + case BPF_LD | BPF_W | BPF_ABS: ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; - case BPF_S_LD_W_LEN: + case BPF_LD | BPF_W | BPF_LEN: ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; - case BPF_S_LDX_W_LEN: + case BPF_LDX | BPF_W | BPF_LEN: ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. */ - case BPF_S_RET_K: - case BPF_S_RET_A: - case BPF_S_ALU_ADD_K: - case BPF_S_ALU_ADD_X: - case BPF_S_ALU_SUB_K: - case BPF_S_ALU_SUB_X: - case BPF_S_ALU_MUL_K: - case BPF_S_ALU_MUL_X: - case BPF_S_ALU_DIV_X: - case BPF_S_ALU_AND_K: - case BPF_S_ALU_AND_X: - case BPF_S_ALU_OR_K: - case BPF_S_ALU_OR_X: - case BPF_S_ALU_XOR_K: - case BPF_S_ALU_XOR_X: - case BPF_S_ALU_LSH_K: - case BPF_S_ALU_LSH_X: - case BPF_S_ALU_RSH_K: - case BPF_S_ALU_RSH_X: - case BPF_S_ALU_NEG: - case BPF_S_LD_IMM: - case BPF_S_LDX_IMM: - case BPF_S_MISC_TAX: - case BPF_S_MISC_TXA: - case BPF_S_ALU_DIV_K: - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: - case BPF_S_ST: - case BPF_S_STX: - case BPF_S_JMP_JA: - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_K: - case BPF_S_JMP_JSET_X: - sk_decode_filter(ftest, ftest); + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + case BPF_MISC | BPF_TAX: + case BPF_MISC | BPF_TXA: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + case BPF_JMP | BPF_JA: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: continue; default: return -EINVAL; -- cgit v1.2.3 From e3172181946fadd3bd0e4c362931094ba87e5718 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 2 Jun 2014 13:33:12 +0900 Subject: tracing: Print max callstack on stacktrace bug While I played with my own feature(ex, something on the way to reclaim), the kernel would easily oops. I guessed that the reason had to do with stack overflow and wanted to prove it. 
I discovered the stack tracer which proved to be very useful for me but the kernel would oops before my user program gather the information via "watch cat /sys/kernel/debug/tracing/stack_trace" so I couldn't get any message from that. What I needed was to have the stack tracer emit the kernel stack usage before it does the oops so I could find what was hogging the stack. This patch shows the callstack of max stack usage right before an oops so we can find a culprit. So, the result is as follows. [ 1116.522206] init: lightdm main process (1246) terminated with status 1 [ 1119.922916] init: failsafe-x main process (1272) terminated with status 1 [ 3887.728131] kworker/u24:1 (6637) used greatest stack depth: 256 bytes left [ 6397.629227] cc1 (9554) used greatest stack depth: 128 bytes left [ 7174.467392] Depth Size Location (47 entries) [ 7174.467392] ----- ---- -------- [ 7174.467785] 0) 7248 256 get_page_from_freelist+0xa7/0x920 [ 7174.468506] 1) 6992 352 __alloc_pages_nodemask+0x1cd/0xb20 [ 7174.469224] 2) 6640 8 alloc_pages_current+0x10f/0x1f0 [ 7174.469413] 3) 6632 168 new_slab+0x2c5/0x370 [ 7174.469413] 4) 6464 8 __slab_alloc+0x3a9/0x501 [ 7174.469413] 5) 6456 80 __kmalloc+0x1cb/0x200 [ 7174.469413] 6) 6376 376 vring_add_indirect+0x36/0x200 [ 7174.469413] 7) 6000 144 virtqueue_add_sgs+0x2e2/0x320 [ 7174.469413] 8) 5856 288 __virtblk_add_req+0xda/0x1b0 [ 7174.469413] 9) 5568 96 virtio_queue_rq+0xd3/0x1d0 [ 7174.469413] 10) 5472 128 __blk_mq_run_hw_queue+0x1ef/0x440 [ 7174.469413] 11) 5344 16 blk_mq_run_hw_queue+0x35/0x40 [ 7174.469413] 12) 5328 96 blk_mq_insert_requests+0xdb/0x160 [ 7174.469413] 13) 5232 112 blk_mq_flush_plug_list+0x12b/0x140 [ 7174.469413] 14) 5120 112 blk_flush_plug_list+0xc7/0x220 [ 7174.469413] 15) 5008 64 io_schedule_timeout+0x88/0x100 [ 7174.469413] 16) 4944 128 mempool_alloc+0x145/0x170 [ 7174.469413] 17) 4816 96 bio_alloc_bioset+0x10b/0x1d0 [ 7174.469413] 18) 4720 48 get_swap_bio+0x30/0x90 [ 7174.469413] 19) 4672 160 __swap_writepage+0x150/0x230 [ 7174.469413] 20) 4512 32 swap_writepage+0x42/0x90 [ 7174.469413] 21) 4480 320 shrink_page_list+0x676/0xa80 [ 7174.469413] 22) 4160 208 shrink_inactive_list+0x262/0x4e0 [ 7174.469413] 23) 3952 304 shrink_lruvec+0x3e1/0x6a0 [ 7174.469413] 24) 3648 80 shrink_zone+0x3f/0x110 [ 7174.469413] 25) 3568 128 do_try_to_free_pages+0x156/0x4c0 [ 7174.469413] 26) 3440 208 try_to_free_pages+0xf7/0x1e0 [ 7174.469413] 27) 3232 352 __alloc_pages_nodemask+0x783/0xb20 [ 7174.469413] 28) 2880 8 alloc_pages_current+0x10f/0x1f0 [ 7174.469413] 29) 2872 200 __page_cache_alloc+0x13f/0x160 [ 7174.469413] 30) 2672 80 find_or_create_page+0x4c/0xb0 [ 7174.469413] 31) 2592 80 ext4_mb_load_buddy+0x1e9/0x370 [ 7174.469413] 32) 2512 176 ext4_mb_regular_allocator+0x1b7/0x460 [ 7174.469413] 33) 2336 128 ext4_mb_new_blocks+0x458/0x5f0 [ 7174.469413] 34) 2208 256 ext4_ext_map_blocks+0x70b/0x1010 [ 7174.469413] 35) 1952 160 ext4_map_blocks+0x325/0x530 [ 7174.469413] 36) 1792 384 ext4_writepages+0x6d1/0xce0 [ 7174.469413] 37) 1408 16 do_writepages+0x23/0x40 [ 7174.469413] 38) 1392 96 __writeback_single_inode+0x45/0x2e0 [ 7174.469413] 39) 1296 176 writeback_sb_inodes+0x2ad/0x500 [ 7174.469413] 40) 1120 80 __writeback_inodes_wb+0x9e/0xd0 [ 7174.469413] 41) 1040 160 wb_writeback+0x29b/0x350 [ 7174.469413] 42) 880 208 bdi_writeback_workfn+0x11c/0x480 [ 7174.469413] 43) 672 144 process_one_work+0x1d2/0x570 [ 7174.469413] 44) 528 112 worker_thread+0x116/0x370 [ 7174.469413] 45) 416 240 kthread+0xf3/0x110 [ 7174.469413] 46) 176 176 ret_from_fork+0x7c/0xb0 [ 
7174.469413] ------------[ cut here ]------------ [ 7174.469413] kernel BUG at kernel/trace/trace_stack.c:174! [ 7174.469413] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC [ 7174.469413] Dumping ftrace buffer: [ 7174.469413] (ftrace buffer empty) [ 7174.469413] Modules linked in: [ 7174.469413] CPU: 0 PID: 440 Comm: kworker/u24:0 Not tainted 3.14.0+ #212 [ 7174.469413] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 7174.469413] Workqueue: writeback bdi_writeback_workfn (flush-253:0) [ 7174.469413] task: ffff880034170000 ti: ffff880029518000 task.ti: ffff880029518000 [ 7174.469413] RIP: 0010:[] [] stack_trace_call+0x2de/0x340 [ 7174.469413] RSP: 0000:ffff880029518290 EFLAGS: 00010046 [ 7174.469413] RAX: 0000000000000030 RBX: 000000000000002f RCX: 0000000000000000 [ 7174.469413] RDX: 0000000000000000 RSI: 000000000000002f RDI: ffffffff810b7159 [ 7174.469413] RBP: ffff8800295182f0 R08: ffffffffffffffff R09: 0000000000000000 [ 7174.469413] R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff82768dfc [ 7174.469413] R13: 000000000000f2e8 R14: ffff8800295182b8 R15: 00000000000000f8 [ 7174.469413] FS: 0000000000000000(0000) GS:ffff880037c00000(0000) knlGS:0000000000000000 [ 7174.469413] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 7174.469413] CR2: 00002acd0b994000 CR3: 0000000001c0b000 CR4: 00000000000006f0 [ 7174.469413] Stack: [ 7174.469413] 0000000000000000 ffffffff8114fdb7 0000000000000087 0000000000001c50 [ 7174.469413] 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 7174.469413] 0000000000000002 ffff880034170000 ffff880034171028 0000000000000000 [ 7174.469413] Call Trace: [ 7174.469413] [] ? get_page_from_freelist+0xa7/0x920 [ 7174.469413] [] ftrace_call+0x5/0x2f [ 7174.469413] [] ? next_zones_zonelist+0x5/0x70 [ 7174.469413] [] ? __bfs+0x11a/0x270 [ 7174.469413] [] ? next_zones_zonelist+0x5/0x70 [ 7174.469413] [] ? get_page_from_freelist+0xa7/0x920 [ 7174.469413] [] ? alloc_pages_current+0x10f/0x1f0 [ 7174.469413] [] __alloc_pages_nodemask+0x1cd/0xb20 [ 7174.469413] [] ? check_irq_usage+0x96/0xe0 [ 7174.469413] [] ? ftrace_call+0x5/0x2f [ 7174.469413] [] alloc_pages_current+0x10f/0x1f0 [ 7174.469413] [] ? new_slab+0x2c5/0x370 [ 7174.469413] [] new_slab+0x2c5/0x370 [ 7174.469413] [] ? ftrace_call+0x5/0x2f [ 7174.469413] [] __slab_alloc+0x3a9/0x501 [ 7174.469413] [] ? __kmalloc+0x1cb/0x200 [ 7174.469413] [] ? vring_add_indirect+0x36/0x200 [ 7174.469413] [] ? vring_add_indirect+0x36/0x200 [ 7174.469413] [] ? vring_add_indirect+0x36/0x200 [ 7174.469413] [] __kmalloc+0x1cb/0x200 [ 7174.469413] [] ? vring_add_indirect+0x200/0x200 [ 7174.469413] [] vring_add_indirect+0x36/0x200 [ 7174.469413] [] virtqueue_add_sgs+0x2e2/0x320 [ 7174.469413] [] __virtblk_add_req+0xda/0x1b0 [ 7174.469413] [] virtio_queue_rq+0xd3/0x1d0 [ 7174.469413] [] __blk_mq_run_hw_queue+0x1ef/0x440 [ 7174.469413] [] blk_mq_run_hw_queue+0x35/0x40 [ 7174.469413] [] blk_mq_insert_requests+0xdb/0x160 [ 7174.469413] [] blk_mq_flush_plug_list+0x12b/0x140 [ 7174.469413] [] blk_flush_plug_list+0xc7/0x220 [ 7174.469413] [] ? _raw_spin_unlock_irqrestore+0x3f/0x70 [ 7174.469413] [] io_schedule_timeout+0x88/0x100 [ 7174.469413] [] ? io_schedule_timeout+0x5/0x100 [ 7174.469413] [] mempool_alloc+0x145/0x170 [ 7174.469413] [] ? __init_waitqueue_head+0x60/0x60 [ 7174.469413] [] bio_alloc_bioset+0x10b/0x1d0 [ 7174.469413] [] ? end_swap_bio_read+0xc0/0xc0 [ 7174.469413] [] ? end_swap_bio_read+0xc0/0xc0 [ 7174.469413] [] get_swap_bio+0x30/0x90 [ 7174.469413] [] ? 
end_swap_bio_read+0xc0/0xc0 [ 7174.469413] [] __swap_writepage+0x150/0x230 [ 7174.469413] [] ? do_raw_spin_unlock+0x5/0xa0 [ 7174.469413] [] ? end_swap_bio_read+0xc0/0xc0 [ 7174.469413] [] ? __swap_writepage+0x5/0x230 [ 7174.469413] [] swap_writepage+0x42/0x90 [ 7174.469413] [] shrink_page_list+0x676/0xa80 [ 7174.469413] [] ? ftrace_call+0x5/0x2f [ 7174.469413] [] shrink_inactive_list+0x262/0x4e0 [ 7174.469413] [] shrink_lruvec+0x3e1/0x6a0 [ 7174.469413] [] shrink_zone+0x3f/0x110 [ 7174.469413] [] ? ftrace_call+0x5/0x2f [ 7174.469413] [] do_try_to_free_pages+0x156/0x4c0 [ 7174.469413] [] try_to_free_pages+0xf7/0x1e0 [ 7174.469413] [] __alloc_pages_nodemask+0x783/0xb20 [ 7174.469413] [] alloc_pages_current+0x10f/0x1f0 [ 7174.469413] [] ? __page_cache_alloc+0x13f/0x160 [ 7174.469413] [] __page_cache_alloc+0x13f/0x160 [ 7174.469413] [] find_or_create_page+0x4c/0xb0 [ 7174.469413] [] ? find_get_page+0x5/0x130 [ 7174.469413] [] ext4_mb_load_buddy+0x1e9/0x370 [ 7174.469413] [] ext4_mb_regular_allocator+0x1b7/0x460 [ 7174.469413] [] ? ext4_mb_use_preallocated+0x40/0x360 [ 7174.469413] [] ? ftrace_call+0x5/0x2f [ 7174.469413] [] ext4_mb_new_blocks+0x458/0x5f0 [ 7174.469413] [] ext4_ext_map_blocks+0x70b/0x1010 [ 7174.469413] [] ext4_map_blocks+0x325/0x530 [ 7174.469413] [] ext4_writepages+0x6d1/0xce0 [ 7174.469413] [] ? ext4_journalled_write_end+0x330/0x330 [ 7174.469413] [] do_writepages+0x23/0x40 [ 7174.469413] [] __writeback_single_inode+0x45/0x2e0 [ 7174.469413] [] writeback_sb_inodes+0x2ad/0x500 [ 7174.469413] [] __writeback_inodes_wb+0x9e/0xd0 [ 7174.469413] [] wb_writeback+0x29b/0x350 [ 7174.469413] [] ? __local_bh_enable_ip+0x6d/0xd0 [ 7174.469413] [] bdi_writeback_workfn+0x11c/0x480 [ 7174.469413] [] ? process_one_work+0x170/0x570 [ 7174.469413] [] process_one_work+0x1d2/0x570 [ 7174.469413] [] ? process_one_work+0x170/0x570 [ 7174.469413] [] worker_thread+0x116/0x370 [ 7174.469413] [] ? manage_workers.isra.19+0x2e0/0x2e0 [ 7174.469413] [] kthread+0xf3/0x110 [ 7174.469413] [] ? flush_kthread_worker+0x150/0x150 [ 7174.469413] [] ret_from_fork+0x7c/0xb0 [ 7174.469413] [] ? 
flush_kthread_worker+0x150/0x150 [ 7174.469413] Code: c0 49 bc fc 8d 76 82 ff ff ff ff e8 44 5a 5b 00 31 f6 8b 05 95 2b b3 00 48 39 c6 7d 0e 4c 8b 04 f5 20 5f c5 81 49 83 f8 ff 75 11 <0f> 0b 48 63 05 71 5a 64 01 48 29 c3 e9 d0 fd ff ff 48 8d 5e 01 [ 7174.469413] RIP [] stack_trace_call+0x2de/0x340 [ 7174.469413] RSP [ 7174.469413] ---[ end trace c97d325b36b718f3 ]--- Link: http://lkml.kernel.org/p/1401683592-1651-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5aa9a5b9b6e2..8a4e5cb66a4c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; +static inline void print_max_stack(void) +{ + long i; + int size; + + pr_emerg(" Depth Size Location (%d entries)\n" + " ----- ---- --------\n", + max_stack_trace.nr_entries - 1); + + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], + size, (void *)stack_dump_trace[i]); + } +} + static inline void check_stack(unsigned long ip, unsigned long *stack) { - unsigned long this_size, flags; - unsigned long *p, *top, *start; + unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); int i; @@ -149,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - BUG_ON(current != &init_task && - *(end_of_stack(current)) != STACK_END_MAGIC); + if ((current != &init_task && + *(end_of_stack(current)) != STACK_END_MAGIC)) { + print_max_stack(); + BUG(); + } + out: arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); -- cgit v1.2.3 From 198376cd8877a612cc6494201b815c36d1e40391 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Tue, 3 Jun 2014 13:28:03 +0900 Subject: tracing: Eliminate double free on failure of allocation on boot up If allocation of the max_buffer fails on boot up, the error path will free both per_cpu data structures from the buffers. With the new redesign of the code, those structures are freed if allocations failed. That is, the helper function that allocates the buffers will free the per cpu data on failure. No need to do it again. In fact, the second free will cause a bug as the code can not handle a double free. 
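The ownership rule the fix relies on can be shown with a small, hypothetical sketch (the example_* names and types are illustrative, not the tracing code): once a helper cleans up its own partial allocations on failure, the caller's error path must not free those allocations a second time.

	#include <linux/errno.h>
	#include <linux/percpu.h>
	#include <linux/slab.h>

	struct example_data { unsigned long val; };		/* hypothetical type */

	struct example_buf {
		struct example_data __percpu *data;
		void *pages;
	};

	/* The helper frees whatever it already allocated before returning an error. */
	static int example_alloc_buffer(struct example_buf *buf, size_t size)
	{
		buf->data = alloc_percpu(struct example_data);
		if (!buf->data)
			return -ENOMEM;

		buf->pages = kzalloc(size, GFP_KERNEL);
		if (!buf->pages) {
			free_percpu(buf->data);		/* helper owns this cleanup */
			buf->data = NULL;
			return -ENOMEM;
		}
		return 0;
	}

	static int example_init(struct example_buf *buf, size_t size)
	{
		if (example_alloc_buffer(buf, size))
			return -ENOMEM;	/* no free_percpu() here: that would be the double free */
		return 0;
	}

The patch below applies exactly this convention to tracer_alloc_buffers(): allocate_trace_buffer() already releases its per-cpu data on failure, so the out_free_cpumask path must not release it again.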
Link: http://lkml.kernel.org/p/20140603042803.27308.30956.stgit@yunodevel Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 626dbfde5d56..135af323608b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6671,10 +6671,6 @@ __init static int tracer_alloc_buffers(void) out_free_temp_buffer: ring_buffer_free(temp_buffer); out_free_cpumask: - free_percpu(global_trace.trace_buffer.data); -#ifdef CONFIG_TRACER_MAX_TRACE - free_percpu(global_trace.max_buffer.data); -#endif free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); -- cgit v1.2.3 From 195d28020147843dcf809730b2e9d876722371e6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 4 Jun 2014 13:36:29 -0400 Subject: tracing: Remove unused variable in trace_benchmark Somehow this unused variable warning sneaked past my warnings check (probably due to it depending on a new config). kernel/trace/trace_benchmark.c: In function 'trace_do_benchmark': kernel/trace/trace_benchmark.c:38:6: warning: unused variable 'seedsq' [-Wunused-variable] u64 seedsq; ^ Link: http://lkml.kernel.org/r/20140604160921.4f4e69c4@canb.auug.org.au Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 7dc1c42dfee2..a10adc7095cd 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -35,7 +35,6 @@ static void trace_do_benchmark(void) u64 delta; s64 stddev; u64 seed; - u64 seedsq; u64 last_seed; unsigned int avg; unsigned int std = 0; -- cgit v1.2.3 From 1f779fb28aa07350d72976d304591d216ca86f0e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 4 Jun 2014 16:48:15 +0800 Subject: cgroup: don't destroy the default root The default root is allocated and initialized at boot phase, so we shouldn't destroy the default root when it's umounted, otherwise it will lead to disaster. Just try mount and then umount the default root, and the kernel will crash immediately. v2: - No need to check for CSS_NO_REF in cgroup_get/put(). (Tejun) - Better call cgroup_put() for the default root in kill_sb(). (Tejun) - Add a comment. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5f75ac4e793..3f46165829a4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1780,8 +1780,11 @@ static void cgroup_kill_sb(struct super_block *sb) * If @root doesn't have any mounts or children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. + * + * And don't kill the default root. */ - if (css_has_online_children(&root->cgrp.self)) + if (css_has_online_children(&root->cgrp.self) || + root == &cgrp_dfl_root) cgroup_put(&root->cgrp); else percpu_ref_kill(&root->cgrp.self.refcnt); -- cgit v1.2.3 From 4fc828e24cd9c385d3a44e1b499ec7fc70239d8a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 2 May 2014 11:24:15 -0700 Subject: locking/rwsem: Support optimistic spinning We have reached the point where our mutexes are quite fine tuned for a number of situations. 
This includes the use of heuristics and optimistic spinning, based on MCS locking techniques. Exclusive ownership of read-write semaphores is, conceptually, just about the same as mutexes, making them close cousins. To this end we need to make them both perform similarly, and right now, rwsems are simply not up to it. This was discovered by both reverting commit 4fc3f1d6 (mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable) and, similarly, converting some other mutexes (i.e. i_mmap_mutex) to rwsems. This creates a situation where users have to choose between a rwsem and a mutex taking into account this important performance difference. Specifically, the biggest difference between the two locks is that when we fail to acquire a mutex in the fastpath, optimistic spinning comes into play and we can avoid a large amount of unnecessary sleeping and the overhead of moving tasks in and out of the wait queue. Rwsems do not have such logic. This patch, based on the work from Tim Chen and me, adds support for write-side optimistic spinning when the lock is contended. It also includes support for the recently added cancelable MCS locking for adaptive spinning. Note that this is only applicable to the xadd method, and the spinlock rwsem variant remains intact. Allowing optimistic spinning before putting the writer on the wait queue reduces wait queue contention and provides a greater chance for the rwsem to get acquired. With these changes, rwsem is on par with mutex. The performance benefits can be seen on a number of workloads. For instance, on an 8 socket, 80 core 64bit Westmere box, aim7 shows the following improvements in throughput:

+--------------+---------------------+-----------------+
| Workload     | throughput-increase | number of users |
+--------------+---------------------+-----------------+
| alltests     | 20%                 | >1000           |
| custom       | 27%, 60%            | 10-100, >1000   |
| high_systime | 36%, 30%            | >100, >1000     |
| shared       | 58%, 29%            | 10-100, >1000   |
+--------------+---------------------+-----------------+

There was also improvement on smaller systems, such as a quad-core x86-64 laptop running a 30Gb PostgreSQL (pgbench) workload for up to +60% in throughput for over 50 clients. Additionally, benefits were also noticed in exim (mail server) workloads. Furthermore, no performance regressions have been seen at all. Based-on-work-from: Tim Chen Signed-off-by: Davidlohr Bueso [peterz: rej fixup due to comment patches, sched/rt.h header] Signed-off-by: Peter Zijlstra Cc: Alex Shi Cc: Andi Kleen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Peter Hurley Cc: "Paul E.McKenney" Cc: Jason Low Cc: Aswin Chandramouleeswaran Cc: Andrew Morton Cc: Linus Torvalds Cc: "Scott J Norton" Cc: Andrea Arcangeli Cc: Chris Mason Cc: Josef Bacik Link: http://lkml.kernel.org/r/1399055055.6275.15.camel@buesod1.americas.hpqcorp.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 225 ++++++++++++++++++++++++++++++++++++++------ kernel/locking/rwsem.c | 31 +++++- 2 files changed, 226 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b4219ff87b8c..4a75278142cd 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -5,11 +5,17 @@ * * Writer lock-stealing by Alex Shi * and Michel Lespinasse + * + * Optimistic spinning by Tim Chen + * and Davidlohr Bueso . Based on mutexes. */ #include #include #include #include +#include + +#include "mcs_spinlock.h" /* * Guide to the rw_semaphore's count field for common values.
@@ -76,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->count = RWSEM_UNLOCKED_VALUE; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); +#ifdef CONFIG_SMP + sem->owner = NULL; + sem->osq = NULL; +#endif } EXPORT_SYMBOL(__init_rwsem); @@ -190,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) } /* - * wait for the read lock to be granted + * Wait for the read lock to be granted */ __visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) @@ -237,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) return sem; } +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ + if (!(count & RWSEM_ACTIVE_MASK)) { + /* try acquiring the write lock */ + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + return true; + } + } + return false; +} + +#ifdef CONFIG_SMP /* - * wait until we successfully acquire the write lock + * Try to acquire write lock before the writer has been put on wait queue. + */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long old, count = ACCESS_ONCE(sem->count); + + while (true) { + if (!(count == 0 || count == RWSEM_WAITING_BIAS)) + return false; + + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + if (old == count) + return true; + + count = old; + } +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool on_cpu = true; + + if (need_resched()) + return 0; + + rcu_read_lock(); + owner = ACCESS_ONCE(sem->owner); + if (owner) + on_cpu = owner->on_cpu; + rcu_read_unlock(); + + /* + * If sem->owner is not set, the rwsem owner may have + * just acquired it and not set the owner yet or the rwsem + * has been released. + */ + return on_cpu; +} + +static inline bool owner_running(struct rw_semaphore *sem, + struct task_struct *owner) +{ + if (sem->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * sem->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +static noinline +bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(sem, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() or when the + * owner changed, which is a sign for heavy contention. Return + * success only when sem->owner is NULL. 
+ */ + return sem->owner == NULL; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool taken = false; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!rwsem_can_spin_on_owner(sem)) + goto done; + + if (!osq_lock(&sem->osq)) + goto done; + + while (true) { + owner = ACCESS_ONCE(sem->owner); + if (owner && !rwsem_spin_on_owner(sem, owner)) + break; + + /* wait_lock will be acquired if write_lock is obtained */ + if (rwsem_try_write_lock_unqueued(sem)) { + taken = true; + break; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(current))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + arch_mutex_cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + return taken; +} + +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} +#endif + +/* + * Wait until we successfully acquire the write lock */ __visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count; + bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; - struct task_struct *tsk = current; - /* set up my own style of waitqueue */ - waiter.task = tsk; + /* undo write bias from down_write operation, stop active locking */ + count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_optimistic_spin(sem)) + return sem; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; + waiting = false; + list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + if (waiting) { + count = ACCESS_ONCE(sem->count); - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + /* + * If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. + */ + if (count > RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + + } else + count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. 
*/ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - + if (rwsem_try_write_lock(count, sem)) + break; raw_spin_unlock_irq(&sem->wait_lock); /* Block until there are no active lockers. */ do { schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } + __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; return sem; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..42f806de49d4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -12,6 +12,27 @@ #include +#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif + /* * lock for reading */ @@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_owner(sem); + } + return ret; } @@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_clear_owner(sem); __up_write(sem); } @@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ + rwsem_clear_owner(sem); __downgrade_write(sem); } @@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); -- cgit v1.2.3 From 0cc3d01164aba483edd8232aa5c781136843c367 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 20:19:48 +0200 Subject: locking/rwsem: Fix checkpatch.pl warnings WARNING: line over 80 characters #205: FILE: kernel/locking/rwsem-xadd.c:275: + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); WARNING: line over 80 characters #376: FILE: kernel/locking/rwsem-xadd.c:434: + * If there were already threads queued before us and there are no WARNING: line over 80 characters #377: FILE: kernel/locking/rwsem-xadd.c:435: + * active writers, the lock must be read owned; so we try to wake total: 0 errors, 3 warnings, 417 lines checked Signed-off-by: Andrew Morton Signed-off-by: Peter Zijlstra Cc: "H. 
Peter Anvin" Cc: Davidlohr Bueso Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-pn6pslaplw031lykweojsn8c@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 4a75278142cd..dacc32142fcc 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -433,9 +433,9 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = ACCESS_ONCE(sem->count); /* - * If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. + * If there were already threads queued before us and there are + * no active writers, the lock must be read owned; so we try to + * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); -- cgit v1.2.3 From 51f2176d74ace4c3f58579a605ef5a9720befb00 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Mon, 19 May 2014 15:49:45 -0700 Subject: sched/fair: Fix unlocked reads of some cfs_b->quota/period sched_cfs_period_timer() reads cfs_b->period without locks before calling do_sched_cfs_period_timer(), and similarly unthrottle_offline_cfs_rqs() would read cfs_b->period without the right lock. Thus a simultaneous change of bandwidth could cause corruption on any platform where ktime_t or u64 writes/reads are not atomic. Extend cfs_b->lock from do_sched_cfs_period_timer() to include the read of cfs_b->period to solve that issue; unthrottle_offline_cfs_rqs() can just use 1 rather than the exact quota, much like distribute_cfs_runtime() does. There is also an unlocked read of cfs_b->runtime_expires, but a race there would only delay runtime expiry by a tick. Still, the comparison should just be != anyway, which clarifies even that problem. Signed-off-by: Ben Segall Tested-by: Roman Gushchin [peterz: Fix compile warn] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140519224945.20303.93530.stgit@sword-of-the-dawn.mtv.corp.google.com Cc: pjt@google.com Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c9617b73bcc0..b71d8c39f1fd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3224,10 +3224,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. + * whether the global deadline has advanced. It is valid to compare + * cfs_b->runtime_expires without any locks since we only care about + * exact equality, so a partial write will still work. 
*/ - if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { @@ -3456,21 +3458,21 @@ next: static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { u64 runtime, runtime_expires; - int idle = 1, throttled; + int throttled; - raw_spin_lock(&cfs_b->lock); /* no need to continue the timer with no bandwidth constraint */ if (cfs_b->quota == RUNTIME_INF) - goto out_unlock; + goto out_deactivate; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - /* idle depends on !throttled (for the case of a large deficit) */ - idle = cfs_b->idle && !throttled; cfs_b->nr_periods += overrun; - /* if we're going inactive then everything else can be deferred */ - if (idle) - goto out_unlock; + /* + * idle depends on !throttled (for the case of a large deficit), and if + * we're going inactive then everything else can be deferred + */ + if (cfs_b->idle && !throttled) + goto out_deactivate; /* * if we have relooped after returning idle once, we need to update our @@ -3484,7 +3486,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; - goto out_unlock; + return 0; } /* account preceding periods in which throttling occurred */ @@ -3524,12 +3526,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * timer to remain active while there are any throttled entities.) */ cfs_b->idle = 0; -out_unlock: - if (idle) - cfs_b->timer_active = 0; - raw_spin_unlock(&cfs_b->lock); - return idle; + return 0; + +out_deactivate: + cfs_b->timer_active = 0; + return 1; } /* a cfs_rq won't donate quota below this amount */ @@ -3706,6 +3708,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) int overrun; int idle = 0; + raw_spin_lock(&cfs_b->lock); for (;;) { now = hrtimer_cb_get_time(timer); overrun = hrtimer_forward(timer, now, cfs_b->period); @@ -3715,6 +3718,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) idle = do_sched_cfs_period_timer(cfs_b, overrun); } + raw_spin_unlock(&cfs_b->lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -3774,8 +3778,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) struct cfs_rq *cfs_rq; for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) continue; @@ -3783,7 +3785,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * clock_task is not advancing so we just need to make sure * there's some valid quota amount */ - cfs_rq->runtime_remaining = cfs_b->quota; + cfs_rq->runtime_remaining = 1; if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } -- cgit v1.2.3 From ed61bbc69c773465782476c7e5869fa5607fa73a Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 20 May 2014 14:39:27 -0700 Subject: sched/balancing: Reduce the rate of needless idle load balancing The current no_hz idle load balancer does load balancing for *all* idle cpus, even though the next load balance for a particular idle cpu could still be a while in the future. This introduces a much higher load balancing rate than is necessary. The patch changes the behavior to do idle load balancing on behalf of an idle cpu only when it is due for load balancing.
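The gate this patch adds is keyed on the per-runqueue next_balance deadline; as a stand-alone sketch (the helper name is made up, the fields are the ones touched by the diff below), the check amounts to a wraparound-safe jiffies comparison:

#include <linux/jiffies.h>
#include <linux/types.h>

/*
 * Only balance on behalf of a remote idle cpu if its own balance
 * interval has expired.  time_after_eq() stays correct when jiffies
 * wraps, unlike a plain "jiffies >= next_balance" comparison.
 */
static inline bool demo_idle_balance_due(unsigned long next_balance)
{
	return time_after_eq(jiffies, next_balance);
}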
On SGI's systems with over 3000 cores, the cpu responsible for idle balancing got overwhelmed with idle balancing, and introduced a lot of OS noise to workloads. This patch fixes the issue. Signed-off-by: Tim Chen Acked-by: Russ Anderson Reviewed-by: Rik van Riel Reviewed-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Len Brown Cc: Dimitri Sivanich Cc: Hedi Berriche Cc: Andi Kleen Cc: Michel Lespinasse Cc: Peter Hurley Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1400621967.2970.280.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b71d8c39f1fd..7a0c000b6005 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7193,12 +7193,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rq = cpu_rq(balance_cpu); - raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); - update_idle_cpu_load(rq); - raw_spin_unlock_irq(&rq->lock); - - rebalance_domains(rq, CPU_IDLE); + /* + * If time for next balance is due, + * do the balance. + */ + if (time_after_eq(jiffies, rq->next_balance)) { + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); + rebalance_domains(rq, CPU_IDLE); + } if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; -- cgit v1.2.3 From 2538d960d0c74cdc639f05723e04a67aed1efdf9 Mon Sep 17 00:00:00 2001 From: Manuel Schölling Date: Thu, 22 May 2014 19:45:23 +0200 Subject: sched/fair: Use time_after() in record_wakee() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be future-proof and for better readability the time comparisons are modified to use time_after() instead of plain, error-prone math. Signed-off-by: Manuel Schölling Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1400780723-24626-1-git-send-email-manuel.schoelling@gmx.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a0c000b6005..c63dde984956 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4066,7 +4066,7 @@ static void record_wakee(struct task_struct *p) * about the boundary, really active task won't care * about the loss. */ - if (jiffies > current->wakee_flip_decay_ts + HZ) { + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } -- cgit v1.2.3 From fa93384f40deeb294fd29f2fdcadbd0ebc2dedf1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 23 May 2014 13:20:42 +0300 Subject: sched: Fix signedness bug in yield_to() yield_to() is supposed to return -ESRCH if there is no task to yield to, but because the type is bool that is the same as returning true. The only place I see which cares is kvm_vcpu_on_spin().
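The failure mode is easy to reproduce outside the kernel: forcing a negative errno through a bool return collapses it to true, so the caller can no longer tell "yielded" apart from "no task to yield to". A user-space sketch (the try_yield_* names are made up, not the kernel API):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Old shape: the error code is silently converted to true (1). */
static bool try_yield_bool(int have_target)
{
	if (!have_target)
		return -ESRCH;		/* becomes plain 'true' */
	return true;
}

/* Fixed shape: an int return keeps -ESRCH distinguishable. */
static int try_yield_int(int have_target)
{
	if (!have_target)
		return -ESRCH;
	return 1;
}

int main(void)
{
	/* Prints "bool: 1, int: -3" on Linux (ESRCH == 3). */
	printf("bool: %d, int: %d\n", try_yield_bool(0), try_yield_int(0));
	return 0;
}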
Signed-off-by: Dan Carpenter Reviewed-by: Raghavendra Signed-off-by: Peter Zijlstra Cc: Gleb Natapov Cc: Linus Torvalds Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/20140523102042.GA7267@mwanda Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 321d800e4baa..afcc84234a3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4195,7 +4195,7 @@ EXPORT_SYMBOL(yield); * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. */ -bool __sched yield_to(struct task_struct *p, bool preempt) +int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; -- cgit v1.2.3 From 5ef20ca181ec592e4684a45f4d5f1385f6055534 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:34 -0400 Subject: sched/fair: Remove "power" from 'struct numa_stats' It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. To make things explicit and not create more confusion with the existing "capacity" member, let's rename things as follows: power -> compute_capacity capacity -> task_capacity Note: none of those fields are actually used outside update_numa_stats(). Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-2e2ndymj5gyshyjq8am79f20@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c63dde984956..1cfe5a25086d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1026,10 +1026,10 @@ struct numa_stats { unsigned long load; /* Total compute capacity of CPUs on a node */ - unsigned long power; + unsigned long compute_capacity; /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long capacity; + unsigned long task_capacity; int has_capacity; }; @@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(cpu); - ns->power += power_of(cpu); + ns->compute_capacity += power_of(cpu); cpus++; } @@ -1062,9 +1062,10 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; - ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); - ns->has_capacity = (ns->nr_running < ns->capacity); + ns->load = (ns->load * SCHED_POWER_SCALE) / ns->compute_capacity; + ns->task_capacity = + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_POWER_SCALE); + ns->has_capacity = (ns->nr_running < ns->task_capacity); } struct task_numa_env { -- cgit v1.2.3 From 1b6a7495d343fcfe22ff3a8285544bb8e40f1920 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:35 -0400 Subject: sched/fair: Change "has_capacity" to "has_free_capacity" The capacity of a CPU/group should be some intrinsic value that doesn't change with task placement. 
It is like a container whose capacity is stable regardless of the amount of liquid in it (its "utilization")... unless the container itself is crushed that is, but that's another story. Therefore let's rename "has_capacity" to "has_free_capacity" in order to better convey the intended meaning. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-djzkk027jm0e8x8jxy70opzh@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1cfe5a25086d..8993dfa2e82b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1030,7 +1030,7 @@ struct numa_stats { /* Approximate capacity in terms of runnable tasks on a node */ unsigned long task_capacity; - int has_capacity; + int has_free_capacity; }; /* @@ -1056,8 +1056,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid) * the @ns structure is NULL'ed and task_numa_compare() will * not find this node attractive. * - * We'll either bail at !has_capacity, or we'll detect a huge imbalance - * and bail there. + * We'll either bail at !has_free_capacity, or we'll detect a huge + * imbalance and bail there. */ if (!cpus) return; @@ -1065,7 +1065,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->load = (ns->load * SCHED_POWER_SCALE) / ns->compute_capacity; ns->task_capacity = DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_POWER_SCALE); - ns->has_capacity = (ns->nr_running < ns->task_capacity); + ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } struct task_numa_env { @@ -1196,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env, if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_capacity && - !env->dst_stats.has_capacity) + if (env->src_stats.has_free_capacity && + !env->dst_stats.has_free_capacity) goto unlock; goto balance; @@ -1302,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p) groupimp = group_weight(p, env.dst_nid) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has capacity, try to use it. */ - if (env.dst_stats.has_capacity) + /* If the preferred nid has free capacity, try to use it. */ + if (env.dst_stats.has_free_capacity) task_numa_find_cpu(&env, taskimp, groupimp); /* No space available on the preferred nid. Look elsewhere. */ @@ -5538,7 +5538,7 @@ struct sg_lb_stats { unsigned int idle_cpus; unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ - int group_has_capacity; /* Is there extra capacity in the group? */ + int group_has_free_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -5905,7 +5905,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_capacity = sg_capacity(env, group); if (sgs->group_capacity > sgs->sum_nr_running) - sgs->group_has_capacity = 1; + sgs->group_has_free_capacity = 1; } /** @@ -6029,7 +6029,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * with a large weight task outweighs the tasks on the system).
*/ if (prefer_sibling && sds->local && - sds->local_stat.group_has_capacity) + sds->local_stat.group_has_free_capacity) sgs->group_capacity = min(sgs->group_capacity, 1U); if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -6289,8 +6289,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && - !busiest->group_has_capacity) + if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && + !busiest->group_has_free_capacity) goto force_balance; /* -- cgit v1.2.3 From 0fedc6c8e34f4ce0b37b1f25c3619b4a8faa244c Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:36 -0400 Subject: sched/fair: Disambiguate existing/remaining "capacity" usage We have "power" (which should actually become "capacity") and "capacity" which is a scaled down "capacity factor" in terms of unitary tasks. Let's use "capacity_factor" to make room for proper usage of "capacity" later. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-gk1co8sqdev3763opqm6ovml@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8993dfa2e82b..e401e446e87c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5534,7 +5534,7 @@ struct sg_lb_stats { unsigned long load_per_task; unsigned long group_power; unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity; + unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ @@ -5829,15 +5829,15 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity. + * Compute the group capacity factor. * * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores * and limit power unit capacity with that. 
*/ -static inline int sg_capacity(struct lb_env *env, struct sched_group *group) +static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) { - unsigned int capacity, smt, cpus; + unsigned int capacity_factor, smt, cpus; unsigned int power, power_orig; power = group->sgp->power; @@ -5846,13 +5846,13 @@ static inline int sg_capacity(struct lb_env *env, struct sched_group *group) /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); - capacity = cpus / smt; /* cores */ + capacity_factor = cpus / smt; /* cores */ - capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + capacity_factor = min_t(unsigned, capacity_factor, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); - return capacity; + return capacity_factor; } /** @@ -5902,9 +5902,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; sgs->group_imb = sg_imbalanced(group); - sgs->group_capacity = sg_capacity(env, group); + sgs->group_capacity_factor = sg_capacity_factor(env, group); - if (sgs->group_capacity > sgs->sum_nr_running) + if (sgs->group_capacity_factor > sgs->sum_nr_running) sgs->group_has_free_capacity = 1; } @@ -5929,7 +5929,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= sds->busiest_stat.avg_load) return false; - if (sgs->sum_nr_running > sgs->group_capacity) + if (sgs->sum_nr_running > sgs->group_capacity_factor) return true; if (sgs->group_imb) @@ -6020,17 +6020,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity to one so that we'll try + * first, lower the sg capacity factor to one so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity. The + * these excess tasks, i.e. nr_running < group_capacity_factor. The * extra check prevents the case where you always pull from the * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ if (prefer_sibling && sds->local && sds->local_stat.group_has_free_capacity) - sgs->group_capacity = min(sgs->group_capacity, 1U); + sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -6204,7 +6204,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * have to drop below capacity to reach cpu-load equilibrium. 
*/ load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity); + (busiest->sum_nr_running - busiest->group_capacity_factor); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); load_above_capacity /= busiest->group_power; @@ -6348,7 +6348,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power, capacity, wl; + unsigned long power, capacity_factor, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6377,9 +6377,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; power = power_of(i); - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + capacity_factor = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6387,7 +6387,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu power. */ - if (capacity && rq->nr_running == 1 && wl > env->imbalance) + if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) continue; /* -- cgit v1.2.3 From 63b2ca30bdb3dbf60bc7ac5f46713c0d32308261 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:37 -0400 Subject: sched: Let 'struct sched_group_power' care about CPU capacity It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. Since struct sched_group_power is really about compute capacity of sched groups, let's rename it to struct sched_group_capacity. Similarly sgp becomes sgc. Related variables and functions dealing with groups are also adjusted accordingly. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-5yeix833vvgf2uyj5o36hpu9@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 81 ++++++++++++++++--------------- kernel/sched/fair.c | 131 ++++++++++++++++++++++++++------------------------- kernel/sched/sched.h | 16 +++---- 3 files changed, 114 insertions(+), 114 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index afcc84234a3e..2e1fb0902200 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5221,14 +5221,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, } /* - * Even though we initialize ->power to something semi-sane, - * we leave power_orig unset. This allows us to detect if + * Even though we initialize ->capacity to something semi-sane, + * we leave capacity_orig unset. This allows us to detect if * domain iteration is still funny without causing /0 traps. 
*/ - if (!group->sgp->power_orig) { + if (!group->sgc->capacity_orig) { printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); break; } @@ -5250,9 +5249,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgp->power != SCHED_POWER_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->sgp->power); + if (group->sgc->capacity != SCHED_POWER_SCALE) { + printk(KERN_CONT " (cpu_capacity = %d)", + group->sgc->capacity); } group = group->next; @@ -5466,7 +5465,7 @@ static struct root_domain *alloc_rootdomain(void) return rd; } -static void free_sched_groups(struct sched_group *sg, int free_sgp) +static void free_sched_groups(struct sched_group *sg, int free_sgc) { struct sched_group *tmp, *first; @@ -5477,8 +5476,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp) do { tmp = sg->next; - if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) - kfree(sg->sgp); + if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) + kfree(sg->sgc); kfree(sg); sg = tmp; @@ -5496,7 +5495,7 @@ static void free_sched_domain(struct rcu_head *rcu) if (sd->flags & SD_OVERLAP) { free_sched_groups(sd->groups, 1); } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgp); + kfree(sd->groups->sgc); kfree(sd->groups); } kfree(sd); @@ -5707,17 +5706,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, i); - if (atomic_inc_return(&sg->sgp->ref) == 1) + sg->sgc = *per_cpu_ptr(sdd->sgc, i); + if (atomic_inc_return(&sg->sgc->ref) == 1) build_group_mask(sd, sg); /* - * Initialize sgp->power such that even if we mess up the + * Initialize sgc->capacity such that even if we mess up the * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ - sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); - sg->sgp->power_orig = sg->sgp->power; + sg->sgc->capacity = SCHED_POWER_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the @@ -5755,8 +5754,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); - atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ } return cpu; @@ -5819,16 +5818,16 @@ build_sched_groups(struct sched_domain *sd, int cpu) } /* - * Initialize sched groups cpu_power. + * Initialize sched groups cpu_capacity. * - * cpu_power indicates the capacity of sched group, which is used while + * cpu_capacity indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. 
If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity. */ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; @@ -5842,8 +5841,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (cpu != group_balance_cpu(sg)) return; - update_group_power(sd, cpu); - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); + update_group_capacity(sd, cpu); + atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -5934,8 +5933,8 @@ static void claim_allocations(int cpu, struct sched_domain *sd) if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) - *per_cpu_ptr(sdd->sgp, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; } #ifdef CONFIG_NUMA @@ -6337,14 +6336,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; - sdd->sgp = alloc_percpu(struct sched_group_power *); - if (!sdd->sgp) + sdd->sgc = alloc_percpu(struct sched_group_capacity *); + if (!sdd->sgc) return -ENOMEM; for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -6362,12 +6361,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), + sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); - if (!sgp) + if (!sgc) return -ENOMEM; - *per_cpu_ptr(sdd->sgp, j) = sgp; + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -6394,15 +6393,15 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); - if (sdd->sgp) - kfree(*per_cpu_ptr(sdd->sgp, j)); + if (sdd->sgc) + kfree(*per_cpu_ptr(sdd->sgc, j)); } free_percpu(sdd->sd); sdd->sd = NULL; free_percpu(sdd->sg); sdd->sg = NULL; - free_percpu(sdd->sgp); - sdd->sgp = NULL; + free_percpu(sdd->sgc); + sdd->sgc = NULL; } } @@ -6479,7 +6478,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { claim_allocations(i, sd); - init_sched_groups_power(i, sd); + init_sched_groups_capacity(i, sd); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e401e446e87c..36bd4d23fca8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4369,8 +4369,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; } - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + /* Adjust by relative CPU capacity of the group */ + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -5532,7 +5532,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; - unsigned long group_power; + unsigned long group_capacity; unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int group_capacity_factor; unsigned int idle_cpus; @@ -5553,7 +5553,7 @@ struct sd_lb_stats { struct 
sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -5572,7 +5572,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .busiest = NULL, .local = NULL, .total_load = 0UL, - .total_pwr = 0UL, + .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, }, @@ -5681,7 +5681,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_POWER_SHIFT; } - sdg->sgp->power_orig = power; + sdg->sgc->capacity_orig = power; if (sched_feat(ARCH_POWER)) power *= arch_scale_freq_power(sd, cpu); @@ -5697,26 +5697,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power = 1; cpu_rq(cpu)->cpu_power = power; - sdg->sgp->power = power; + sdg->sgc->capacity = power; } -void update_group_power(struct sched_domain *sd, int cpu) +void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power, power_orig; + unsigned long capacity, capacity_orig; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); interval = clamp(interval, 1UL, max_load_balance_interval); - sdg->sgp->next_update = jiffies + interval; + sdg->sgc->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); return; } - power_orig = power = 0; + capacity_orig = capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -5725,31 +5725,31 @@ void update_group_power(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); /* - * build_sched_domains() -> init_sched_groups_power() + * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the * runqueues. * * Use power_of(), which is set irrespective of domains * in update_cpu_power(). * - * This avoids power/power_orig from being 0 and + * This avoids capacity/capacity_orig from being 0 and * causing divide-by-zero issues on boot. * - * Runtime updates will correct power_orig. + * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - power_orig += power_of(cpu); - power += power_of(cpu); + capacity_orig += power_of(cpu); + capacity += power_of(cpu); continue; } - sgp = rq->sd->groups->sgp; - power_orig += sgp->power_orig; - power += sgp->power; + sgc = rq->sd->groups->sgc; + capacity_orig += sgc->capacity_orig; + capacity += sgc->capacity; } } else { /* @@ -5759,14 +5759,14 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power_orig += group->sgp->power_orig; - power += group->sgp->power; + capacity_orig += group->sgc->capacity_orig; + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = power_orig; - sdg->sgp->power = power; + sdg->sgc->capacity_orig = capacity_orig; + sdg->sgc->capacity = capacity; } /* @@ -5786,9 +5786,9 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) return 0; /* - * If ~90% of the cpu_power is still there, we're good. + * If ~90% of the cpu_capacity is still there, we're good. 
*/ - if (group->sgp->power * 32 > group->sgp->power_orig * 29) + if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) return 1; return 0; @@ -5825,7 +5825,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) static inline int sg_imbalanced(struct sched_group *group) { - return group->sgp->imbalance; + return group->sgc->imbalance; } /* @@ -5833,22 +5833,23 @@ static inline int sg_imbalanced(struct sched_group *group) * * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores - * and limit power unit capacity with that. + * and limit unit capacity with that. */ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) { unsigned int capacity_factor, smt, cpus; - unsigned int power, power_orig; + unsigned int capacity, capacity_orig; - power = group->sgp->power; - power_orig = group->sgp->power_orig; + capacity = group->sgc->capacity; + capacity_orig = group->sgc->capacity_orig; cpus = group->group_weight; - /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ + smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, capacity_orig); capacity_factor = cpus / smt; /* cores */ - capacity_factor = min_t(unsigned, capacity_factor, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); + capacity_factor = min_t(unsigned, + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE)); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -5892,9 +5893,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* Adjust by relative CPU power of the group */ - sgs->group_power = group->sgp->power; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; @@ -6009,8 +6010,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs = &sds->local_stat; if (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, sg->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); + time_after_eq(jiffies, sg->sgc->next_update)) + update_group_capacity(env->sd, env->dst_cpu); } update_sg_lb_stats(env, sg, load_idx, local_group, sgs); @@ -6040,7 +6041,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; - sds->total_pwr += sgs->group_power; + sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); @@ -6087,7 +6088,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) return 0; env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_power, + sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, SCHED_POWER_SCALE); return 1; @@ -6103,7 +6104,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned long tmp, capa_now = 0, capa_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; 
struct sg_lb_stats *local, *busiest; @@ -6118,7 +6119,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) scaled_busy_load_per_task = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; + busiest->group_capacity; if (busiest->avg_load + scaled_busy_load_per_task >= local->avg_load + (scaled_busy_load_per_task * imbn)) { @@ -6132,34 +6133,34 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) * moving them. */ - pwr_now += busiest->group_power * + capa_now += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load); - pwr_now += local->group_power * + capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - pwr_now /= SCHED_POWER_SCALE; + capa_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ if (busiest->avg_load > scaled_busy_load_per_task) { - pwr_move += busiest->group_power * + capa_move += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_power < + if (busiest->avg_load * busiest->group_capacity < busiest->load_per_task * SCHED_POWER_SCALE) { - tmp = (busiest->avg_load * busiest->group_power) / - local->group_power; + tmp = (busiest->avg_load * busiest->group_capacity) / + local->group_capacity; } else { tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - local->group_power; + local->group_capacity; } - pwr_move += local->group_power * + capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - pwr_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ - if (pwr_move > pwr_now) + if (capa_move > capa_now) env->imbalance = busiest->load_per_task; } @@ -6207,7 +6208,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (busiest->sum_nr_running - busiest->group_capacity_factor); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= busiest->group_power; + load_above_capacity /= busiest->group_capacity; } /* @@ -6222,8 +6223,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* How much load to actually move to equalise the imbalance */ env->imbalance = min( - max_pull * busiest->group_power, - (sds->avg_load - local->avg_load) * local->group_power + max_pull * busiest->group_capacity, + (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_POWER_SCALE; /* @@ -6278,7 +6279,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -6611,7 +6612,7 @@ more_balance: * We failed to reach balance because of affinity. 
*/ if (sd_parent) { - int *group_imbalance = &sd_parent->groups->sgp->imbalance; + int *group_imbalance = &sd_parent->groups->sgc->imbalance; if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { *group_imbalance = 1; @@ -6998,7 +6999,7 @@ static inline void set_cpu_sd_state_busy(void) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + atomic_inc(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7015,7 +7016,7 @@ void set_cpu_sd_state_idle(void) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + atomic_dec(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7219,7 +7220,7 @@ end: * of an idle cpu is the system. * - This rq has more than one task. * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's power. + * busy cpu's exceeding the group's capacity. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ @@ -7227,7 +7228,7 @@ static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; if (unlikely(rq->idle_balance)) @@ -7257,8 +7258,8 @@ static inline int nohz_kick_needed(struct rq *rq) sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd) { - sgp = sd->groups->sgp; - nr_busy = atomic_read(&sgp->nr_busy_cpus); + sgc = sd->groups->sgc; + nr_busy = atomic_read(&sgc->nr_busy_cpus); if (nr_busy > 1) goto need_kick_unlock; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 600e2291a75c..a5b957d53c92 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -728,15 +728,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); -struct sched_group_power { +struct sched_group_capacity { atomic_t ref; /* - * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. + * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * for a single CPU. */ - unsigned int power, power_orig; + unsigned int capacity, capacity_orig; unsigned long next_update; - int imbalance; /* XXX unrelated to power but shared group state */ + int imbalance; /* XXX unrelated to capacity but shared group state */ /* * Number of busy cpus in this group. */ @@ -750,7 +750,7 @@ struct sched_group { atomic_t ref; unsigned int group_weight; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; /* * The CPUs this group covers. @@ -773,7 +773,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) */ static inline struct cpumask *sched_group_mask(struct sched_group *sg) { - return to_cpumask(sg->sgp->cpumask); + return to_cpumask(sg->sgc->cpumask); } /** @@ -1167,7 +1167,7 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP -extern void update_group_power(struct sched_domain *sd, int cpu); +extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -- cgit v1.2.3 From ced549fa5fc1fdaf7fac93894dc673092eb3dc20 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:38 -0400 Subject: sched: Remove remaining dubious usage of "power" It is better not to think about compute capacity as being equivalent to "CPU power". 
The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. This is the remaining "power" -> "capacity" rename for local symbols. Those symbols visible to the rest of the kernel are not included yet. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-yyyhohzhkwnaotr3lx8zd5aa@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +-- kernel/sched/fair.c | 102 +++++++++++++++++++++++++-------------------------- kernel/sched/sched.h | 2 +- 3 files changed, 55 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e1fb0902200..07bc78a50329 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5764,7 +5764,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) /* * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. + * and ->cpu_capacity to 0. * * Assumes the sched_domain tree is fully constructed */ @@ -6471,7 +6471,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, } } - /* Calculate CPU power for physical packages and nodes */ + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) continue; @@ -6921,7 +6921,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_power = SCHED_POWER_SCALE; + rq->cpu_capacity = SCHED_POWER_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 36bd4d23fca8..58684f684fa8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long power_of(int cpu); +static unsigned long capacity_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ @@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(cpu); - ns->compute_capacity += power_of(cpu); + ns->compute_capacity += capacity_of(cpu); cpus++; } @@ -1214,7 +1214,7 @@ balance: orig_dst_load = env->dst_stats.load; orig_src_load = env->src_stats.load; - /* XXX missing power terms */ + /* XXX missing capacity terms */ load = task_h_load(env->p); dst_load = orig_dst_load + load; src_load = orig_src_load - load; @@ -4043,9 +4043,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long power_of(int cpu) +static unsigned long capacity_of(int cpu) { - return cpu_rq(cpu)->cpu_power; + return cpu_rq(cpu)->cpu_capacity; } static unsigned long cpu_avg_load_per_task(int cpu) @@ -4288,12 +4288,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) s64 this_eff_load, prev_eff_load; this_eff_load = 100; - this_eff_load *= power_of(prev_cpu); + this_eff_load *= 
capacity_of(prev_cpu); this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= power_of(this_cpu); + prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); balanced = this_eff_load <= prev_eff_load; @@ -4950,14 +4950,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * - * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * C_i is the compute capacity of cpu i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. * * To achieve this balance we define a measure of imbalance which follows * directly from (1): * - * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) + * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) * * We them move tasks around to minimize the imbalance. In the continuous * function space it is obvious this converges, in the discrete case we get @@ -5607,17 +5607,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; } unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) { - return default_scale_freq_power(sd, cpu); + return default_scale_capacity(sd, cpu); } -static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -5629,10 +5629,10 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) { - return default_scale_smt_power(sd, cpu); + return default_scale_smt_capacity(sd, cpu); } -static unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; @@ -5652,7 +5652,7 @@ static unsigned long scale_rt_power(int cpu) total = sched_avg_period() + delta; if (unlikely(total < avg)) { - /* Ensures that power won't end up being negative */ + /* Ensures that capacity won't end up being negative */ available = 0; } else { available = total - avg; @@ -5666,38 +5666,38 @@ static unsigned long scale_rt_power(int cpu) return div_u64(available, total); } -static void update_cpu_power(struct sched_domain *sd, int cpu) +static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_POWER_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) - power *= arch_scale_smt_power(sd, cpu); + capacity *= arch_scale_smt_power(sd, cpu); else - power *= default_scale_smt_power(sd, cpu); + capacity *= default_scale_smt_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_POWER_SHIFT; } - sdg->sgc->capacity_orig = power; + sdg->sgc->capacity_orig = capacity; if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); + capacity *= arch_scale_freq_power(sd, cpu); else - power *= 
default_scale_freq_power(sd, cpu); + capacity *= default_scale_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_POWER_SHIFT; - power *= scale_rt_power(cpu); - power >>= SCHED_POWER_SHIFT; + capacity *= scale_rt_capacity(cpu); + capacity >>= SCHED_POWER_SHIFT; - if (!power) - power = 1; + if (!capacity) + capacity = 1; - cpu_rq(cpu)->cpu_power = power; - sdg->sgc->capacity = power; + cpu_rq(cpu)->cpu_capacity = capacity; + sdg->sgc->capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) @@ -5712,7 +5712,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) sdg->sgc->next_update = jiffies + interval; if (!child) { - update_cpu_power(sd, cpu); + update_cpu_capacity(sd, cpu); return; } @@ -5733,8 +5733,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * gets here before we've attached the domains to the * runqueues. * - * Use power_of(), which is set irrespective of domains - * in update_cpu_power(). + * Use capacity_of(), which is set irrespective of domains + * in update_cpu_capacity(). * * This avoids capacity/capacity_orig from being 0 and * causing divide-by-zero issues on boot. @@ -5742,8 +5742,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += power_of(cpu); - capacity += power_of(cpu); + capacity_orig += capacity_of(cpu); + capacity += capacity_of(cpu); continue; } @@ -5831,7 +5831,7 @@ static inline int sg_imbalanced(struct sched_group *group) /* * Compute the group capacity factor. * - * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by + * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores * and limit unit capacity with that. */ @@ -6129,7 +6129,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) /* * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by + * however we may be able to increase total CPU capacity used by * moving them. */ @@ -6190,7 +6190,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) + * its cpu_capacity, while calculating max_load..) 
*/ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { @@ -6345,11 +6345,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_power = 1; + unsigned long busiest_load = 0, busiest_capacity = 1; int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power, capacity_factor, wl; + unsigned long capacity, capacity_factor, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6377,8 +6377,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - power = power_of(i); - capacity_factor = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); + capacity = capacity_of(i); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -6386,25 +6386,25 @@ static struct rq *find_busiest_queue(struct lb_env *env, /* * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu power. + * which is not scaled with the cpu capacity. */ if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) continue; /* * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu power, so that - * the load can be moved away from the cpu that is potentially - * running at a lower capacity. + * the weighted_cpuload() scaled with the cpu capacity, so + * that the load can be moved away from the cpu that is + * potentially running at a lower capacity. * - * Thus we're looking for max(wl_i / power_i), crosswise + * Thus we're looking for max(wl_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out - * to: wl_i * power_j > wl_j * power_i; where j is our - * previous maximum. + * to: wl_i * capacity_j > wl_j * capacity_i; where j is + * our previous maximum. */ - if (wl * busiest_power > busiest_load * power) { + if (wl * busiest_capacity > busiest_load * capacity) { busiest_load = wl; - busiest_power = power; + busiest_capacity = capacity; busiest = rq; } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a5b957d53c92..956b8ca24893 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -567,7 +567,7 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; - unsigned long cpu_power; + unsigned long cpu_capacity; unsigned char idle_balance; /* For active balancing */ -- cgit v1.2.3 From ca8ce3d0b144c318a5a9ce99649053e9029061ea Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:39 -0400 Subject: sched: Final power vs. capacity cleanups It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. This contains the architecture visible changes. Incidentally, only ARM takes advantage of the available pow^H^H^Hcapacity scaling hooks and therefore those changes outside kernel/sched/ are confined to one ARM specific file. The default arch_scale_smt_power() hook is not overridden by anyone. Replacements are as follows: arch_scale_freq_power --> arch_scale_freq_capacity arch_scale_smt_power --> arch_scale_smt_capacity SCHED_POWER_SCALE --> SCHED_CAPACITY_SCALE SCHED_POWER_SHIFT --> SCHED_CAPACITY_SHIFT The local usage of "power" in arch/arm/kernel/topology.c is also changed to "capacity" as appropriate. 
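For illustration, this is roughly what an architecture override looks like once the hooks carry the new names. This is a hedged sketch only, not the actual arch/arm/kernel/topology.c change: the hook signature and SCHED_CAPACITY_SCALE come from the patch itself, while the per-CPU table and its default value are hypothetical.

#include <linux/percpu.h>
#include <linux/sched.h>

/* Hypothetical per-CPU capacity table; a real port would fill it in from
 * firmware or device-tree data instead of leaving the default value. */
static DEFINE_PER_CPU(unsigned long, cpu_capacity_scale) = SCHED_CAPACITY_SCALE;

/* Overrides the __weak arch_scale_freq_capacity() default under its new name. */
unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
        return per_cpu(cpu_capacity_scale, cpu);
}

update_cpu_capacity() multiplies by whatever the hook returns and then shifts right by SCHED_CAPACITY_SHIFT, so the return value is interpreted as a fraction of SCHED_CAPACITY_SCALE.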
Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Arnd Bergmann Cc: Dietmar Eggemann Cc: Grant Likely Cc: Linus Torvalds Cc: Mark Brown Cc: Rob Herring Cc: Russell King Cc: Sudeep KarkadaNagesha Cc: Vincent Guittot Cc: devicetree@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-48zba9qbznvglwelgq2cfygh@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 59 +++++++++++++++++++++++++++-------------------------- 2 files changed, 33 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 07bc78a50329..7ba4f5413a10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5249,7 +5249,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgc->capacity != SCHED_POWER_SCALE) { + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { printk(KERN_CONT " (cpu_capacity = %d)", group->sgc->capacity); } @@ -5715,7 +5715,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ - sg->sgc->capacity = SCHED_POWER_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->capacity_orig = sg->sgc->capacity; /* @@ -6921,7 +6921,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_POWER_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 58684f684fa8..dc7d6527a282 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1062,9 +1062,9 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->compute_capacity; + ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_POWER_SCALE); + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } @@ -4370,7 +4370,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgc->capacity; + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -5609,10 +5609,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd, static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { - return SCHED_POWER_SCALE; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { return default_scale_capacity(sd, cpu); } @@ -5627,7 +5627,7 @@ static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu return smt_gain; } -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) { return default_scale_smt_capacity(sd, cpu); } @@ -5658,10 
+5658,10 @@ static unsigned long scale_rt_capacity(int cpu) available = total - avg; } - if (unlikely((s64)total < SCHED_POWER_SCALE)) - total = SCHED_POWER_SCALE; + if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) + total = SCHED_CAPACITY_SCALE; - total >>= SCHED_POWER_SHIFT; + total >>= SCHED_CAPACITY_SHIFT; return div_u64(available, total); } @@ -5669,29 +5669,29 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long capacity = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) - capacity *= arch_scale_smt_power(sd, cpu); + capacity *= arch_scale_smt_capacity(sd, cpu); else capacity *= default_scale_smt_capacity(sd, cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; } sdg->sgc->capacity_orig = capacity; if (sched_feat(ARCH_POWER)) - capacity *= arch_scale_freq_power(sd, cpu); + capacity *= arch_scale_freq_capacity(sd, cpu); else capacity *= default_scale_capacity(sd, cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; capacity *= scale_rt_capacity(cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; if (!capacity) capacity = 1; @@ -5780,7 +5780,7 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_POWER_SCALE + * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; @@ -5845,11 +5845,11 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro cpus = group->group_weight; /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, capacity_orig); + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); capacity_factor = cpus / smt; /* cores */ capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE)); + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -5895,7 +5895,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Adjust by relative CPU capacity of the group */ sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; @@ -6089,7 +6089,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) env->imbalance = DIV_ROUND_CLOSEST( sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, - SCHED_POWER_SCALE); + SCHED_CAPACITY_SCALE); return 1; } @@ -6118,7 +6118,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) imbn = 1; scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_POWER_SCALE) / + (busiest->load_per_task * SCHED_CAPACITY_SCALE) / busiest->group_capacity; if (busiest->avg_load + scaled_busy_load_per_task >= @@ -6137,7 +6137,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) min(busiest->load_per_task, busiest->avg_load); capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - capa_now /= SCHED_POWER_SCALE; + 
capa_now /= SCHED_CAPACITY_SCALE; /* Amount of load we'd subtract */ if (busiest->avg_load > scaled_busy_load_per_task) { @@ -6148,16 +6148,16 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) /* Amount of load we'd add */ if (busiest->avg_load * busiest->group_capacity < - busiest->load_per_task * SCHED_POWER_SCALE) { + busiest->load_per_task * SCHED_CAPACITY_SCALE) { tmp = (busiest->avg_load * busiest->group_capacity) / local->group_capacity; } else { - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / + tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / local->group_capacity; } capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - capa_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ if (capa_move > capa_now) @@ -6207,7 +6207,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s load_above_capacity = (busiest->sum_nr_running - busiest->group_capacity_factor); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); load_above_capacity /= busiest->group_capacity; } @@ -6225,7 +6225,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->imbalance = min( max_pull * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity - ) / SCHED_POWER_SCALE; + ) / SCHED_CAPACITY_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -6279,7 +6279,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_capacity; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) + / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -6378,7 +6379,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); -- cgit v1.2.3 From 5d4dfddd4f02b028d6ddaaa04d75d3b0cad1c9ae Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 27 May 2014 13:50:41 -0400 Subject: sched: Rename capacity related flags It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. Let's rename the following feature flags since they do relate to capacity: SD_SHARE_CPUPOWER -> SD_SHARE_CPUCAPACITY ARCH_POWER -> ARCH_CAPACITY NONTASK_POWER -> NONTASK_CAPACITY Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Andy Fleming Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Linus Torvalds Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: Preeti U Murthy Cc: Rob Herring Cc: Srivatsa S. 
Bhat Cc: Toshi Kani Cc: Vasant Hegde Cc: Vincent Guittot Cc: devicetree@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/n/tip-e93lpnxb87owfievqatey6b5@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 +++++++------- kernel/sched/fair.c | 8 ++++---- kernel/sched/features.h | 8 ++++---- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7ba4f5413a10..5976ca579d3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -872,7 +872,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) sched_rt_avg_update(rq, irq_delta + steal); #endif } @@ -5309,7 +5309,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | + SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5340,7 +5340,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | + SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | SD_SHARE_POWERDOMAIN); @@ -5947,7 +5947,7 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. * - * SD_SHARE_CPUPOWER - describes SMT topologies + * SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain @@ -5956,7 +5956,7 @@ static int sched_domains_curr_level; * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUPOWER | \ + (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ @@ -6002,7 +6002,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE | 1*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUPOWER + | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING @@ -6024,7 +6024,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) * Convert topological properties into behaviour. 
*/ - if (sd->flags & SD_SHARE_CPUPOWER) { + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dc7d6527a282..d3c731222199 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5672,8 +5672,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - if (sched_feat(ARCH_POWER)) + if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { + if (sched_feat(ARCH_CAPACITY)) capacity *= arch_scale_smt_capacity(sd, cpu); else capacity *= default_scale_smt_capacity(sd, cpu); @@ -5683,7 +5683,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_POWER)) + if (sched_feat(ARCH_CAPACITY)) capacity *= arch_scale_freq_capacity(sd, cpu); else capacity *= default_scale_capacity(sd, cpu); @@ -5782,7 +5782,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ - if (!(sd->flags & SD_SHARE_CPUPOWER)) + if (!(sd->flags & SD_SHARE_CPUCAPACITY)) return 0; /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5716929a2e3a..90284d117fe6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(WAKEUP_PREEMPTION, true) /* - * Use arch dependent cpu power functions + * Use arch dependent cpu capacity functions */ -SCHED_FEAT(ARCH_POWER, true) +SCHED_FEAT(ARCH_CAPACITY, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) /* - * Decrement CPU power based on time not spent running tasks + * Decrement CPU capacity based on time not spent running tasks */ -SCHED_FEAT(NONTASK_POWER, true) +SCHED_FEAT(NONTASK_CAPACITY, true) /* * Queue remote wakeups on the target CPU and process them -- cgit v1.2.3 From 0b07939cbfdd05bed0c5ec01b8b25493e6ecd34c Mon Sep 17 00:00:00 2001 From: Giedrius Rekasius Date: Sun, 25 May 2014 15:23:31 +0100 Subject: sched: Remove redundant assignment to "rt_rq" in update_curr_rt(...) Variable "rt_rq" is used only in block "for_each_sched_rt_entity" so the value assigned to it at the beginning of the update_curr_rt(...) gets overwritten without ever being read. Remove redundant assignment and move variable declaration to the block in which it is being used. 
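The shape of the cleanup, shown on a stand-alone example rather than the scheduler code (a hedged illustration only; the actual two-line hunk follows below):

struct item { struct item *next; long weight; };

static long heaviest(struct item *head)
{
        struct item *it;
        long max = 0;

        for (it = head; it; it = it->next) {
                /* Declared in the only block that reads it; the old style
                 * assigned it once before the loop, where the value was
                 * never read. */
                long w = it->weight;

                if (w > max)
                        max = w;
        }
        return max;
}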
Signed-off-by: Giedrius Rekasius Signed-off-by: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1401027811-30066-1-git-send-email-giedrius.rekasius@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0ebfd7a29472..43406db306af 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -924,7 +924,6 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); u64 delta_exec; if (curr->sched_class != &rt_sched_class) @@ -949,7 +948,7 @@ static void update_curr_rt(struct rq *rq) return; for_each_sched_rt_entity(rt_se) { - rt_rq = rt_rq_of_se(rt_se); + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); -- cgit v1.2.3 From dfc68f29ae67f2a6e799b44e6a4eb3417dffbfcd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:15 -0700 Subject: sched, trace: Add a tracepoint for IPI-less remote wakeups Remote wakeups of polling CPUs are a valuable performance improvement; add a tracepoint to make it much easier to verify that they're working. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: umgwanakikbuti@gmail.com Cc: David Ahern Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mel Gorman Cc: Oleg Nesterov Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/16205aee116772aa686814f9b13bccb562108047.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5976ca579d3e..e4c0ddd3db8e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -564,6 +564,8 @@ void resched_task(struct task_struct *p) if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } void resched_cpu(int cpu) @@ -647,6 +649,8 @@ static void wake_up_idle_cpu(int cpu) smp_mb(); if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } static bool wake_up_full_nohz_cpu(int cpu) -- cgit v1.2.3 From 82c65d60d64401aedc1006d6572469bbfdf148de Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:16 -0700 Subject: sched/idle: Clear polling before descheduling the idle thread Currently, the only real guarantee provided by the polling bit is that, if you hold rq->lock and the polling bit is set, then you can set need_resched to force a reschedule. The only reason the lock is needed is that the idle thread might not be running at all when setting its need_resched bit, and rq->lock keeps it pinned. This is easy to fix: just clear the polling bit before scheduling. Now the idle thread's polling bit is only ever set when rq->curr == rq->idle. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: umgwanakikbuti@gmail.com Cc: Rafael J. 
Wysocki Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/b2059fcb4c613d520cb503b6fad6e47033c7c203.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 25b9423abce9..fe4b24bf33ca 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -67,6 +67,10 @@ void __weak arch_cpu_idle(void) * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here + * + * On archs that support TIF_POLLING_NRFLAG, is called with polling + * set, and it returns with polling set. If it ever stops polling, it + * must clear the polling bit. */ static void cpuidle_idle_call(void) { @@ -175,10 +179,22 @@ exit_idle: /* * Generic idle loop implementation + * + * Called with polling cleared. */ static void cpu_idle_loop(void) { while (1) { + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if + * rq->curr != rq->idle). This means that, if rq->idle has + * the polling bit set, then setting need_resched is + * guaranteed to cause the cpu to reschedule. + */ + + __current_set_polling(); tick_nohz_idle_enter(); while (!need_resched()) { @@ -218,6 +234,15 @@ static void cpu_idle_loop(void) */ preempt_set_need_resched(); tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to reschedule if need_resched is set while + * polling is set. That means that clearing polling + * needs to be visible before rescheduling. + */ + smp_mb__after_atomic(); + schedule_preempt_disabled(); } } @@ -239,7 +264,6 @@ void cpu_startup_entry(enum cpuhp_state state) */ boot_init_stack_canary(); #endif - __current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); } -- cgit v1.2.3 From 67b9ca70c3030e832999e8d1cdba2984c7bb5bfc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:17 -0700 Subject: sched/idle: Simplify wake_up_idle_cpu() Now that rq->idle's polling bit is a reliable indication that the cpu is polling, use it. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: Rafael J. Wysocki Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: umgwanakikbuti@gmail.com Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/922f00761445a830ebb23d058e2ae53956ce2d73.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e4c0ddd3db8e..6afbfeefddd6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -628,26 +628,7 @@ static void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; - /* - * This is safe, as this function is called with the timer - * wheel base lock of (cpu) held. When the CPU is on the way - * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new - * timer into account automatically. - */ - if (rq->curr != rq->idle) - return; - - /* - * We can set TIF_RESCHED on the idle task of the other CPU - * lockless. 
The worst case is that the other CPU runs the - * idle task through an additional NOOP schedule() - */ - set_tsk_need_resched(rq->idle); - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) + if (set_nr_and_not_polling(rq->idle)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); -- cgit v1.2.3 From e3baac47f0e82c4be632f4f97215bb93bf16b342 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Jun 2014 10:31:18 -0700 Subject: sched/idle: Optimize try-to-wake-up IPI [ This series reduces the number of IPIs on Andy's workload by something like 99%. It's down from many hundreds per second to very few. The basic idea behind this series is to make TIF_POLLING_NRFLAG be a reliable indication that the idle task is polling. Once that's done, the rest is reasonably straightforward. ] When enqueueing tasks on remote LLC domains, we send an IPI to do the work 'locally' and avoid bouncing all the cachelines over. However, when the remote CPU is idle (and polling, say x86 mwait), we don't need to send an IPI, we can simply kick the TIF word to wake it up and have the 'idle' loop do the work. So when _TIF_POLLING_NRFLAG is set, but _TIF_NEED_RESCHED is not (yet) set, set _TIF_NEED_RESCHED and avoid sending the IPI. Much-requested-by: Andy Lutomirski Signed-off-by: Peter Zijlstra [Edited by Andy Lutomirski, but this is mostly Peter Zijlstra's code.] Signed-off-by: Andy Lutomirski Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: Mike Galbraith Cc: umgwanakikbuti@gmail.com Cc: Rafael J. Wysocki Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/ce06f8b02e7e337be63e97597fc4b248d3aa6f9b.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++------ kernel/sched/idle.c | 10 +++++++--- kernel/sched/sched.h | 6 ++++++ 3 files changed, 61 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6afbfeefddd6..60d4e05d64dd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -519,7 +519,7 @@ static inline void init_hrtick(void) __old; \ }) -#ifdef TIF_POLLING_NRFLAG +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, * this avoids any races wrt polling state changes and thereby avoids @@ -530,12 +530,44 @@ static bool set_nr_and_not_polling(struct task_struct *p) struct thread_info *ti = task_thread_info(p); return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); } + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. 
+ */ +static bool set_nr_if_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + + for (;;) { + if (!(val & _TIF_POLLING_NRFLAG)) + return false; + if (val & _TIF_NEED_RESCHED) + return true; + old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); + if (old == val) + break; + val = old; + } + return true; +} + #else static bool set_nr_and_not_polling(struct task_struct *p) { set_tsk_need_resched(p); return true; } + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ + return false; +} +#endif #endif /* @@ -1490,13 +1522,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; + unsigned long flags; - raw_spin_lock(&rq->lock); + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); @@ -1504,7 +1540,7 @@ static void sched_ttwu_pending(void) ttwu_do_activate(rq, p, 0); } - raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&rq->lock, flags); } void scheduler_ipi(void) @@ -1550,8 +1586,14 @@ void scheduler_ipi(void) static void ttwu_queue_remote(struct task_struct *p, int cpu) { - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) - smp_send_reschedule(cpu); + struct rq *rq = cpu_rq(cpu); + + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); + } } bool cpus_share_cache(int this_cpu, int that_cpu) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index fe4b24bf33ca..cf009fb0bc25 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -12,6 +12,8 @@ #include +#include "sched.h" + static int __read_mostly cpu_idle_force_poll; void cpu_idle_poll_ctrl(bool enable) @@ -237,12 +239,14 @@ static void cpu_idle_loop(void) __current_clr_polling(); /* - * We promise to reschedule if need_resched is set while - * polling is set. That means that clearing polling - * needs to be visible before rescheduling. + * We promise to call sched_ttwu_pending and reschedule + * if need_resched is set while polling is set. That + * means that clearing polling needs to be visible + * before doing these things. */ smp_mb__after_atomic(); + sched_ttwu_pending(); schedule_preempt_disabled(); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 956b8ca24893..2f8636199b83 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *); #ifdef CONFIG_SMP +extern void sched_ttwu_pending(void); + #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#else + +static inline void sched_ttwu_pending(void) { } + #endif /* CONFIG_SMP */ #include "stats.h" -- cgit v1.2.3 From ebf905fc7a6e7c99c53b5afc888d8f950da90aff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 May 2014 19:00:24 +0200 Subject: perf: Fix use after free in perf_remove_from_context() While that mutex should guard the elements, it doesn't guard against the use-after-free that's from list_for_each_entry_rcu(). 
__perf_event_exit_task() can actually free the event. And because list addition/deletion is guarded by both ctx->mutex and ctx->lock, holding ctx->mutex is sufficient for reading the list, so we don't actually need the rcu list iteration. Fixes: 3a497f48637e ("perf: Simplify perf_event_exit_task_context()") Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: Dave Jones Cc: acme@ghostprotocols.net Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140529170024.GA2315@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ed50b0943213..a62d142ad498 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7431,7 +7431,7 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event; + struct perf_event *child_event, *next; struct perf_event_context *child_ctx; unsigned long flags; @@ -7485,7 +7485,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ mutex_lock(&child_ctx->mutex); - list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry) + list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) __perf_event_exit_task(child_event, child_ctx, child); mutex_unlock(&child_ctx->mutex); -- cgit v1.2.3 From 53b25335dd60981ad608da7890420898a34469a6 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 16 May 2014 17:12:12 -0400 Subject: perf: Disable sampled events if no PMU interrupt Add common code to generate -ENOTSUPP at event creation time if an architecture attempts to create a sampled event and PERF_PMU_NO_INTERRUPT is set. This adds a new pmu->capabilities flag. Initially we only support PERF_PMU_NO_INTERRUPT (to indicate a PMU has no support for generating hardware interrupts) but there are other capabilities that can be added later. Signed-off-by: Vince Weaver Acked-by: Will Deacon [peterz: rename to PERF_PMU_CAP_* and moved the pmu::capabilities word into a hole] Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1405161708060.11099@vincent-weaver-1.umelst.maine.edu Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a62d142ad498..e9ef0c6646af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7120,6 +7120,13 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (is_sampling_event(event)) { + if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { + err = -ENOTSUPP; + goto err_alloc; + } + } + account_event(event); /* -- cgit v1.2.3 From 41ccba029e9492dd0fc1bf5c23b72c6322a6dfe9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 May 2014 20:40:54 +0200 Subject: uprobes: Shift ->readpage check from __copy_insn() to uprobe_register() copy_insn() fails with -EIO if ->readpage == NULL, but this error is not propagated unless uprobe_register() path finds ->mm which already mmaps this file. In this case (say) "perf record" does not actually install the probe, but the user can't know about this. Move this check into uprobe_register() so that this problem can be detected earlier and reported to user. 
Note: this is still not perfect, - copy_insn() and arch_uprobe_analyze_insn() should be called by uprobe_register() but this is not simple, we need vm_file for read_mapping_page() (although perhaps we can pass NULL), and we need ->mm for is_64bit_mm() (although this logic is broken anyway). - uprobe_register() should be called by create_trace_uprobe(), not by probe_event_enable(), so that an error can be detected at "perf probe -x" time. This also needs more changes in the core uprobe code, uprobe register/unregister interface was poorly designed from the very beginning. Reported-by: Denys Vlasenko Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra Cc: Hugh Dickins Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140519184054.GA6750@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3b02c72938a8..c56b13e3b5e1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -536,9 +536,6 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, void *insn, int nbytes, loff_t offset) { struct page *page; - - if (!mapping->a_ops->readpage) - return -EIO; /* * Ensure that the page that has the original instruction is * populated and in page-cache. @@ -879,6 +876,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (!uc->handler && !uc->ret_handler) return -EINVAL; + /* copy_insn()->read_mapping_page() needs ->readpage() */ + if (!inode->i_mapping->a_ops->readpage) + return -EIO; /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; -- cgit v1.2.3 From 40814f6805a524130a5ec313871147f7aa9b5318 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 May 2014 20:41:36 +0200 Subject: uprobes: Teach copy_insn() to support tmpfs tmpfs is widely used but as Denys reports shmem_aops doesn't have ->readpage() and thus you can't probe a binary on this filesystem. As Hugh suggested we can use shmem_read_mapping_page() in this case, just we need to check shmem_mapping() if ->readpage == NULL. Reported-by: Denys Vlasenko Suggested-by: Hugh Dickins Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140519184136.GB6750@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c56b13e3b5e1..6bfb6717f878 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -36,6 +36,7 @@ #include "../../mm/internal.h" /* munlock_vma_page */ #include #include +#include #include @@ -537,10 +538,14 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, { struct page *page; /* - * Ensure that the page that has the original instruction is - * populated and in page-cache. + * Ensure that the page that has the original instruction is populated + * and in page-cache. If ->readpage == NULL it must be shmem_mapping(), + * see uprobe_register(). 
*/ - page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + if (mapping->a_ops->readpage) + page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + else + page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); @@ -876,8 +881,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (!uc->handler && !uc->ret_handler) return -EINVAL; - /* copy_insn()->read_mapping_page() needs ->readpage() */ - if (!inode->i_mapping->a_ops->readpage) + /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ + if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) return -EIO; /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) -- cgit v1.2.3 From f602d0632755be4f7eddfdd6c0af13216790177b Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 13 May 2014 16:40:05 -0400 Subject: sched/deadline: Delete extraneous extern for to_ratio() There was a prototype for it added to kernel/sched/sched.h at the same time the extern was added, so the extern in the C file was never really ever needed. See commit 332ac17ef5bfcff4766dfdfd3b4cdf10b8f8f155 ("sched/deadline: Add bandwidth management for SCHED_DEADLINE tasks") for details. Signed-off-by: Paul Gortmaker Cc: Peter Zijlstra Cc: Dario Faggioli Link: http://lkml.kernel.org/r/1400013605-18717-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f9ca7d19781a..0d6b17057188 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) dl_b->dl_runtime = runtime; } -extern unsigned long to_ratio(u64 period, u64 runtime); - void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); -- cgit v1.2.3 From c731ae1d0f02a300697a8b1564780ad28a6c2013 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 5 Jun 2014 17:16:30 +0800 Subject: cgroup: disallow disabled controllers on the default hierarchy After booting with cgroup_disable=memory, I still saw memcg files in the default hierarchy, and I can write to them, though it won't take effect. # dmesg ... Disabling memory control group subsystem ... # mount -t cgroup -o __DEVEL__sane_behavior xxx /cgroup # ls /cgroup ... memory.failcnt memory.move_charge_at_immigrate memory.force_empty memory.numa_stat memory.limit_in_bytes memory.oom_control ... # cat /cgroup/memory.usage_in_bytes 0 tj: Minor comment update. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f46165829a4..3edcc8ae83b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3069,6 +3069,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; + if (ss->disabled) + return 0; + if (!cfts || cfts[0].name[0] == '\0') return 0; @@ -4678,8 +4681,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) BUG_ON(online_css(css)); - cgrp_dfl_root.subsys_mask |= 1 << ss->id; - mutex_unlock(&cgroup_mutex); } @@ -4758,11 +4759,14 @@ int __init cgroup_init(void) &cgrp_dfl_root.cgrp.e_csets[ssid]); /* - * cftype registration needs kmalloc and can't be done - * during early_init. Register base cftypes separately. 
+ * Setting dfl_root subsys_mask needs to consider the + * disabled flag and cftype registration needs kmalloc, + * both of which aren't available during early_init. */ - if (ss->base_cftypes) + if (!ss->disabled) { + cgrp_dfl_root.subsys_mask |= 1 << ss->id; WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + } } cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); -- cgit v1.2.3 From 939c7a4f04fcd2162109744e8bf88194948a6e65 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Thu, 5 Jun 2014 10:24:27 +0900 Subject: tracing: Introduce saved_cmdlines_size file Introduce saved_cmdlines_size file for changing the number of saved pid-comms. saved_cmdlines currently stores 128 command names using SAVED_CMDLINES, but 'no-existing processes' names are often lost in saved_cmdlines when we read the trace data. So, by introducing saved_cmdlines_size file, we can now change the 128 command names saved to something much larger if needed. When we write a value to saved_cmdlines_size, the number of the value will be stored in pid-comm list: # echo 1024 > /sys/kernel/debug/tracing/saved_cmdlines_size Here, 1024 command names can be stored. The default number is 128 and the maximum number is PID_MAX_DEFAULT (=32768 if CONFIG_BASE_SMALL is not set). So, if we want to avoid losing any command names, we need to set 32768 to saved_cmdlines_size. We can read the maximum number of the list: # cat /sys/kernel/debug/tracing/saved_cmdlines_size 128 Link: http://lkml.kernel.org/p/20140605012427.22115.16173.stgit@yunodevel Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 178 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 154 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 135af323608b..e29edee1542a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1285,22 +1285,71 @@ void tracing_reset_all_online_cpus(void) } } -#define SAVED_CMDLINES 128 +#define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX -static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; -static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; -static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; -static int cmdline_idx; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char *saved_cmdlines; +}; +static struct saved_cmdlines_buffer *savedcmd; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; -static void trace_init_cmdlines(void) +static inline char *get_saved_cmdlines(int idx) +{ + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) { - memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); - cmdline_idx = 0; + memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static int allocate_cmdlines_buffer(unsigned int val, + struct saved_cmdlines_buffer *s) +{ + s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), + GFP_KERNEL); + if (!s->map_cmdline_to_pid) + return -ENOMEM; + + s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); + if (!s->saved_cmdlines) { + kfree(s->map_cmdline_to_pid); + return -ENOMEM; + } + + s->cmdline_idx = 0; + s->cmdline_num = val; + 
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return 0; +} + +static int trace_create_savedcmd(void) +{ + int ret; + + savedcmd = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); + if (!savedcmd) + return -ENOMEM; + + ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); + if (ret < 0) { + kfree(savedcmd); + savedcmd = NULL; + return -ENOMEM; + } + + return 0; } int is_tracing_stopped(void) @@ -1457,9 +1506,9 @@ static int trace_save_cmdline(struct task_struct *tsk) if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; - idx = map_pid_to_cmdline[tsk->pid]; + idx = savedcmd->map_pid_to_cmdline[tsk->pid]; if (idx == NO_CMDLINE_MAP) { - idx = (cmdline_idx + 1) % SAVED_CMDLINES; + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; /* * Check whether the cmdline buffer at idx has a pid @@ -1467,17 +1516,17 @@ static int trace_save_cmdline(struct task_struct *tsk) * need to clear the map_pid_to_cmdline. Otherwise we * would read the new comm for the old pid. */ - pid = map_cmdline_to_pid[idx]; + pid = savedcmd->map_cmdline_to_pid[idx]; if (pid != NO_CMDLINE_MAP) - map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; - map_cmdline_to_pid[idx] = tsk->pid; - map_pid_to_cmdline[tsk->pid] = idx; + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + savedcmd->map_pid_to_cmdline[tsk->pid] = idx; - cmdline_idx = idx; + savedcmd->cmdline_idx = idx; } - memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); @@ -1503,9 +1552,9 @@ static void __trace_find_cmdline(int pid, char comm[]) return; } - map = map_pid_to_cmdline[pid]; + map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - strcpy(comm, saved_cmdlines[map]); + strcpy(comm, get_saved_cmdlines(map)); else strcpy(comm, "<...>"); } @@ -3593,6 +3642,7 @@ static const char readme_msg[] = " trace_options\t\t- Set format or modify how tracing happens\n" "\t\t\t Disable an option by adding a suffix 'no' to the\n" "\t\t\t option name\n" + " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" @@ -3715,7 +3765,8 @@ static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; - for (; ptr < &map_cmdline_to_pid[SAVED_CMDLINES]; ptr++) { + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) continue; @@ -3733,7 +3784,7 @@ static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) preempt_disable(); arch_spin_lock(&trace_cmdline_lock); - v = &map_cmdline_to_pid[0]; + v = &savedcmd->map_cmdline_to_pid[0]; while (l <= *pos) { v = saved_cmdlines_next(m, v, &l); if (!v) @@ -3781,6 +3832,79 @@ static const struct file_operations tracing_saved_cmdlines_fops = { .release = seq_release, }; +static ssize_t +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + arch_spin_lock(&trace_cmdline_lock); + r = sprintf(buf, "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer 
*s) +{ + kfree(s->saved_cmdlines); + kfree(s->map_cmdline_to_pid); + kfree(s); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; + + s = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); + if (!s) + return -ENOMEM; + + if (allocate_cmdlines_buffer(val, s) < 0) { + kfree(s); + return -ENOMEM; + } + + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + free_saved_cmdlines_buffer(savedcmd_temp); + + return 0; +} + +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* must have at least 1 entry or less than PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; + + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations tracing_saved_cmdlines_size_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, +}; + static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -6375,6 +6499,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); + trace_create_file("saved_cmdlines_size", 0644, d_tracer, + NULL, &tracing_saved_cmdlines_size_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -6611,18 +6738,19 @@ __init static int tracer_alloc_buffers(void) if (!temp_buffer) goto out_free_cpumask; + if (trace_create_savedcmd() < 0) + goto out_free_temp_buffer; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - goto out_free_temp_buffer; + goto out_free_savedcmd; } if (global_trace.buffer_disabled) tracing_off(); - trace_init_cmdlines(); - if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) @@ -6668,6 +6796,8 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_savedcmd: + free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: ring_buffer_free(temp_buffer); out_free_cpumask: -- cgit v1.2.3 From 72e2fe38eac2dbf258d4295d75f78b123dd5b823 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 5 Jun 2014 20:35:30 -0400 Subject: tracing: Convert stddev into u64 in tracepoint benchmark I've been told that do_div() expects an unsigned 64 bit number, and is undefined if a signed is used. This gave a warning on the MIPS build. I'm not sure if a signed 64 bit dividend is really an issue or not, but the calculation this is used for is standard deviation, and that isn't going to be negative. We can just convert it to unsigned and be safe. 
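To see why an unsigned type is enough, note that the quantity being divided is the numerator of the sample variance, n * Sum(x_i^2) - (Sum x_i)^2, which is non-negative for any real samples. A small userspace sketch with made-up latency values (uint64_t standing in for the kernel's u64, plain division standing in for do_div()):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      /* hypothetical write latencies in ns, standing in for the benchmark deltas */
      uint64_t x[] = { 120, 135, 128, 131, 127 };
      uint64_t n = sizeof(x) / sizeof(x[0]);
      uint64_t total = 0, totalsq = 0;

      for (uint64_t i = 0; i < n; i++) {
          total += x[i];
          totalsq += x[i] * x[i];
      }

      /* n * Sum(x^2) - (Sum x)^2 is never negative, so u64 holds it safely */
      uint64_t stddev2 = (n * totalsq - total * total) / (n * (n - 1));
      printf("std^2 = %llu\n", (unsigned long long)stddev2);
      return 0;
  }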
Reported-by: David Daney Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index a10adc7095cd..8bd3365a65b2 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -33,7 +33,7 @@ static void trace_do_benchmark(void) u64 start; u64 stop; u64 delta; - s64 stddev; + u64 stddev; u64 seed; u64 last_seed; unsigned int avg; -- cgit v1.2.3 From 34839f5a69989c0ee48386a788fba37eb75910f7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 5 Jun 2014 23:34:02 -0400 Subject: tracing: Only calculate stats of tracepoint benchmarks for 2^32 times When calculating the average and standard deviation, it is required that the count be less than UINT_MAX, otherwise the do_div() will get undefined results. After 2^32 counts of data, the average and standard deviation should pretty much be set anyway. Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 8bd3365a65b2..40a14cbcf8e0 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -16,7 +16,10 @@ static u64 bm_last; static u64 bm_max; static u64 bm_min; static u64 bm_first; -static s64 bm_cnt; +static u64 bm_cnt; +static u64 bm_stddev; +static unsigned int bm_avg; +static unsigned int bm_std; /* * This gets called in a loop recording the time it took to write @@ -66,22 +69,35 @@ static void trace_do_benchmark(void) bm_last = delta; - bm_total += delta; - bm_totalsq += delta * delta; - if (delta > bm_max) bm_max = delta; if (!bm_min || delta < bm_min) bm_min = delta; + /* + * When bm_cnt is greater than UINT_MAX, it breaks the statistics + * accounting. Freeze the statistics when that happens. + * We should have enough data for the avg and stddev anyway. + */ + if (bm_cnt > UINT_MAX) { + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); + return; + } + + bm_total += delta; + bm_totalsq += delta * delta; + + if (bm_cnt > 1) { /* * Apply Welford's method to calculate standard deviation: * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) */ stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; - do_div(stddev, bm_cnt); - do_div(stddev, bm_cnt - 1); + do_div(stddev, (u32)bm_cnt); + do_div(stddev, (u32)bm_cnt - 1); } else stddev = 0; @@ -119,6 +135,10 @@ static void trace_do_benchmark(void) scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", bm_last, bm_first, bm_max, bm_min, avg, std, stddev); + + bm_std = std; + bm_avg = avg; + bm_stddev = stddev; } static int benchmark_event_kthread(void *arg) @@ -170,6 +190,9 @@ void trace_benchmark_unreg(void) bm_max = 0; bm_min = 0; bm_cnt = 0; - /* bm_first doesn't need to be reset but reset it anyway */ + /* These don't need to be reset but reset them anyway */ bm_first = 0; + bm_std = 0; + bm_avg = 0; + bm_stddev = 0; } -- cgit v1.2.3 From e041e328c4b41e1db79bfe5ba9992c2ed771ad19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 May 2014 17:32:19 +0200 Subject: perf: Fix perf_event_comm() vs. 
exec() assumption perf_event_comm() assumes that set_task_comm() is only called on exec(), and in particular that its only called on current. Neither are true, as Dave reported a WARN triggered by set_task_comm() being called on !current. Separate the exec() hook from the comm hook. Reported-by: Dave Jones Signed-off-by: Peter Zijlstra Cc: Alexander Viro Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20140521153219.GH5226@laptop.programming.kicks-ass.net [ Build fix. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 440eefc67397..647698f91988 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2970,6 +2970,22 @@ out: local_irq_restore(flags); } +void perf_event_exec(void) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctx); + } + rcu_read_unlock(); +} + /* * Cross CPU call to read the hardware event */ @@ -5057,18 +5073,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) void perf_event_comm(struct task_struct *task) { struct perf_comm_event comm_event; - struct perf_event_context *ctx; - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } - rcu_read_unlock(); if (!atomic_read(&nr_comm_events)) return; -- cgit v1.2.3 From 82b897782d10fcc4930c9d4a15b175348fdd2871 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 28 May 2014 11:45:04 +0300 Subject: perf: Differentiate exec() and non-exec() comm events perf tools like 'perf report' can aggregate samples by comm strings, which generally works. However, there are other potential use-cases. For example, to pair up 'calls' with 'returns' accurately (from branch events like Intel BTS) it is necessary to identify whether the process has exec'd. Although a comm event is generated when an 'exec' happens it is also generated whenever the comm string is changed on a whim (e.g. by prctl PR_SET_NAME). This patch adds a flag to the comm event to differentiate one case from the other. In order to determine whether the kernel supports the new flag, a selection bit named 'exec' is added to struct perf_event_attr. The bit does nothing but will cause perf_event_open() to fail if the bit is set on kernels that do not have it defined. 
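A userspace consumer can rely on exactly that failure mode to probe for support before depending on the new semantics. A minimal sketch, assuming a UAPI header new enough to expose the bit (shown here as attr.comm_exec, the name used by the accompanying perf UAPI change; the dummy software event is only there to have something to open):

  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <sys/types.h>
  #include <unistd.h>
  #include <string.h>
  #include <stdio.h>

  static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                  int cpu, int group_fd, unsigned long flags)
  {
      return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
  }

  int main(void)
  {
      struct perf_event_attr attr;

      memset(&attr, 0, sizeof(attr));
      attr.size = sizeof(attr);
      attr.type = PERF_TYPE_SOFTWARE;
      attr.config = PERF_COUNT_SW_DUMMY;
      attr.comm = 1;
      attr.comm_exec = 1;    /* the new selection bit */

      long fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
      if (fd < 0) {
          /* open failed: the kernel predates the bit (EINVAL) or perf is unavailable */
          printf("comm_exec not supported, fall back to plain COMM records\n");
          return 1;
      }
      /* supported: COMM records generated by exec() carry PERF_RECORD_MISC_COMM_EXEC */
      printf("comm_exec supported\n");
      close((int)fd);
      return 0;
  }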
Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/537D9EBE.7030806@intel.com Cc: Paul Mackerras Cc: Dave Jones Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Alexander Viro Cc: Linus Torvalds Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8fac2056d51e..7da5e561e89a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5090,7 +5090,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) NULL); } -void perf_event_comm(struct task_struct *task) +void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; @@ -5104,7 +5104,7 @@ void perf_event_comm(struct task_struct *task) .event_id = { .header = { .type = PERF_RECORD_COMM, - .misc = 0, + .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ -- cgit v1.2.3 From 70af2f8a4f48d6cebdf92d533d3aef37853ce6de Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 3 Feb 2014 13:18:49 +0100 Subject: locking/rwlocks: Introduce 'qrwlocks' - fair, queued rwlocks This rwlock uses the arch_spin_lock_t as a waitqueue, and assuming the arch_spin_lock_t is a fair lock (ticket,mcs etc..) the resulting rwlock is a fair lock. It fits in the same 8 bytes as the regular rwlock_t by folding the reader and writer count into a single integer, using the remaining 4 bytes for the arch_spinlock_t. Architectures that can single-copy adress bytes can optimize queue_write_unlock() with a 0 write to the LSB (the write count). Performance as measured by Davidlohr Bueso (rwlock_t -> qrwlock_t): +--------------+-------------+---------------+ | Workload | #users | delta | +--------------+-------------+---------------+ | alltests | > 1400 | -4.83% | | custom | 0-100,> 100 | +1.43%,-1.57% | | high_systime | > 1000 | -2.61 | | shared | all | +0.32 | +--------------+-------------+---------------+ http://www.stgolabs.net/qrwlock-stuff/aim7-results-vs-rwsem_optsin/ Signed-off-by: Waiman Long [peterz: near complete rewrite] Signed-off-by: Peter Zijlstra Cc: Arnd Bergmann Cc: Linus Torvalds Cc: "Paul E.McKenney" Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-gac1nnl3wvs2ij87zv2xkdzq@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 7 +++ kernel/locking/Makefile | 1 + kernel/locking/qrwlock.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 kernel/locking/qrwlock.c (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index d2b32ac27a39..35536d9c0964 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -223,3 +223,10 @@ endif config MUTEX_SPIN_ON_OWNER def_bool y depends on SMP && !DEBUG_MUTEXES + +config ARCH_USE_QUEUE_RWLOCK + bool + +config QUEUE_RWLOCK + def_bool y if ARCH_USE_QUEUE_RWLOCK + depends on SMP diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index b8bdcd4785b7..8541bfdfd232 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o +obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o 
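Before the slowpath implementation below, it may help to see how the single 32-bit count word is split: the low byte carries the writer state and the upper 24 bits carry the reader count, so each reader adds a bias of 256 without disturbing the writer byte. A standalone userspace sketch (the constants mirror the companion asm-generic header and are quoted here purely for illustration):

  #include <stdint.h>
  #include <stdio.h>

  #define _QW_WAITING  0x01u              /* a writer is queued */
  #define _QW_LOCKED   0xffu              /* a writer holds the lock */
  #define _QW_WMASK    0xffu              /* writer state mask */
  #define _QR_SHIFT    8
  #define _QR_BIAS     (1u << _QR_SHIFT)  /* one reader */

  int main(void)
  {
      uint32_t cnts = 0;

      cnts += 3 * _QR_BIAS;               /* three readers enter */
      cnts |= _QW_WAITING;                /* a writer queues behind them */
      printf("readers=%u writer_byte=0x%02x\n",
             (unsigned)(cnts >> _QR_SHIFT), (unsigned)(cnts & _QW_WMASK));

      cnts -= 3 * _QR_BIAS;               /* readers drop out */
      /* both states live in one word, so the writer can flip
       * WAITING -> LOCKED with a single cmpxchg on that word */
      if ((cnts & _QW_WMASK) == _QW_WAITING)
          cnts = _QW_LOCKED;
      printf("writer_byte=0x%02x\n", (unsigned)(cnts & _QW_WMASK));
      return 0;
  }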
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c new file mode 100644 index 000000000000..fb5b8ac411a5 --- /dev/null +++ b/kernel/locking/qrwlock.c @@ -0,0 +1,133 @@ +/* + * Queue read/write lock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. + * + * Authors: Waiman Long + */ +#include +#include +#include +#include +#include +#include +#include + +/** + * rspin_until_writer_unlock - inc reader count & spin until writer is gone + * @lock : Pointer to queue rwlock structure + * @writer: Current queue rwlock writer status byte + * + * In interrupt context or at the head of the queue, the reader will just + * increment the reader count & wait until the writer releases the lock. + */ +static __always_inline void +rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) +{ + while ((cnts & _QW_WMASK) == _QW_LOCKED) { + arch_mutex_cpu_relax(); + cnts = smp_load_acquire((u32 *)&lock->cnts); + } +} + +/** + * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * @lock: Pointer to queue rwlock structure + */ +void queue_read_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* + * Readers come here when they cannot get the lock without waiting + */ + if (unlikely(in_interrupt())) { + /* + * Readers in interrupt context will spin until the lock is + * available without waiting in the queue. + */ + cnts = smp_load_acquire((u32 *)&lock->cnts); + rspin_until_writer_unlock(lock, cnts); + return; + } + atomic_sub(_QR_BIAS, &lock->cnts); + + /* + * Put the reader into the wait queue + */ + arch_spin_lock(&lock->lock); + + /* + * At the head of the wait queue now, wait until the writer state + * goes to 0 and then try to increment the reader count and get + * the lock. It is possible that an incoming writer may steal the + * lock in the interim, so it is necessary to check the writer byte + * to make sure that the write lock isn't taken. + */ + while (atomic_read(&lock->cnts) & _QW_WMASK) + arch_mutex_cpu_relax(); + + cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + rspin_until_writer_unlock(lock, cnts); + + /* + * Signal the next one in queue to become queue head + */ + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_read_lock_slowpath); + +/** + * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +void queue_write_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* Put the writer into the wait queue */ + arch_spin_lock(&lock->lock); + + /* Try to acquire the lock directly if no reader is present */ + if (!atomic_read(&lock->cnts) && + (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + goto unlock; + + /* + * Set the waiting flag to notify readers that a writer is pending, + * or wait for a previous writer to go away. 
+ */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if (!(cnts & _QW_WMASK) && + (atomic_cmpxchg(&lock->cnts, cnts, + cnts | _QW_WAITING) == cnts)) + break; + + arch_mutex_cpu_relax(); + } + + /* When no more readers, set the locked flag */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if ((cnts == _QW_WAITING) && + (atomic_cmpxchg(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) + break; + + arch_mutex_cpu_relax(); + } +unlock: + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_write_lock_slowpath); -- cgit v1.2.3 From dc81e5e3abb9f98a3cb6f269c0bee595b2c1235d Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Fri, 6 Jun 2014 07:35:17 +0900 Subject: tracing: Return error if ftrace_trace_arrays list is empty ftrace_trace_arrays links global_trace.list. However, global_trace is not added to ftrace_trace_arrays if trace_alloc_buffers() failed. As the result, ftrace_trace_arrays becomes an empty list. If ftrace_trace_arrays is an empty list, current top_trace_array() returns an invalid pointer. As the result, the kernel can induce memory corruption or panic. Current implementation does not check whether ftrace_trace_arrays is empty list or not. So, in this patch, if ftrace_trace_arrays is empty list, top_trace_array() returns NULL. Moreover, this patch makes all functions calling top_trace_array() handle it appropriately. Link: http://lkml.kernel.org/p/20140605223517.32311.99233.stgit@yunodevel Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 13 +++++++++++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 217207ad60b3..9e82551dd566 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -252,6 +252,9 @@ static inline struct trace_array *top_trace_array(void) { struct trace_array *tr; + if (list_empty(ftrace_trace_arrays.prev)) + return NULL; + tr = list_entry(ftrace_trace_arrays.prev, typeof(*tr), list); WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3ddfd8f62c05..f99e0b3bca8c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -574,6 +574,9 @@ int trace_set_clr_event(const char *system, const char *event, int set) { struct trace_array *tr = top_trace_array(); + if (!tr) + return -ENODEV; + return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -2065,6 +2068,9 @@ event_enable_func(struct ftrace_hash *hash, bool enable; int ret; + if (!tr) + return -ENODEV; + /* hash funcs only work with set_ftrace_filter */ if (!enabled || !param) return -EINVAL; @@ -2396,6 +2402,9 @@ static __init int event_trace_enable(void) char *token; int ret; + if (!tr) + return -ENODEV; + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { call = *iter; @@ -2442,6 +2451,8 @@ static __init int event_trace_init(void) int ret; tr = top_trace_array(); + if (!tr) + return -ENODEV; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -2535,6 +2546,8 @@ static __init void event_trace_self_tests(void) int ret; tr = top_trace_array(); + if (!tr) + return; pr_info("Running tests on trace events:\n"); -- cgit v1.2.3 From 748ec3a20eb44fce19af3cef04f8db8a8e7aead3 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Fri, 6 Jun 2014 07:35:20 +0900 Subject: tracing/kprobes: Avoid self tests if tracing is disabled on boot up If tracing is disabled on boot up, the kernel should not execute 
tracing self tests. The kernel should check whether tracing is disabled or not before executing any of the tracing self tests. Link: http://lkml.kernel.org/p/20140605223520.32311.56097.stgit@yunodevel Acked-by: Masami Hiramatsu Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 903ae28962be..ef2fba1f46b5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1377,6 +1377,9 @@ static __init int kprobe_trace_self_tests_init(void) struct trace_kprobe *tk; struct ftrace_event_file *file; + if (tracing_is_disabled()) + return -ENODEV; + target = kprobe_trace_selftest_target; pr_info("Testing kprobe tracing: "); -- cgit v1.2.3 From 23aaa3c18e33fe048671b419781b5e44175efafe Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 6 Jun 2014 00:01:46 -0400 Subject: tracing: Fix leak of ring buffer data when new instances creation fails Yoshihiro Yunomae reported that the ring buffer data for a trace instance does not get properly cleaned up when it fails. He proposed a patch that manually cleaned the data up and addad a bunch of labels. The labels are not needed because all trace array is allocated with a kzalloc which initializes it to 0 and all kfree()s can take a NULL pointer and will ignore it. Adding a new helper function free_trace_buffers() that can also take null buffers to free the buffers that were allocated by allocate_trace_buffers(). Link: http://lkml.kernel.org/r/20140605223522.32311.31664.stgit@yunodevel Reported-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e29edee1542a..26cfff38e2ab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6232,6 +6232,25 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } +static void free_trace_buffers(struct trace_array *tr) +{ + if (!tr) + return; + + if (tr->trace_buffer.buffer) { + ring_buffer_free(tr->trace_buffer.buffer); + tr->trace_buffer.buffer = NULL; + free_percpu(tr->trace_buffer.data); + } + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->max_buffer.buffer) { + ring_buffer_free(tr->max_buffer.buffer); + tr->max_buffer.buffer = NULL; + } +#endif +} + static int new_instance_create(const char *name) { struct trace_array *tr; @@ -6290,8 +6309,7 @@ static int new_instance_create(const char *name) return 0; out_free_tr: - if (tr->trace_buffer.buffer) - ring_buffer_free(tr->trace_buffer.buffer); + free_trace_buffers(tr); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); -- cgit v1.2.3 From bb3632c6101b2fad07e6246721466b984b1e0e9d Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Fri, 6 Jun 2014 05:40:17 -0700 Subject: PM / sleep: trace events for suspend/resume Adds trace events that give finer resolution into suspend/resume. These events are graphed in the timelines generated by the analyze_suspend.py script. They represent large areas of time consumed that are typical to suspend and resume. 
The event is triggered by calling the function "trace_suspend_resume" with three arguments: a string (the name of the event to be displayed in the timeline), an integer (case specific number, such as the power state or cpu number), and a boolean (where true is used to denote the start of the timeline event, and false to denote the end). The suspend_resume trace event reproduces the data that the machine_suspend trace event did, so the latter has been removed. Signed-off-by: Todd Brandt Acked-by: Steven Rostedt Signed-off-by: Rafael J. Wysocki --- kernel/cpu.c | 5 +++++ kernel/power/hibernate.c | 3 +++ kernel/power/process.c | 3 +++ kernel/power/suspend.c | 14 ++++++++++++-- 4 files changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a9e710eef0e2..76deba01322a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "smpboot.h" @@ -522,7 +523,9 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1); + trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { @@ -566,7 +569,9 @@ void __ref enable_nonboot_cpus(void) arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { + trace_suspend_resume(TPS("CPU_ON"), cpu, true); error = _cpu_up(cpu, 1); + trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { printk(KERN_INFO "CPU%d is up\n", cpu); continue; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index df88d55dc436..49e0a20fd010 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "power.h" @@ -292,7 +293,9 @@ static int create_image(int platform_mode) in_suspend = 1; save_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); diff --git a/kernel/power/process.c b/kernel/power/process.c index 06ec8869dbf1..0ca8d83e2369 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * Timeout for stopping processes @@ -175,6 +176,7 @@ void thaw_processes(void) struct task_struct *g, *p; struct task_struct *curr = current; + trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) atomic_dec(&system_freezing_cnt); pm_freezing = false; @@ -201,6 +203,7 @@ void thaw_processes(void) schedule(); printk("done.\n"); + trace_suspend_resume(TPS("thaw_processes"), 0, false); } void thaw_kernel_threads(void) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 963e6d0f050b..4dd8822f732a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -177,7 +177,9 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Finish; + trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); + trace_suspend_resume(TPS("freeze_processes"), 0, false); if (!error) return 0; @@ -240,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. 
*/ if (state == PM_SUSPEND_FREEZE) { + trace_suspend_resume(TPS("machine_suspend"), state, true); freeze_enter(); + trace_suspend_resume(TPS("machine_suspend"), state, false); goto Platform_wake; } @@ -256,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (!error) { *wakeup = pm_wakeup_pending(); if (!(suspend_test(TEST_CORE) || *wakeup)) { + trace_suspend_resume(TPS("machine_suspend"), + state, true); error = suspend_ops->enter(state); + trace_suspend_resume(TPS("machine_suspend"), + state, false); events_check_enabled = false; } syscore_resume(); @@ -294,7 +302,6 @@ int suspend_devices_and_enter(suspend_state_t state) if (need_suspend_ops(state) && !suspend_ops) return -ENOSYS; - trace_machine_suspend(state); if (need_suspend_ops(state) && suspend_ops->begin) { error = suspend_ops->begin(state); if (error) @@ -331,7 +338,6 @@ int suspend_devices_and_enter(suspend_state_t state) else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) freeze_ops->end(); - trace_machine_suspend(PWR_EVENT_EXIT); return error; Recover_platform: @@ -365,6 +371,7 @@ static int enter_state(suspend_state_t state) { int error; + trace_suspend_resume(TPS("suspend_enter"), state, true); if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { @@ -382,9 +389,11 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) freeze_begin(); + trace_suspend_resume(TPS("sync_filesystems"), 0, true); printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); error = suspend_prepare(state); @@ -394,6 +403,7 @@ static int enter_state(suspend_state_t state) if (suspend_test(TEST_FREEZER)) goto Finish; + trace_suspend_resume(TPS("suspend_enter"), state, false); pr_debug("PM: Entering %s sleep\n", pm_states[state].label); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); -- cgit v1.2.3 From a9fcaaac37b3baba1343f906f52aeb65c4d4e356 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 6 Jun 2014 23:17:28 -0400 Subject: tracing: Fix memory leak on instance deletion When an instance is created, it also gets a snapshot ring buffer allocated (with minimum of pages). But when it is deleted the snapshot buffer is not. There was a helper function added to match the allocation of these ring buffers to a way to free them, but it wasn't used by the deletion of an instance. Using that helper function solves this memory leak. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 26cfff38e2ab..16f7038d1f4d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6349,8 +6349,7 @@ static int instance_delete(const char *name) event_trace_del_tracer(tr); ftrace_destroy_function_files(tr); debugfs_remove_recursive(tr->dir); - free_percpu(tr->trace_buffer.data); - ring_buffer_free(tr->trace_buffer.buffer); + free_trace_buffers(tr); kfree(tr->name); kfree(tr); -- cgit v1.2.3 From 3d5c9340d1949733eb37616abd15db36aef9a57c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Jun 2014 12:34:23 +0200 Subject: rtmutex: Handle deadlock detection smarter Even in the case when deadlock detection is not requested by the caller, we can detect deadlocks. 
Right now the code stops the lock chain walk and keeps the waiter enqueued, even on itself. Silly not to yell when such a scenario is detected and to keep the waiter enqueued. Return -EDEADLK unconditionally and handle it at the call sites. The futex calls return -EDEADLK. The non futex ones dequeue the waiter, throw a warning and put the task into a schedule loop. Tagged for stable as it makes the code more robust. Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Brad Mouring Link: http://lkml.kernel.org/r/20140605152801.836501969@linutronix.de Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex-debug.h | 5 +++++ kernel/locking/rtmutex.c | 33 ++++++++++++++++++++++++++++----- kernel/locking/rtmutex.h | 5 +++++ 3 files changed, 38 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 14193d596d78..ab29b6a22669 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, { return (waiter != NULL); } + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ + debug_rt_mutex_print_deadlock(w); +} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a620d4d08ca6..eb7a46327798 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -314,7 +314,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } put_task_struct(task); - return deadlock_detect ? -EDEADLK : 0; + return -EDEADLK; } retry: /* @@ -377,7 +377,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); - ret = deadlock_detect ? -EDEADLK : 0; + ret = -EDEADLK; goto out_unlock_pi; } @@ -548,7 +548,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * which is wrong, as the other waiter is not in a deadlock * situation. */ - if (detect_deadlock && owner == task) + if (owner == task) return -EDEADLK; raw_spin_lock_irqsave(&task->pi_lock, flags); @@ -763,6 +763,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static void rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_waiter *w) +{ + /* + * If the result is not -EDEADLOCK or the caller requested + * deadlock detection, nothing to do here. + */ + if (res != -EDEADLOCK || detect_deadlock) + return; + + /* + * Yell lowdly and stop the task right here. 
+ */ + rt_mutex_print_deadlock(w); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } +} + /* * Slow path lock function: */ @@ -802,8 +822,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(TASK_RUNNING); - if (unlikely(ret)) + if (unlikely(ret)) { remove_waiter(lock, &waiter); + rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); + } /* * try_to_take_rt_mutex() sets the waiter bit @@ -1112,7 +1134,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return 1; } - ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); if (ret && !rt_mutex_owner(lock)) { /* diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index a1a1dd06421d..f6a1f3c133b1 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -24,3 +24,8 @@ #define debug_rt_mutex_print_deadlock(w) do { } while (0) #define debug_rt_mutex_detect_deadlock(w,d) (d) #define debug_rt_mutex_reset_waiter(w) do { } while (0) + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ + WARN(1, "rtmutex deadlock detected\n"); +} -- cgit v1.2.3 From 82084984383babe728e6e3c9a8e5c46278091315 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Jun 2014 11:16:12 +0200 Subject: rtmutex: Detect changes in the pi lock chain When we walk the lock chain, we drop all locks after each step. So the lock chain can change under us before we reacquire the locks. That's harmless in principle as we just follow the wrong lock path. But it can lead to a false positive in the dead lock detection logic: T0 holds L0 T0 blocks on L1 held by T1 T1 blocks on L2 held by T2 T2 blocks on L3 held by T3 T4 blocks on L4 held by T4 Now we walk the chain lock T1 -> lock L2 -> adjust L2 -> unlock T1 -> lock T2 -> adjust T2 -> drop locks T2 times out and blocks on L0 Now we continue: lock T2 -> lock L0 -> deadlock detected, but it's not a deadlock at all. Brad tried to work around that in the deadlock detection logic itself, but the more I looked at it the less I liked it, because it's crystal ball magic after the fact. We actually can detect a chain change very simple: lock T1 -> lock L2 -> adjust L2 -> unlock T1 -> lock T2 -> adjust T2 -> next_lock = T2->pi_blocked_on->lock; drop locks T2 times out and blocks on L0 Now we continue: lock T2 -> if (next_lock != T2->pi_blocked_on->lock) return; So if we detect that T2 is now blocked on a different lock we stop the chain walk. That's also correct in the following scenario: lock T1 -> lock L2 -> adjust L2 -> unlock T1 -> lock T2 -> adjust T2 -> next_lock = T2->pi_blocked_on->lock; drop locks T3 times out and drops L3 T2 acquires L3 and blocks on L4 now Now we continue: lock T2 -> if (next_lock != T2->pi_blocked_on->lock) return; We don't have to follow up the chain at that point, because T2 propagated our priority up to T4 already. 
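The stored pointer is only ever compared, never dereferenced, which is what makes the check safe after the locks were dropped. A toy userspace model of the scenario above (plain structs standing in for tasks, waiters and rtmutexes; purely illustrative):

  #include <stdio.h>

  struct lock   { const char *name; };
  struct waiter { struct lock *lock; };
  struct task   { const char *name; struct waiter *pi_blocked_on; };

  static struct lock *task_blocked_on_lock(struct task *p)
  {
      return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
  }

  int main(void)
  {
      struct lock L0 = { "L0" }, L2 = { "L2" };
      struct waiter on_L2 = { &L2 }, on_L0 = { &L0 };
      struct task T2 = { "T2", &on_L2 };

      /* chain walk: remember where T2 was blocked before dropping the locks */
      struct lock *next_lock = task_blocked_on_lock(&T2);

      /* window with all locks dropped: T2 times out on L2 and blocks on L0 */
      T2.pi_blocked_on = &on_L0;

      /* after reacquiring the locks: the stored lock no longer matches */
      if (next_lock != task_blocked_on_lock(&T2))
          printf("%s moved from %s to %s: stop the walk, no false deadlock\n",
                 T2.name, next_lock->name, task_blocked_on_lock(&T2)->name);
      return 0;
  }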
[ Folded a cleanup patch from peterz ] Signed-off-by: Thomas Gleixner Reported-by: Brad Mouring Cc: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140605152801.930031935@linutronix.de Cc: stable@vger.kernel.org --- kernel/locking/rtmutex.c | 95 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index eb7a46327798..a8a83a22bb91 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -260,27 +260,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task) */ int max_lock_depth = 1024; +static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) +{ + return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; +} + /* * Adjust the priority chain. Also used for deadlock detection. * Decreases task's usage by one - may thus free the task. * - * @task: the task owning the mutex (owner) for which a chain walk is probably - * needed + * @task: the task owning the mutex (owner) for which a chain walk is + * probably needed * @deadlock_detect: do we have to carry out deadlock detection? - * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck - * things for a task that has just got its priority adjusted, and - * is waiting on a mutex) + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * things for a task that has just got its priority adjusted, and + * is waiting on a mutex) + * @next_lock: the mutex on which the owner of @orig_lock was blocked before + * we dropped its pi_lock. Is never dereferenced, only used for + * comparison to detect lock chain changes. * @orig_waiter: rt_mutex_waiter struct for the task that has just donated - * its priority to the mutex owner (can be NULL in the case - * depicted above or if the top waiter is gone away and we are - * actually deboosting the owner) - * @top_task: the current top waiter + * its priority to the mutex owner (can be NULL in the case + * depicted above or if the top waiter is gone away and we are + * actually deboosting the owner) + * @top_task: the current top waiter * * Returns 0 or -EDEADLK. */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int deadlock_detect, struct rt_mutex *orig_lock, + struct rt_mutex *next_lock, struct rt_mutex_waiter *orig_waiter, struct task_struct *top_task) { @@ -338,6 +347,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, if (orig_waiter && !rt_mutex_owner(orig_lock)) goto out_unlock_pi; + /* + * We dropped all locks after taking a refcount on @task, so + * the task might have moved on in the lock chain or even left + * the chain completely and blocks now on an unrelated lock or + * on @orig_lock. + * + * We stored the lock on which @task was blocked in @next_lock, + * so we can detect the chain change. + */ + if (next_lock != waiter->lock) + goto out_unlock_pi; + /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting @@ -422,11 +443,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } + /* + * Check whether the task which owns the current lock is pi + * blocked itself. If yes we store a pointer to the lock for + * the lock chain change detection above. After we dropped + * task->pi_lock next_lock cannot be dereferenced anymore. 
+ */ + next_lock = task_blocked_on_lock(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); raw_spin_unlock(&lock->wait_lock); + /* + * We reached the end of the lock chain. Stop right here. No + * point to go back just to figure that out. + */ + if (!next_lock) + goto out_put_task; + if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -536,8 +572,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; + struct rt_mutex *next_lock; int chain_walk = 0, res; + unsigned long flags; /* * Early deadlock detection. We really don't want the task to @@ -569,20 +606,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (!owner) return 0; + raw_spin_lock_irqsave(&owner->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { - raw_spin_lock_irqsave(&owner->pi_lock, flags); rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { chain_walk = 1; + } - if (!chain_walk) + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); + + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + /* + * Even if full deadlock detection is on, if the owner is not + * blocked itself, we can avoid finding this out in the chain + * walk. + */ + if (!chain_walk || !next_lock) return 0; /* @@ -594,8 +639,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - task); + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + next_lock, waiter, task); raw_spin_lock(&lock->wait_lock); @@ -644,8 +689,8 @@ static void remove_waiter(struct rt_mutex *lock, { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex *next_lock = NULL; unsigned long flags; - int chain_walk = 0; raw_spin_lock_irqsave(¤t->pi_lock, flags); rt_mutex_dequeue(lock, waiter); @@ -669,13 +714,13 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) - chain_walk = 1; + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } - if (!chain_walk) + if (!next_lock) return; /* gets dropped in rt_mutex_adjust_prio_chain()! */ @@ -683,7 +728,7 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); + rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); raw_spin_lock(&lock->wait_lock); } @@ -696,6 +741,7 @@ static void remove_waiter(struct rt_mutex *lock, void rt_mutex_adjust_pi(struct task_struct *task) { struct rt_mutex_waiter *waiter; + struct rt_mutex *next_lock; unsigned long flags; raw_spin_lock_irqsave(&task->pi_lock, flags); @@ -706,12 +752,13 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } - + next_lock = waiter->lock; raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* gets dropped in rt_mutex_adjust_prio_chain()! 
*/ get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); + + rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); } /** -- cgit v1.2.3 From f972eb63b1003fae68d7b7e9b674d4ba5db681c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 May 2014 15:13:47 -0400 Subject: perf: Pass protection and flags bits through mmap2 interface The mmap2 interface was missing the protection and flags bits needed to accurately determine if a mmap memory area was shared or private and if it was readable or not. Signed-off-by: Peter Zijlstra [tweaked patch to compile and wrote changelog] Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1400526833-141779-2-git-send-email-dzickus@redhat.com Signed-off-by: Jiri Olsa --- kernel/events/core.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7da5e561e89a..eea1955c7868 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "internal.h" @@ -5127,6 +5128,7 @@ struct perf_mmap_event { int maj, min; u64 ino; u64 ino_generation; + u32 prot, flags; struct { struct perf_event_header header; @@ -5168,6 +5170,8 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.header.size += sizeof(mmap_event->min); mmap_event->event_id.header.size += sizeof(mmap_event->ino); mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); + mmap_event->event_id.header.size += sizeof(mmap_event->prot); + mmap_event->event_id.header.size += sizeof(mmap_event->flags); } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); @@ -5186,6 +5190,8 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_put(&handle, mmap_event->min); perf_output_put(&handle, mmap_event->ino); perf_output_put(&handle, mmap_event->ino_generation); + perf_output_put(&handle, mmap_event->prot); + perf_output_put(&handle, mmap_event->flags); } __output_copy(&handle, mmap_event->file_name, @@ -5204,6 +5210,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) struct file *file = vma->vm_file; int maj = 0, min = 0; u64 ino = 0, gen = 0; + u32 prot = 0, flags = 0; unsigned int size; char tmp[16]; char *buf = NULL; @@ -5234,6 +5241,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); + + if (vma->vm_flags & VM_READ) + prot |= PROT_READ; + if (vma->vm_flags & VM_WRITE) + prot |= PROT_WRITE; + if (vma->vm_flags & VM_EXEC) + prot |= PROT_EXEC; + + if (vma->vm_flags & VM_MAYSHARE) + flags = MAP_SHARED; + else + flags = MAP_PRIVATE; + + if (vma->vm_flags & VM_DENYWRITE) + flags |= MAP_DENYWRITE; + if (vma->vm_flags & VM_MAYEXEC) + flags |= MAP_EXECUTABLE; + if (vma->vm_flags & VM_LOCKED) + flags |= MAP_LOCKED; + if (vma->vm_flags & VM_HUGETLB) + flags |= MAP_HUGETLB; + goto got_name; } else { name = (char *)arch_vma_name(vma); @@ -5274,6 +5303,8 @@ got_name: mmap_event->min = min; mmap_event->ino = ino; mmap_event->ino_generation = gen; + mmap_event->prot = prot; + mmap_event->flags = flags; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; @@ -5314,6 +5345,8 @@ void perf_event_mmap(struct vm_area_struct *vma) /* .min (attr_mmap2 only) */ /* .ino (attr_mmap2 only) */ /* .ino_generation (attr_mmap2 only) */ + /* .prot (attr_mmap2 only) */ + /* .flags (attr_mmap2 only) */ }; 
perf_event_mmap_event(&mmap_event); -- cgit v1.2.3 From a5a5ba72843dd05f991184d6cb9a4471acce1005 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 30 May 2014 10:49:42 -0400 Subject: Revert "perf: Disable PERF_RECORD_MMAP2 support" This reverts commit 3090ffb5a2515990182f3f55b0688a7817325488. Re-enable the mmap2 interface as we will have a user soon. Since things have changed since perf disabled mmap2, small tweaks to the revert had to be done: o commit 9d4ecc88 forced (n!=8) to become (n<7) o a new libunwind test needed updating to use mmap2 interface Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1401461382-209586-1-git-send-email-dzickus@redhat.com Signed-off-by: Jiri Olsa --- kernel/events/core.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index eea1955c7868..cd28335b3d28 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6929,10 +6929,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; - /* disabled for now */ - if (attr->mmap2) - return -EINVAL; - if (attr->__reserved_1) return -EINVAL; -- cgit v1.2.3 From 8b8b36834d0fff67fc8668093f4312dd04dcf21d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 10 Jun 2014 09:46:00 -0400 Subject: ring-buffer: Check if buffer exists before polling The per_cpu buffers are created one per possible CPU. But these do not mean that those CPUs are online, nor do they even exist. With the addition of the ring buffer polling, it assumes that the caller polls on an existing buffer. But this is not the case if the user reads trace_pipe from a CPU that does not exist, and this causes the kernel to crash. Simple fix is to check the cpu against buffer bitmask against to see if the buffer was allocated or not and return -ENODEV if it is not. More updates were done to pass the -ENODEV back up to userspace. Link: http://lkml.kernel.org/r/5393DB61.6060707@oracle.com Reported-by: Sasha Levin Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 5 ++++- kernel/trace/trace.c | 22 ++++++++++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c634868c2921..7c56c3d06943 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. 
*/ -void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); @@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu) if (cpu == RING_BUFFER_ALL_CPUS) work = &buffer->irq_work; else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -ENODEV; cpu_buffer = buffer->buffers[cpu]; work = &cpu_buffer->irq_work; } @@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu) schedule(); finish_wait(&work->waiters, &wait); + return 0; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 16f7038d1f4d..56422f1decba 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1085,13 +1085,13 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static void wait_on_pipe(struct trace_iterator *iter) +static int wait_on_pipe(struct trace_iterator *iter) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) - return; + return 0; - ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -4378,6 +4378,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; + int ret; while (trace_empty(iter)) { @@ -4399,10 +4400,13 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - wait_on_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&iter->mutex); + if (ret) + return ret; + if (signal_pending(current)) return -EINTR; } @@ -5327,8 +5331,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - wait_on_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&trace_types_lock); + if (ret) { + size = ret; + goto out_unlock; + } if (signal_pending(current)) { size = -EINTR; goto out_unlock; @@ -5538,8 +5546,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, goto out; } mutex_unlock(&trace_types_lock); - wait_on_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&trace_types_lock); + if (ret) + goto out; if (signal_pending(current)) { ret = -EINTR; goto out; -- cgit v1.2.3 From a6af8fbf17989e41fef5cacf3988a724fb687d78 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 10 Jun 2014 16:11:35 +0900 Subject: tracing: Cleanup saved_cmdlines_size changes The recent addition of saved_cmdlines_size file had some remaining (minor - mostly coding style) issues. Fix them by passing pointer name to sizeof() and using scnprintf(). 
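Both idioms are easy to demonstrate outside the kernel. A minimal userspace sketch (demo_buffer is a stand-in for the real structure, and snprintf plays the role of scnprintf here; the kernel helper differs only in returning the bytes actually written rather than the would-be length):

  #include <stdio.h>
  #include <stdlib.h>

  struct demo_buffer {
      unsigned int cmdline_num;
  };

  int main(void)
  {
      /* sizeof(*s) stays correct even if the struct type is later renamed */
      struct demo_buffer *s = malloc(sizeof(*s));
      char buf[64];
      int r;

      if (!s)
          return 1;
      s->cmdline_num = 128;

      /* bounded formatting: never writes past sizeof(buf) */
      r = snprintf(buf, sizeof(buf), "%u\n", s->cmdline_num);
      printf("formatted %d byte(s): %s", r, buf);

      free(s);
      return 0;
  }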
Link: http://lkml.kernel.org/p/1402384295-23680-1-git-send-email-namhyung@kernel.org Cc: Namhyung Kim Cc: Ingo Molnar Cc: Yoshihiro YUNOMAE Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 56422f1decba..2b458c60e0da 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1338,7 +1338,7 @@ static int trace_create_savedcmd(void) { int ret; - savedcmd = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); + savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); if (!savedcmd) return -ENOMEM; @@ -3840,7 +3840,7 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, int r; arch_spin_lock(&trace_cmdline_lock); - r = sprintf(buf, "%u\n", savedcmd->cmdline_num); + r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); arch_spin_unlock(&trace_cmdline_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3857,7 +3857,7 @@ static int tracing_resize_saved_cmdlines(unsigned int val) { struct saved_cmdlines_buffer *s, *savedcmd_temp; - s = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); + s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; -- cgit v1.2.3 From a3c54931199565930d6d84f4c3456f6440aefd41 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 28 May 2014 23:09:58 -0400 Subject: auditsc: audit_krule mask accesses need bounds checking Fixes an easy DoS and possible information disclosure. This does nothing about the broken state of x32 auditing. eparis: If the admin has enabled auditd and has specifically loaded audit rules. This bug has been around since before git. Wow... Cc: stable@vger.kernel.org Signed-off-by: Andy Lutomirski Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f251a5e8d17a..21eae3c05ec0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) return AUDIT_BUILD_CONTEXT; } +static int audit_in_mask(const struct audit_krule *rule, unsigned long val) +{ + int word, bit; + + if (val > 0xffffffff) + return false; + + word = AUDIT_WORD(val); + if (word >= AUDIT_BITMASK_SIZE) + return false; + + bit = AUDIT_BIT(val); + + return rule->mask[word] & bit; +} + /* At syscall entry and exit time, this filter is called if the * audit_state is not low enough that auditing cannot take place, but is * also not high enough that we already know we have to write an audit @@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, rcu_read_lock(); if (!list_empty(list)) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && + if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, NULL, &state, false)) { rcu_read_unlock(); @@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, struct audit_context *ctx) { - int word, bit; int h = audit_hash_ino((u32)n->ino); struct list_head *list = &audit_inode_hash[h]; struct audit_entry *e; enum audit_state state; - word = AUDIT_WORD(ctx->major); - bit = 
AUDIT_BIT(ctx->major); - if (list_empty(list)) return 0; list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && + if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { ctx->current_state = state; return 1; -- cgit v1.2.3 From f0b70cc48cc282cb326a4d71b3d1dda7d8fafd2a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 10 Jun 2014 12:06:30 -0400 Subject: tracing: Fix leak of per cpu max data in instances The freeing of an instance, if max data is configured, there will be per cpu data structures created. But these are not freed when the instance is deleted, which causes a memory leak. A new helper function is added that frees the individual buffers within a trace array, instead of duplicating the code. This way changes made for one are applied to the other (normal buffer vs max buffer). Link: http://lkml.kernel.org/r/87k38pbake.fsf@sejong.aot.lge.com Reported-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2b458c60e0da..384ede311717 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6242,22 +6242,25 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } +static void free_trace_buffer(struct trace_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} + static void free_trace_buffers(struct trace_array *tr) { if (!tr) return; - if (tr->trace_buffer.buffer) { - ring_buffer_free(tr->trace_buffer.buffer); - tr->trace_buffer.buffer = NULL; - free_percpu(tr->trace_buffer.data); - } + free_trace_buffer(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (tr->max_buffer.buffer) { - ring_buffer_free(tr->max_buffer.buffer); - tr->max_buffer.buffer = NULL; - } + free_trace_buffer(&tr->max_buffer); #endif } -- cgit v1.2.3 From da9c3413a27be5ba6f996e90495c836dd30b8841 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 10 Jun 2014 13:53:50 -0400 Subject: tracing: Fix check of ftrace_trace_arrays list_empty() check The check that tests if ftrace_trace_arrays is empty in top_trace_array(), uses the .prev pointer: if (list_empty(ftrace_trace_arrays.prev)) instead of testing the variable itself: if (list_empty(&ftrace_trace_arrays)) Although it is technically correct, it is awkward and confusing. Use the proper method. Link: http://lkml.kernel.org/r/87oay1bas8.fsf@sejong.aot.lge.com Reported-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9e82551dd566..9258f5a815db 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -252,7 +252,7 @@ static inline struct trace_array *top_trace_array(void) { struct trace_array *tr; - if (list_empty(ftrace_trace_arrays.prev)) + if (list_empty(&ftrace_trace_arrays)) return NULL; tr = list_entry(ftrace_trace_arrays.prev, -- cgit v1.2.3 From 23adbe12ef7d3d4195e80800ab36b37bee28cd03 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 10 Jun 2014 12:45:42 -0700 Subject: fs,userns: Change inode_capable to capable_wrt_inode_uidgid The kernel has no concept of capabilities with respect to inodes; inodes exist independently of namespaces. 
For example, inode_capable(inode, CAP_LINUX_IMMUTABLE) would be nonsense. This patch changes inode_capable to check for uid and gid mappings and renames it to capable_wrt_inode_uidgid, which should make it more obvious what it does. Fixes CVE-2014-4014. Cc: Theodore Ts'o Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Dave Chinner Cc: stable@vger.kernel.org Signed-off-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- kernel/capability.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 84b2bbf443e7..a5cf13c018ce 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -424,23 +424,19 @@ bool capable(int cap) EXPORT_SYMBOL(capable); /** - * inode_capable - Check superior capability over inode + * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question * - * Return true if the current task has the given superior capability - * targeted at it's own user namespace and that the given inode is owned - * by the current user namespace or a child namespace. - * - * Currently we check to see if an inode is owned by the current - * user namespace by seeing if the inode's owner maps into the - * current user namespace. - * + * Return true if the current task has the given capability targeted at + * its own user namespace and that the given inode's uid and gid are + * mapped into the current user namespace. */ -bool inode_capable(const struct inode *inode, int cap) +bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); + return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); } -EXPORT_SYMBOL(inode_capable); +EXPORT_SYMBOL(capable_wrt_inode_uidgid); -- cgit v1.2.3 From a992bf836f9c3039a16f4bd068d161c86c6c3e2c Mon Sep 17 00:00:00 2001 From: Yuan Pengfei Date: Tue, 10 Jun 2014 15:18:39 -0700 Subject: gcov: add support for GCC 4.9 This patch handles the gcov-related changes in GCC 4.9: A new counter (time profile) is added. The total number is 9 now. A new profile merge function __gcov_merge_time_profile is added. See gcc/gcov-io.h and libgcc/libgcov-merge.c For the first change, the layout of struct gcov_info is affected. For the second one, a dummy function is added to kernel/gcov/base.c similarly. Signed-off-by: Yuan Pengfei Acked-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/base.c | 6 ++++++ kernel/gcov/gcc_4_7.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index f45b75b713c0..b358a802fd18 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_ior); +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. 
*/ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 2c6e4631c814..826ba9fb5e32 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,12 @@ #include #include "gcov.h" +#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9 +#define GCOV_COUNTERS 9 +#else #define GCOV_COUNTERS 8 +#endif + #define GCOV_TAG_FUNCTION_LENGTH 3 static struct gcov_info *gcov_info_head; -- cgit v1.2.3 From 4cdf77a828b056258f48a9f6078bd2f77d9704bb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 14 Jun 2014 06:47:12 +0000 Subject: x86/kprobes: Fix build errors and blacklist context_track_user This essentially reverts commit: ecd50f714c42 ("kprobes, x86: Call exception_enter after kprobes handled") since it causes build errors with CONFIG_CONTEXT_TRACKING and that has been made from misunderstandings; context_track_user_*() don't involve much in interrupt context, it just returns if in_interrupt() is true. Instead of changing the do_debug/int3(), this just adds context_track_user_*() to kprobes blacklist, since those are still can be called right before kprobes handles int3 and debug exceptions, and probing those will cause an infinite loop. Reported-by: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Cc: Borislav Petkov Cc: Kees Cook Cc: Jiri Kosina Cc: Rusty Russell Cc: Steven Rostedt Cc: Seiji Aguchi Cc: Andrew Morton Cc: Kees Cook Link: http://lkml.kernel.org/r/20140614064711.7865.45957.stgit@kbuild-fedora.novalocal Signed-off-by: Ingo Molnar --- kernel/context_tracking.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 019d45008448..5664985c46a0 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -19,6 +19,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -104,6 +105,7 @@ void context_tracking_user_enter(void) } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_user_enter); #ifdef CONFIG_PREEMPT /** @@ -181,6 +183,7 @@ void context_tracking_user_exit(void) } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_user_exit); /** * __context_tracking_task_switch - context switch the syscall callbacks -- cgit v1.2.3 From 27e35715df54cbc4f2d044f681802ae30479e7fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Jun 2014 18:44:04 +0000 Subject: rtmutex: Plug slow unlock race When the rtmutex fast path is enabled the slow unlock function can create the following situation: spin_lock(foo->m->wait_lock); foo->m->owner = NULL; rt_mutex_lock(foo->m); <-- fast path free = atomic_dec_and_test(foo->refcnt); rt_mutex_unlock(foo->m); <-- fast path if (free) kfree(foo); spin_unlock(foo->m->wait_lock); <--- Use after free. 
Plug the race by changing the slow unlock to the following scheme: while (!rt_mutex_has_waiters(m)) { /* Clear the waiters bit in m->owner */ clear_rt_mutex_waiters(m); owner = rt_mutex_owner(m); spin_unlock(m->wait_lock); if (cmpxchg(m->owner, owner, 0) == owner) return; spin_lock(m->wait_lock); } So in case of a new waiter incoming while the owner tries the slow path unlock we have two situations: unlock(wait_lock); lock(wait_lock); cmpxchg(p, owner, 0) == owner mark_rt_mutex_waiters(lock); acquire(lock); Or: unlock(wait_lock); lock(wait_lock); mark_rt_mutex_waiters(lock); cmpxchg(p, owner, 0) != owner enqueue_waiter(); unlock(wait_lock); lock(wait_lock); wakeup_next waiter(); unlock(wait_lock); lock(wait_lock); acquire(lock); If the fast path is disabled, then the simple m->owner = NULL; unlock(m->wait_lock); is sufficient as all access to m->owner is serialized via m->wait_lock; Also document and clarify the wakeup_next_waiter function as suggested by Oleg Nesterov. Reported-by: Steven Rostedt Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140611183852.937945560@linutronix.de Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 115 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a8a83a22bb91..fc605941b9b8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) owner = *p; } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); } + +/* + * Safe fastpath aware unlock: + * 1) Clear the waiters bit + * 2) Drop lock->wait_lock + * 3) Try to unlock the lock with cmpxchg + */ +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) + __releases(lock->wait_lock) +{ + struct task_struct *owner = rt_mutex_owner(lock); + + clear_rt_mutex_waiters(lock); + raw_spin_unlock(&lock->wait_lock); + /* + * If a new waiter comes in between the unlock and the cmpxchg + * we have two situations: + * + * unlock(wait_lock); + * lock(wait_lock); + * cmpxchg(p, owner, 0) == owner + * mark_rt_mutex_waiters(lock); + * acquire(lock); + * or: + * + * unlock(wait_lock); + * lock(wait_lock); + * mark_rt_mutex_waiters(lock); + * + * cmpxchg(p, owner, 0) != owner + * enqueue_waiter(); + * unlock(wait_lock); + * lock(wait_lock); + * wake waiter(); + * unlock(wait_lock); + * lock(wait_lock); + * acquire(lock); + */ + return rt_mutex_cmpxchg(lock, owner, NULL); +} + #else # define rt_mutex_cmpxchg(l,c,n) (0) static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) @@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); } + +/* + * Simple slow path only version: lock->owner is protected by lock->wait_lock. + */ +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) + __releases(lock->wait_lock) +{ + lock->owner = NULL; + raw_spin_unlock(&lock->wait_lock); + return true; +} #endif static inline int @@ -650,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and wake it up. + * Remove the top waiter from the current tasks pi waiter list and + * wake it up. * * Called with lock->wait_lock held. 
*/ @@ -671,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock) */ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_set_owner(lock, NULL); + /* + * As we are waking up the top waiter, and the waiter stays + * queued on the lock until it gets the lock, this lock + * obviously has waiters. Just set the bit here and this has + * the added benefit of forcing all new tasks into the + * slow path making sure no task of lower priority than + * the top waiter can steal this lock. + */ + lock->owner = (void *) RT_MUTEX_HAS_WAITERS; raw_spin_unlock_irqrestore(&current->pi_lock, flags); + /* + * It's safe to dereference waiter as it cannot go away as + * long as we hold lock->wait_lock. The waiter task needs to + * acquire it in order to dequeue the waiter. + */ wake_up_process(waiter->task); } @@ -928,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock) rt_mutex_deadlock_account_unlock(current); - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); - return; + /* + * We must be careful here if the fast path is enabled. If we + * have no waiters queued we cannot set owner to NULL here + * because of: + * + * foo->lock->owner = NULL; + * rtmutex_lock(foo->lock); <- fast path + * free = atomic_dec_and_test(foo->refcnt); + * rtmutex_unlock(foo->lock); <- fast path + * if (free) + * kfree(foo); + * raw_spin_unlock(foo->lock->wait_lock); + * + * So for the fastpath enabled kernel: + * + * Nothing can set the waiters bit as long as we hold + * lock->wait_lock. So we do the following sequence: + * + * owner = rt_mutex_owner(lock); + * clear_rt_mutex_waiters(lock); + * raw_spin_unlock(&lock->wait_lock); + * if (cmpxchg(&lock->owner, owner, 0) == owner) + * return; + * goto retry; + * + * The fastpath disabled variant is simple as all access to + * lock->owner is serialized by lock->wait_lock: + * + * lock->owner = NULL; + * raw_spin_unlock(&lock->wait_lock); + */ while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ if (unlock_rt_mutex_safe(lock) == true) return; /* Relock the rtmutex and try again */ raw_spin_lock(&lock->wait_lock); } + /* + * The wakeup next waiter path does not suffer from the above + * race. See the comments there. + */ wakeup_next_waiter(lock); raw_spin_unlock(&lock->wait_lock); -- cgit v1.2.3 From a6e15a39048ec3229b9a53425f4384f55f6cc1b3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 13 Jun 2014 13:30:35 -0700 Subject: PM / hibernate: introduce "nohibernate" boot parameter To support using kernel features that are not compatible with hibernation, this creates the "nohibernate" kernel boot parameter to disable both hibernation and resume. This allows hibernation support to be a boot-time choice instead of only a compile-time choice. Signed-off-by: Kees Cook Acked-by: Pavel Machek Signed-off-by: Rafael J.
Wysocki --- kernel/power/hibernate.c | 31 ++++++++++++++++++++++++++++++- kernel/power/main.c | 6 ++---- kernel/power/user.c | 3 +++ 3 files changed, 35 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 49e0a20fd010..258f492f0347 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -35,6 +35,7 @@ static int nocompress; static int noresume; +static int nohibernate; static int resume_wait; static unsigned int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; @@ -62,6 +63,11 @@ bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; +bool hibernation_available(void) +{ + return (nohibernate == 0); +} + /** * hibernation_set_ops - Set the global hibernate operations. * @ops: Hibernation operations to use in subsequent hibernation transitions. @@ -642,6 +648,11 @@ int hibernate(void) { int error; + if (!hibernation_available()) { + pr_debug("PM: Hibernation not available.\n"); + return -EPERM; + } + lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { @@ -734,7 +745,7 @@ static int software_resume(void) /* * If the user said "noresume".. bail out early. */ - if (noresume) + if (noresume || !hibernation_available()) return 0; /* @@ -900,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, int i; char *start = buf; + if (!hibernation_available()) + return sprintf(buf, "[disabled]\n"); + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (!hibernation_modes[i]) continue; @@ -934,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, char *p; int mode = HIBERNATION_INVALID; + if (!hibernation_available()) + return -EPERM; + p = memchr(buf, '\n', n); len = p ? 
p - buf : n; @@ -1101,6 +1118,10 @@ static int __init hibernate_setup(char *str) noresume = 1; else if (!strncmp(str, "nocompress", 10)) nocompress = 1; + else if (!strncmp(str, "no", 2)) { + noresume = 1; + nohibernate = 1; + } return 1; } @@ -1125,9 +1146,17 @@ static int __init resumedelay_setup(char *str) return 1; } +static int __init nohibernate_setup(char *str) +{ + noresume = 1; + nohibernate = 1; + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); +__setup("nohibernate", nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 573410d6647e..8e90f330f139 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -300,13 +300,11 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, s += sprintf(s,"%s ", pm_states[i].label); #endif -#ifdef CONFIG_HIBERNATION - s += sprintf(s, "%s\n", "disk"); -#else + if (hibernation_available()) + s += sprintf(s, "disk "); if (s != buf) /* convert the last space to a newline */ *(s-1) = '\n'; -#endif return (s - buf); } diff --git a/kernel/power/user.c b/kernel/power/user.c index 98d357584cd6..526e8911460a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; + if (!hibernation_available()) + return -EPERM; + lock_system_sleep(); if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { -- cgit v1.2.3 From 24f2e0273f80ec262a772059e140a0adef35296d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 13 Jun 2014 13:30:36 -0700 Subject: x86, kaslr: boot-time selectable with hibernation Changes kASLR from being compile-time selectable (blocked by CONFIG_HIBERNATION), to being boot-time selectable (with hibernation available by default) via the "kaslr" kernel command line. Signed-off-by: Kees Cook Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 258f492f0347..fcc2611d3f14 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1153,6 +1153,11 @@ static int __init nohibernate_setup(char *str) return 1; } +static int __init kaslr_nohibernate_setup(char *str) +{ + return nohibernate_setup(str); +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1160,3 +1165,4 @@ __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); +__setup("kaslr", kaslr_nohibernate_setup); -- cgit v1.2.3 From 99bae5f94185c2cc65701e95c54e31e2f4345b88 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 12 Jun 2014 14:31:31 +0800 Subject: cgroup: fix broken css_has_online_children() After running: # mount -t cgroup cpu xxx /cgroup && mkdir /cgroup/sub && \ rmdir /cgroup/sub && umount /cgroup I found the cgroup root still existed: # cat /proc/cgroups #subsys_name hierarchy num_cgroups enabled cpuset 0 1 1 cpu 1 1 1 ... It turned out css_has_online_children() is broken. 
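A hedged user-space sketch of the bug class may make the one-character fix below easier to follow: inside a children iterator, the test has to look at the per-iteration child, not the css being iterated over. The struct, flag and helper names here are illustrative stand-ins, not the kernel's cgroup types.

#include <stdbool.h>
#include <stdio.h>

#define FLAG_ONLINE 0x1u

struct node {
        unsigned int flags;
        struct node *children;          /* first child */
        struct node *next;              /* next sibling */
};

/*
 * True only if some child carries FLAG_ONLINE.  Testing parent->flags
 * inside the loop would reproduce the reported bug: the result would
 * track the parent instead of its children.
 */
static bool has_online_children(const struct node *parent)
{
        const struct node *child;

        for (child = parent->children; child; child = child->next)
                if (child->flags & FLAG_ONLINE)
                        return true;
        return false;
}

int main(void)
{
        struct node child = { .flags = 0 };
        struct node parent = { .flags = FLAG_ONLINE, .children = &child };

        printf("%d\n", has_online_children(&parent));   /* prints 0 */
        return 0;
}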
Signed-off-by: Li Zefan Sigend-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7868fc3c0bc5..d9a8be911f5b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3328,7 +3328,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css) rcu_read_lock(); css_for_each_child(child, css) { - if (css->flags & CSS_ONLINE) { + if (child->flags & CSS_ONLINE) { ret = true; break; } -- cgit v1.2.3 From 4af4206be2bd1933cae20c2b6fb2058dbc887f7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Apr 2014 20:58:54 +0200 Subject: tracing: Fix syscall_*regfunc() vs copy_process() race syscall_regfunc() and syscall_unregfunc() should set/clear TIF_SYSCALL_TRACEPOINT system-wide, but do_each_thread() can race with copy_process() and miss the new child which was not added to the process/thread lists yet. Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT under tasklist. Link: http://lkml.kernel.org/p/20140413185854.GB20668@redhat.com Cc: stable@vger.kernel.org # 2.6.33 Fixes: a871bd33a6c0 "tracing: Add syscall tracepoints" Acked-by: Frederic Weisbecker Acked-by: Paul E. McKenney Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d2799d1fc952..6a13c46cd87d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(&current->sighand->siglock); + syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) -- cgit v1.2.3 From 8063e41d2ffc0b0ce974ea802158be35902072f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Apr 2014 20:59:18 +0200 Subject: tracing: Change syscall_*regfunc() to check PF_KTHREAD and use for_each_process_thread() 1. Remove _irqsafe from syscall_regfunc/syscall_unregfunc, read_lock(tasklist) doesn't need to disable irqs. 2. Change this code to avoid the deprecated do_each_thread() and use for_each_process_thread() (stolen from the patch from Frederic). 3. Change syscall_regfunc() to check PF_KTHREAD to skip the kernel threads, ->mm != NULL is the common mistake. Note: probably this check should be simply removed, needs another patch. [fweisbec@gmail.com: s/do_each_thread/for_each_process_thread/] Link: http://lkml.kernel.org/p/20140413185918.GC20668@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 33cbd8c203f8..9cf12640de5a 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -492,33 +492,31 @@ static int sys_tracepoint_refcount; void syscall_regfunc(void) { - unsigned long flags; - struct task_struct *g, *t; + struct task_struct *p, *t; if (!sys_tracepoint_refcount) { - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { /* Skip kernel threads.
*/ - if (t->mm) + if (!(t->flags & PF_KTHREAD)) set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); - read_unlock_irqrestore(&tasklist_lock, flags); + } + read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; } void syscall_unregfunc(void) { - unsigned long flags; - struct task_struct *g, *t; + struct task_struct *p, *t; sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); - read_unlock_irqrestore(&tasklist_lock, flags); + } + read_unlock(&tasklist_lock); } } #endif -- cgit v1.2.3 From ea73c79e33c45e1fa0071e216f06fd5682314490 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Apr 2014 20:59:38 +0200 Subject: tracing: syscall_regfunc() should not skip kernel threads syscall_regfunc() ignores the kernel threads because "it has no effect", see cc3b13c1 "Don't trace kernel thread syscalls" which added this check. However, this means that a user-space task spawned by call_usermodehelper() will run without TIF_SYSCALL_TRACEPOINT if sys_tracepoint_refcount != 0. Remove this check. The unnecessary report from ret_from_fork path mentioned by cc3b13c1 is no longer possible, see See commit fb45550d76bb5 "make sure that kernel_thread() callbacks call do_exit() themselves". A kernel_thread() callback can only return and take the int_ret_from_sys_call path after do_execve() succeeds, otherwise the kernel will crash. But in this case it is no longer a kernel thread and thus is needs TIF_SYSCALL_TRACEPOINT. Link: http://lkml.kernel.org/p/20140413185938.GD20668@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9cf12640de5a..3490407dc7b7 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -497,9 +497,7 @@ void syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { - /* Skip kernel threads. */ - if (!(t->flags & PF_KTHREAD)) - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } -- cgit v1.2.3 From bddbceb688c6d0decaabc7884fede319d02f96c8 Mon Sep 17 00:00:00 2001 From: Maxime Bizon Date: Mon, 23 Jun 2014 16:35:35 +0200 Subject: workqueue: fix dev_set_uevent_suppress() imbalance Uevents are suppressed during attributes registration, but never restored, so kobject_uevent() does nothing. Signed-off-by: Maxime Bizon Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Fixes: 226223ab3c4118ddd10688cc2c131135848371ab --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6203d2900877..6f5f9c7323f4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3284,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) } } + dev_set_uevent_suppress(&wq_dev->dev, false); kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); return 0; } -- cgit v1.2.3 From 8d056c48e486249e6487910b83e0f3be7c14acf7 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. 
Bhat" Date: Mon, 23 Jun 2014 13:22:02 -0700 Subject: CPU hotplug, smp: flush any pending IPI callbacks before CPU offline There is a race between the CPU offline code (within stop-machine) and the smp-call-function code, which can lead to getting IPIs on the outgoing CPU, *after* it has gone offline. Specifically, this can happen when using smp_call_function_single_async() to send the IPI, since this API allows sending asynchronous IPIs from IRQ disabled contexts. The exact race condition is described below. During CPU offline, in stop-machine, we don't enforce any rule in the _DISABLE_IRQ stage, regarding the order in which the outgoing CPU and the other CPUs disable their local interrupts. Due to this, we can encounter a situation in which an IPI is sent by one of the other CPUs to the outgoing CPU (while it is *still* online), but the outgoing CPU ends up noticing it only *after* it has gone offline. CPU 1 CPU 2 (Online CPU) (CPU going offline) Enter _PREPARE stage Enter _PREPARE stage Enter _DISABLE_IRQ stage = Got a device interrupt, and | Didn't notice the IPI the interrupt handler sent an | since interrupts were IPI to CPU 2 using | disabled on this CPU. smp_call_function_single_async() | = Enter _DISABLE_IRQ stage Enter _RUN stage Enter _RUN stage = Busy loop with interrupts | Invoke take_cpu_down() disabled. | and take CPU 2 offline = Enter _EXIT stage Enter _EXIT stage Re-enable interrupts Re-enable interrupts The pending IPI is noted immediately, but alas, the CPU is offline at this point. This of course, makes the smp-call-function IPI handler code running on CPU 2 unhappy and it complains about "receiving an IPI on an offline CPU". One real example of the scenario on CPU 1 is the block layer's complete-request call-path: __blk_complete_request() [interrupt-handler] raise_blk_irq() smp_call_function_single_async() However, if we look closely, the block layer does check that the target CPU is online before firing the IPI. So in this case, it is actually the unfortunate ordering/timing of events in the stop-machine phase that leads to receiving IPIs after the target CPU has gone offline. In reality, getting a late IPI on an offline CPU is not too bad by itself (this can happen even due to hardware latencies in IPI send-receive). It is a bug only if the target CPU really went offline without executing all the callbacks queued on its list. (Note that a CPU is free to execute its pending smp-call-function callbacks in a batch, without waiting for the corresponding IPIs to arrive for each one of those callbacks). So, fixing this issue can be broken up into two parts: 1. Ensure that a CPU goes offline only after executing all the callbacks queued on it. 2. Modify the warning condition in the smp-call-function IPI handler code such that it warns only if an offline CPU got an IPI *and* that CPU had gone offline with callbacks still pending in its queue. Achieving part 1 is straight-forward - just flush (execute) all the queued callbacks on the outgoing CPU in the CPU_DYING stage[1], including those callbacks for which the source CPU's IPIs might not have been received on the outgoing CPU yet. Once we do this, an IPI that arrives late on the CPU going offline (either due to the race mentioned above, or due to hardware latencies) will be completely harmless, since the outgoing CPU would have executed all the queued callbacks before going offline. 
Overall, this fix (parts 1 and 2 put together) additionally guarantees that we will see a warning only when the *IPI-sender code* is buggy - that is, if it queues the callback _after_ the target CPU has gone offline. [1]. The CPU_DYING part needs a little more explanation: by the time we execute the CPU_DYING notifier callbacks, the CPU would have already been marked offline. But we want to flush out the pending callbacks at this stage, ignoring the fact that the CPU is offline. So restructure the IPI handler code so that we can by-pass the "is-cpu-offline?" check in this particular case. (Of course, the right solution here is to fix CPU hotplug to mark the CPU offline _after_ invoking the CPU_DYING notifiers, but this requires a lot of audit to ensure that this change doesn't break any existing code; hence lets go with the solution proposed above until that is done). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Srivatsa S. Bhat Suggested-by: Frederic Weisbecker Cc: "Paul E. McKenney" Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Frederic Weisbecker Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Rusty Russell Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleixner Tested-by: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 306f8180b0d5..80c33f8de14f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); +static void flush_smp_call_function_queue(bool warn_cpu_offline); + static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: + /* Fall-through to the CPU_DEAD[_FROZEN] case. */ case CPU_DEAD: case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); free_percpu(cfd->csd); break; + + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * The IPIs for the smp-call-function callbacks queued by other + * CPUs might arrive late, either due to hardware latencies or + * because this CPU disabled interrupts (inside stop-machine) + * before the IPIs were sent. So flush out any pending callbacks + * explicitly (without waiting for the IPIs to arrive), to + * ensure that the outgoing CPU doesn't go offline with work + * still pending. + */ + flush_smp_call_function_queue(false); + break; #endif }; @@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, return 0; } -/* - * Invoked by arch to handle an IPI for call function single. Must be - * called from the arch with interrupts disabled. +/** + * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks + * + * Invoked by arch to handle an IPI for call function single. + * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { + flush_smp_call_function_queue(true); +} + +/** + * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * + * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an + * offline CPU. 
Skip this check if set to 'false'. + * + * Flush any pending smp-call-function callbacks queued on this CPU. This is + * invoked by the generic IPI handler, as well as by a CPU about to go offline, + * to ensure that all pending IPI callbacks are run before it goes completely + * offline. + * + * Loop through the call_single_queue and run all the queued callbacks. + * Must be called with interrupts disabled. + */ +static void flush_smp_call_function_queue(bool warn_cpu_offline) +{ + struct llist_head *head; struct llist_node *entry; struct call_single_data *csd, *csd_next; static bool warned; - entry = llist_del_all(&__get_cpu_var(call_single_queue)); + WARN_ON(!irqs_disabled()); + + head = &__get_cpu_var(call_single_queue); + entry = llist_del_all(head); entry = llist_reverse_order(entry); - /* - * Shouldn't receive this interrupt on a cpu that is not yet online. - */ - if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { + /* There shouldn't be any pending callbacks on an offline CPU. */ + if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && + !warned && !llist_empty(head))) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); -- cgit v1.2.3 From b3acc56bfe1287c6b666e80edc70b89eea2a1a80 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: kexec: save PG_head_mask in VMCOREINFO To allow filtering of huge pages, makedumpfile must be able to identify them in the dump. This can be done by checking the appropriate page flag, so communicate its value to makedumpfile through the VMCOREINFO interface. There's only one small catch. Depending on how many page flags are available on a given architecture, this bit can be called PG_head or PG_compound. I sent a similar patch back in 2012, but Eric Biederman did not like using an #ifdef. So, this time I'm adding a common symbol (PG_head_mask) instead. See https://lkml.org/lkml/2012/11/28/91 for the previous version. 
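As a hedged aside on why a mask is exported rather than a bit number: a consumer that only sees the raw flags word plus one exported constant never needs to know whether the flag is called PG_head or PG_compound on a given configuration. The enum layout and helper below are invented for illustration and are not the kernel's definitions.

#include <stdio.h>

/* Illustrative flag layout only; real bit positions vary by architecture/config. */
enum pageflags { PG_locked, PG_referenced, PG_uptodate, PG_head };
#define PG_HEAD_MASK (1UL << PG_head)

/*
 * A dump filter that only sees a raw flags word plus one exported mask can
 * recognise head pages without knowing the flag's name or bit position.
 */
static int is_head_page(unsigned long flags, unsigned long exported_mask)
{
        return (flags & exported_mask) != 0;
}

int main(void)
{
        unsigned long flags = (1UL << PG_uptodate) | PG_HEAD_MASK;

        printf("head page: %d\n", is_head_page(flags, PG_HEAD_MASK));  /* 1 */
        return 0;
}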
Signed-off-by: Petr Tesarik Acked-by: Vivek Goyal Cc: Eric Biederman Cc: Paul Mackerras Cc: Fengguang Wu Cc: Benjamin Herrenschmidt Cc: Shaohua Li Cc: Alexey Kardashevskiy Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 6748688813d0..369f41a94124 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif + VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); arch_crash_save_vmcoreinfo(); -- cgit v1.2.3 From bde92cf455a03a91badb7046855592d8c008e929 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: kernel/watchdog.c: remove preemption restrictions when restarting lockup detector Peter Wu noticed the following splat on his machine when updating /proc/sys/kernel/watchdog_thresh: BUG: sleeping function called from invalid context at mm/slub.c:965 in_atomic(): 1, irqs_disabled(): 0, pid: 1, name: init 3 locks held by init/1: #0: (sb_writers#3){.+.+.+}, at: [] vfs_write+0x143/0x180 #1: (watchdog_proc_mutex){+.+.+.}, at: [] proc_dowatchdog+0x33/0x110 #2: (cpu_hotplug.lock){.+.+.+}, at: [] get_online_cpus+0x32/0x80 Preemption disabled at:[] proc_dowatchdog+0xe4/0x110 CPU: 0 PID: 1 Comm: init Not tainted 3.16.0-rc1-testing #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x4e/0x7a __might_sleep+0x11d/0x190 kmem_cache_alloc_trace+0x4e/0x1e0 perf_event_alloc+0x55/0x440 perf_event_create_kernel_counter+0x26/0xe0 watchdog_nmi_enable+0x75/0x140 update_timers_all_cpus+0x53/0xa0 proc_dowatchdog+0xe4/0x110 proc_sys_call_handler+0xb3/0xc0 proc_sys_write+0x14/0x20 vfs_write+0xad/0x180 SyS_write+0x49/0xb0 system_call_fastpath+0x16/0x1b NMI watchdog: disabled (cpu0): hardware events not enabled What happened is after updating the watchdog_thresh, the lockup detector is restarted to utilize the new value. Part of this process involved disabling preemption. Once preemption was disabled, perf tried to allocate a new event (as part of the restart). This caused the above BUG_ON as you can't sleep with preemption disabled. The preemption restriction seemed agressive as we are not doing anything on that particular cpu, but with all the online cpus (which are protected by the get_online_cpus lock). Remove the restriction and the BUG_ON goes away. 
Signed-off-by: Don Zickus Acked-by: Michal Hocko Reported-by: Peter Wu Tested-by: Peter Wu Acked-by: David Rientjes Cc: [3.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 516203e665fc..30e482240dae 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -527,10 +527,8 @@ static void update_timers_all_cpus(void) int cpu; get_online_cpus(); - preempt_disable(); for_each_online_cpu(cpu) update_timers(cpu); - preempt_enable(); put_online_cpus(); } -- cgit v1.2.3 From 7cd2b0a34ab8e4db971920eef8982f985441adfb Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 23 Jun 2014 13:22:04 -0700 Subject: mm, pcp: allow restoring percpu_pagelist_fraction default Oleg reports a division by zero error on zero-length write() to the percpu_pagelist_fraction sysctl: divide error: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000 RIP: 0010: percpu_pagelist_fraction_sysctl_handler+0x84/0x120 RSP: 0018:ffff8800d87a3e78 EFLAGS: 00010246 RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010 RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50 R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060 R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800 FS: 00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0 Call Trace: proc_sys_call_handler+0xb3/0xc0 proc_sys_write+0x14/0x20 vfs_write+0xba/0x1e0 SyS_write+0x46/0xb0 tracesys+0xe1/0xe6 However, if the percpu_pagelist_fraction sysctl is set by the user, it is also impossible to restore it to the kernel default since the user cannot write 0 to the sysctl. This patch allows the user to write 0 to restore the default behavior. It still requires a fraction equal to or larger than 8, however, as stated by the documentation for sanity. If a value in the range [1, 7] is written, the sysctl will return EINVAL. This successfully solves the divide by zero issue at the same time. 
Signed-off-by: David Rientjes Reported-by: Oleg Drokin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7de6555cfea0..075d1903138f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; -static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -1317,7 +1316,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &min_percpu_pagelist_fract, + .extra1 = &zero, }, #ifdef CONFIG_MMU { -- cgit v1.2.3 From ed235875e2ca983197831337a986f0517074e1a0 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: kernel/watchdog.c: print traces for all cpus on lockup detection A 'softlockup' is defined as a bug that causes the kernel to loop in kernel mode for more than a predefined period to time, without giving other tasks a chance to run. Currently, upon detection of this condition by the per-cpu watchdog task, debug information (including a stack trace) is sent to the system log. On some occasions, we have observed that the "victim" rather than the actual "culprit" (i.e. the owner/holder of the contended resource) is reported to the user. Often this information has proven to be insufficient to assist debugging efforts. To avoid loss of useful debug information, for architectures which support NMI, this patch makes it possible to improve soft lockup reporting. This is accomplished by issuing an NMI to each cpu to obtain a stack trace. If NMI is not supported we just revert back to the old method. A sysctl and boot-time parameter is available to toggle this feature. [dzickus@redhat.com: add CONFIG_SMP in certain areas] [akpm@linux-foundation.org: additional CONFIG_SMP=n optimisations] [mq@suse.cz: fix warning] Signed-off-by: Aaron Tomlin Signed-off-by: Don Zickus Cc: David S. 
Miller Cc: Mateusz Guzik Cc: Oleg Nesterov Signed-off-by: Jan Moskyto Matejka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 +++++++++++ kernel/watchdog.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 075d1903138f..75b22e22a72c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -860,6 +860,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ { .procname = "nmi_watchdog", .data = &watchdog_user_enabled, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 30e482240dae..c3319bd1b040 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,12 @@ int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#endif + static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif +static unsigned long soft_lockup_nmi_warn; /* boot commands */ /* @@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); /* */ +#ifdef CONFIG_SMP +static int __init softlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_softlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; + int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; /* kick the hardlockup detector */ watchdog_interrupt_count(); @@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; + if (softlockup_all_cpu_backtrace) { + /* Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping cpu back traces + */ + if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { + /* Someone else will report us. Let's give up */ + __this_cpu_write(soft_watchdog_warn, true); + return HRTIMER_RESTART; + } + } + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! 
[%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); + if (softlockup_all_cpu_backtrace) { + /* Avoid generating two back traces for current + * given that one is already made above + */ + trigger_allbutself_cpu_backtrace(); + + clear_bit(0, &soft_lockup_nmi_warn); + /* Barrier to sync with other cpus */ + smp_mb__after_atomic(); + } + if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); -- cgit v1.2.3 From 391acf970d21219a2a5446282d3b20eace0c0d7a Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 25 Jun 2014 09:57:18 +0800 Subject: cpuset,mempolicy: fix sleeping function called from invalid context When runing with the kernel(3.15-rc7+), the follow bug occurs: [ 9969.258987] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:586 [ 9969.359906] in_atomic(): 1, irqs_disabled(): 0, pid: 160655, name: python [ 9969.441175] INFO: lockdep is turned off. [ 9969.488184] CPU: 26 PID: 160655 Comm: python Tainted: G A 3.15.0-rc7+ #85 [ 9969.581032] Hardware name: FUJITSU-SV PRIMEQUEST 1800E/SB, BIOS PRIMEQUEST 1000 Series BIOS Version 1.39 11/16/2012 [ 9969.706052] ffffffff81a20e60 ffff8803e941fbd0 ffffffff8162f523 ffff8803e941fd18 [ 9969.795323] ffff8803e941fbe0 ffffffff8109995a ffff8803e941fc58 ffffffff81633e6c [ 9969.884710] ffffffff811ba5dc ffff880405c6b480 ffff88041fdd90a0 0000000000002000 [ 9969.974071] Call Trace: [ 9970.003403] [] dump_stack+0x4d/0x66 [ 9970.065074] [] __might_sleep+0xfa/0x130 [ 9970.130743] [] mutex_lock_nested+0x3c/0x4f0 [ 9970.200638] [] ? kmem_cache_alloc+0x1bc/0x210 [ 9970.272610] [] cpuset_mems_allowed+0x27/0x140 [ 9970.344584] [] ? __mpol_dup+0x63/0x150 [ 9970.409282] [] __mpol_dup+0xe5/0x150 [ 9970.471897] [] ? __mpol_dup+0x63/0x150 [ 9970.536585] [] ? copy_process.part.23+0x606/0x1d40 [ 9970.613763] [] ? trace_hardirqs_on+0xd/0x10 [ 9970.683660] [] ? monotonic_to_bootbased+0x2f/0x50 [ 9970.759795] [] copy_process.part.23+0x670/0x1d40 [ 9970.834885] [] do_fork+0xd8/0x380 [ 9970.894375] [] ? __audit_syscall_entry+0x9c/0xf0 [ 9970.969470] [] SyS_clone+0x16/0x20 [ 9971.030011] [] stub_clone+0x69/0x90 [ 9971.091573] [] ? system_call_fastpath+0x16/0x1b The cause is that cpuset_mems_allowed() try to take mutex_lock(&callback_mutex) under the rcu_read_lock(which was hold in __mpol_dup()). And in cpuset_mems_allowed(), the access to cpuset is under rcu_read_lock, so in __mpol_dup, we can reduce the rcu_read_lock protection region to protect the access to cpuset only in current_cpuset_is_being_rebound(). So that we can avoid this bug. This patch is a temporary solution that just addresses the bug mentioned above, can not fix the long-standing issue about cpuset.mems rebinding on fork(): "When the forker's task_struct is duplicated (which includes ->mems_allowed) and it races with an update to cpuset_being_rebound in update_tasks_nodemask() then the task's mems_allowed doesn't get updated. And the child task's mems_allowed can be wrong if the cpuset's nodemask changes before the child has been added to the cgroup's tasklist." 
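The shape of the fix, shrinking the non-sleepable critical section so that a sleeping lock is never acquired inside it, can be sketched in user space. Here a pthread spinlock stands in for rcu_read_lock() and a mutex for callback_mutex; these are analogies only, not the kernel primitives.

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t guard;        /* analogy for rcu_read_lock() */
static pthread_mutex_t cb_mutex = PTHREAD_MUTEX_INITIALIZER; /* analogy for callback_mutex */
static int being_rebound;

/* After the fix: the non-sleepable lock covers only the read it protects. */
static int currently_being_rebound(void)
{
        int ret;

        pthread_spin_lock(&guard);
        ret = being_rebound;
        pthread_spin_unlock(&guard);
        return ret;
}

/* May block on cb_mutex, so it must never run with 'guard' held. */
static void mems_allowed(void)
{
        pthread_mutex_lock(&cb_mutex);
        /* ... copy the allowed set ... */
        pthread_mutex_unlock(&cb_mutex);
}

int main(void)
{
        pthread_spin_init(&guard, PTHREAD_PROCESS_PRIVATE);

        if (currently_being_rebound())
                printf("rebind in progress\n");
        mems_allowed();                 /* taken outside the spinlocked section */

        pthread_spin_destroy(&guard);
        return 0;
}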
Signed-off-by: Gu Zheng Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable --- kernel/cpuset.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f6b33c696224..d3df02e76643 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1181,7 +1181,13 @@ done: int current_cpuset_is_being_rebound(void) { - return task_cs(current) == cpuset_being_rebound; + int ret; + + rcu_read_lock(); + ret = task_cs(current) == cpuset_being_rebound; + rcu_read_unlock(); + + return ret; } static int update_relax_domain_level(struct cpuset *cs, s64 val) -- cgit v1.2.3 From 970317aa48c6ef66cd023c039c2650c897bad927 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 30 Jun 2014 11:49:58 +0800 Subject: cgroup: fix mount failure in a corner case # cat test.sh #! /bin/bash mount -t cgroup -o cpu xxx /cgroup umount /cgroup mount -t cgroup -o cpu,cpuacct xxx /cgroup umount /cgroup # ./test.sh mount: xxx already mounted or /cgroup busy mount: according to mtab, xxx is already mounted on /cgroup It's because the cgroupfs_root of the first mount was under destruction asynchronously. Fix this by delaying and then retrying mount for this case. v3: - put the refcnt immediately after getting it. (Tejun) v2: - use percpu_ref_tryget_live() rather that introducing percpu_ref_alive(). (Tejun) - adjust comment. tj: Updated the comment a bit. Cc: # 3.15 Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d9a8be911f5b..64068667be84 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; struct dentry *dentry; int ret; + int i; bool new_sb; /* @@ -1677,6 +1679,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } + /* + * Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. + */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + for_each_root(root) { bool name_match = false; -- cgit v1.2.3 From 3a32bd72d77058d768dbb38183ad517f720dd1bc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 30 Jun 2014 11:50:59 +0800 Subject: cgroup: fix a race between cgroup_mount() and cgroup_kill_sb() We've converted cgroup to kernfs so cgroup won't be intertwined with vfs objects and locking, but there are dark areas. Run two instances of this script concurrently: for ((; ;)) { mount -t cgroup -o cpuacct xxx /cgroup umount /cgroup } After a while, I saw two mount processes were stuck at retrying, because they were waiting for a subsystem to become free, but the root associated with this subsystem never got freed. 
This can happen, if thread A is in the process of killing superblock but hasn't called percpu_ref_kill(), and at this time thread B is mounting the same cgroup root and finds the root in the root list and performs percpu_ref_try_get(). To fix this, we try to increase both the refcnt of the superblock and the percpu refcnt of cgroup root. v2: - we should try to get both the superblock refcnt and cgroup_root refcnt, because cgroup_root may have no superblock assosiated with it. - adjust/add comments. tj: Updated comments. Renamed @sb to @pinned_sb. Cc: # 3.15 Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 64068667be84..70776aec2562 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1648,6 +1648,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; @@ -1740,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } /* - * A root's lifetime is governed by its root cgroup. - * tryget_live failure indicate that the root is being - * destroyed. Wait for destruction to complete so that the - * subsystems are free. We can use wait_queue for the wait - * but this path is super cold. Let's just sleep for a bit - * and retry. + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. */ - if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); msleep(10); ret = restart_syscall(); goto out_free; @@ -1793,6 +1802,16 @@ out_free: CGROUP_SUPER_MAGIC, &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) { + WARN_ON(new_sb); + deactivate_super(pinned_sb); + } + return dentry; } -- cgit v1.2.3 From 48212542067a7ff6cbe829dbae279c2ff7557b44 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 27 Jun 2014 19:01:36 +0200 Subject: tracing/uprobes: Revert "Support mix of ftrace and perf" This reverts commit 43fe98913c9f67e3b523615ee3316f9520a623e0. This patch is very wrong. Firstly, this change leads to unbalanced uprobe_unregister(). Just for example, # perf probe -x /lib/libc.so.6 syscall # echo 1 >> /sys/kernel/debug/tracing/events/probe_libc/enable # perf record -e probe_libc:syscall whatever after that uprobe is dead (unregistered) but the user of ftrace/perf can't know this, and it looks as if nobody hits this probe. This would be easy to fix, but there are other reasons why it is not simple to mix ftrace and perf. 
If nothing else, they can't share the same ->consumer.filter. This is fixable too, but probably we need to fix the poorly designed uprobe_register() interface first. At least "register" and "apply" should be clearly separated. Link: http://lkml.kernel.org/p/20140627170136.GA18319@redhat.com Cc: Tom Zanussi Cc: "zhangwei(Jovi)" Cc: stable@vger.kernel.org # v3.14 Acked-by: Namhyung Kim Acked-by: Srikar Dronamraju Reviewed-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 04fdb5de823c..08e7970bf3f9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, int ret; if (file) { + if (tu->tp.flags & TP_FLAG_PROFILE) + return -EINTR; + link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) return -ENOMEM; @@ -901,8 +904,12 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, list_add_tail_rcu(&link->list, &tu->tp.files); tu->tp.flags |= TP_FLAG_TRACE; - } else + } else { + if (tu->tp.flags & TP_FLAG_TRACE) + return -EINTR; + tu->tp.flags |= TP_FLAG_PROFILE; + } ret = uprobe_buffer_enable(); if (ret < 0) -- cgit v1.2.3 From 06d0713904e508f765e0d7146c14b67bbd248fe7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 27 Jun 2014 19:01:40 +0200 Subject: uprobes: Change unregister/apply to WARN() if uprobe/consumer is gone Add WARN_ON's into uprobe_unregister() and uprobe_apply() to ensure that nobody tries to play with the dead uprobe/consumer. This helps to catch the bugs like the one fixed by the previous patch. In the longer term we should fix this poorly designed interface. uprobe_register() should return "struct uprobe *" which should be passed to apply/unregister. Plus other semantic changes, see the changelog in commit 41ccba029e94. Link: http://lkml.kernel.org/p/20140627170140.GA18322@redhat.com Acked-by: Namhyung Kim Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/events/uprobes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c445e392e93f..6f3254e8c137 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -846,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u { int err; - if (!consumer_del(uprobe, uc)) /* WARN? 
*/ + if (WARN_ON(!consumer_del(uprobe, uc))) return; err = register_for_each_vma(uprobe, NULL); @@ -927,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset, int ret = -ENOENT; uprobe = find_uprobe(inode, offset); - if (!uprobe) + if (WARN_ON(!uprobe)) return ret; down_write(&uprobe->register_rwsem); @@ -952,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume struct uprobe *uprobe; uprobe = find_uprobe(inode, offset); - if (!uprobe) + if (WARN_ON(!uprobe)) return; down_write(&uprobe->register_rwsem); -- cgit v1.2.3 From f786106e8081bbec57053fec7fcf25dc25d02144 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 27 Jun 2014 19:01:43 +0200 Subject: tracing/uprobes: Kill the bogus UPROBE_HANDLER_REMOVE code in uprobe_dispatcher() I do not know why dd9fa555d7bb "tracing/uprobes: Move argument fetching to uprobe_dispatcher()" added the UPROBE_HANDLER_REMOVE, but it looks wrong. OK, perhaps it makes sense to avoid store_trace_args() if the tracee is nacked by uprobe_perf_filter(). But then we should kill the same code in uprobe_perf_func() and unify the TRACE/PROFILE filtering (we need to do this anyway to mix perf/ftrace). Until then this code actually adds the pessimization because uprobe_perf_filter() will be called twice and return T in likely case. Link: http://lkml.kernel.org/p/20140627170143.GA18329@redhat.com Acked-by: Namhyung Kim Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 08e7970bf3f9..c4cf0abd60ba 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1208,12 +1208,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) current->utask->vaddr = (unsigned long) &udd; -#ifdef CONFIG_PERF_EVENTS - if ((tu->tp.flags & TP_FLAG_TRACE) == 0 && - !uprobe_perf_filter(&tu->consumer, 0, current->mm)) - return UPROBE_HANDLER_REMOVE; -#endif - if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; -- cgit v1.2.3 From fb6bab6a5ad46d00b5ffa22268f21df1cd7c59df Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 27 Jun 2014 19:01:46 +0200 Subject: tracing/uprobes: Fix the usage of uprobe_buffer_enable() in probe_event_enable() The usage of uprobe_buffer_enable() added by dcad1a20 is very wrong, 1. uprobe_buffer_enable() and uprobe_buffer_disable() are not balanced, _enable() should be called only if !enabled. 2. If uprobe_buffer_enable() fails probe_event_enable() should clear tp.flags and free event_file_link. 3. If uprobe_register() fails it should do uprobe_buffer_disable(). 
Link: http://lkml.kernel.org/p/20140627170146.GA18332@redhat.com Acked-by: Namhyung Kim Acked-by: Srikar Dronamraju Reviewed-by: Masami Hiramatsu Fixes: dcad1a204f72 "tracing/uprobes: Fetch args before reserving a ring buffer" Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c4cf0abd60ba..3c9b97e6b1f4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -911,26 +911,33 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, tu->tp.flags |= TP_FLAG_PROFILE; } - ret = uprobe_buffer_enable(); - if (ret < 0) - return ret; - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); if (enabled) return 0; + ret = uprobe_buffer_enable(); + if (ret) + goto err_flags; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); - if (ret) { - if (file) { - list_del(&link->list); - kfree(link); - tu->tp.flags &= ~TP_FLAG_TRACE; - } else - tu->tp.flags &= ~TP_FLAG_PROFILE; - } + if (ret) + goto err_buffer; + return 0; + + err_buffer: + uprobe_buffer_disable(); + + err_flags: + if (file) { + list_del(&link->list); + kfree(link); + tu->tp.flags &= ~TP_FLAG_TRACE; + } else { + tu->tp.flags &= ~TP_FLAG_PROFILE; + } return ret; } -- cgit v1.2.3 From 099ed151675cd1d2dbeae1dac697975f6a68716d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 24 Jun 2014 23:50:09 -0400 Subject: tracing: Remove ftrace_stop/start() from reading the trace file Disabling reading and writing to the trace file should not be able to disable all function tracing callbacks. There are other users today (like kprobes and perf). Reading a trace file should not stop those from happening. Cc: stable@vger.kernel.org # 3.0+ Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 384ede311717..f243444a3772 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1396,7 +1396,6 @@ void tracing_start(void) arch_spin_unlock(&global_trace.max_lock); - ftrace_start(); out: raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); } @@ -1443,7 +1442,6 @@ void tracing_stop(void) struct ring_buffer *buffer; unsigned long flags; - ftrace_stop(); raw_spin_lock_irqsave(&global_trace.start_lock, flags); if (global_trace.stop_count++) goto out; -- cgit v1.2.3 From 76bb5ab8f6e3e7bebdcefec4146ff305e7d0b465 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Jun 2014 15:47:32 -0400 Subject: cpuset: break kernfs active protection in cpuset_write_resmask() Writing to either the "cpuset.cpus" or "cpuset.mems" file flushes cpuset_hotplug_work so that cpu or memory hotunplug doesn't end up migrating tasks off a cpuset after new resources are added to it. As cpuset_hotplug_work calls into cgroup core via cgroup_transfer_tasks(), this flushing adds a dependency on cgroup core locking from cpuset_write_resmask().
This used to be okay because cgroup interface files were protected by a different mutex; however, 8353da1f91f1 ("cgroup: remove cgroup_tree_mutex") simplified the cgroup core locking and this dependency became a deadlock hazard - cgroup file removal performed under cgroup core lock tries to drain on-going file operation which is trying to flush cpuset_hotplug_work blocked on the same cgroup core lock. The locking simplification was done because kernfs added an a lot easier way to deal with circular dependencies involving kernfs active protection. Let's use the same strategy in cpuset and break active protection in cpuset_write_resmask(). While it isn't the prettiest, this is a very rare, likely unique, situation which also goes away on the unified hierarchy. The commands to trigger the deadlock warning without the patch and the lockdep output follow. localhost:/ # mount -t cgroup -o cpuset xxx /cpuset localhost:/ # mkdir /cpuset/tmp localhost:/ # echo 1 > /cpuset/tmp/cpuset.cpus localhost:/ # echo 0 > cpuset/tmp/cpuset.mems localhost:/ # echo $$ > /cpuset/tmp/tasks localhost:/ # echo 0 > /sys/devices/system/cpu/cpu1/online ====================================================== [ INFO: possible circular locking dependency detected ] 3.16.0-rc1-0.1-default+ #7 Not tainted ------------------------------------------------------- kworker/1:0/32649 is trying to acquire lock: (cgroup_mutex){+.+.+.}, at: [] cgroup_transfer_tasks+0x37/0x150 but task is already holding lock: (cpuset_hotplug_work){+.+...}, at: [] process_one_work+0x192/0x520 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (cpuset_hotplug_work){+.+...}: ... -> #1 (s_active#175){++++.+}: ... -> #0 (cgroup_mutex){+.+.+.}: ... other info that might help us debug this: Chain exists of: cgroup_mutex --> s_active#175 --> cpuset_hotplug_work Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(cpuset_hotplug_work); lock(s_active#175); lock(cpuset_hotplug_work); lock(cgroup_mutex); *** DEADLOCK *** 2 locks held by kworker/1:0/32649: #0: ("events"){.+.+.+}, at: [] process_one_work+0x192/0x520 #1: (cpuset_hotplug_work){+.+...}, at: [] process_one_work+0x192/0x520 stack backtrace: CPU: 1 PID: 32649 Comm: kworker/1:0 Not tainted 3.16.0-rc1-0.1-default+ #7 ... Call Trace: [] dump_stack+0x72/0x8a [] print_circular_bug+0x10f/0x120 [] check_prev_add+0x43e/0x4b0 [] validate_chain+0x656/0x7c0 [] __lock_acquire+0x382/0x660 [] lock_acquire+0xf9/0x170 [] mutex_lock_nested+0x6f/0x380 [] cgroup_transfer_tasks+0x37/0x150 [] hotplug_update_tasks_insane+0x110/0x1d0 [] cpuset_hotplug_update_tasks+0x13d/0x180 [] cpuset_hotplug_workfn+0x18c/0x630 [] process_one_work+0x254/0x520 [] worker_thread+0x13d/0x3d0 [] kthread+0xf8/0x100 [] ret_from_fork+0x7c/0xb0 Signed-off-by: Tejun Heo Reported-by: Li Zefan Tested-by: Li Zefan --- kernel/cpuset.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d3df02e76643..116a4164720a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1623,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. 
+ * + * cpuset_hotplug_work calls back into cgroup core via + * cgroup_transfer_tasks() and waiting for it from a cgroupfs + * operation like this one can lead to a deadlock through kernfs + * active_ref protection. Let's break the protection. Losing the + * protection is okay as we check whether @cs is online after + * grabbing cpuset_mutex anyway. This only happens on the legacy + * hierarchies. */ + css_get(&cs->css); + kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); mutex_lock(&cpuset_mutex); @@ -1651,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); + kernfs_unbreak_active_protection(of->kn); + css_put(&cs->css); return retval ?: nbytes; } -- cgit v1.2.3 From d18bbc215f81710e1eab7120becafa910554d68d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Jul 2014 15:22:38 -0700 Subject: kernel/printk/printk.c: revert "printk: enable interrupts before calling console_trylock_for_printk()" Revert commit 939f04bec1a4 ("printk: enable interrupts before calling console_trylock_for_printk()"). Andreas reported: : None of the post 3.15 kernel boot for me. They all hang at the GRUB : screen telling me it loaded and started the kernel, but the kernel : itself stops before it prints anything (or even replaces the GRUB : background graphics). 939f04bec1a4 is modest latency reduction. Revert it until we understand the reason for these failures. Reported-by: Andreas Bombe Cc: Jan Kara Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ea2d5f6962ed..13e839dbca07 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1416,9 +1416,10 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1431,10 +1432,8 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(void) +static int console_trylock_for_printk(unsigned int cpu) { - unsigned int cpu = smp_processor_id(); - if (!console_trylock()) return 0; /* @@ -1609,8 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - local_irq_restore(flags); - return 0; + goto out_restore_irqs; } zap_locks(); } @@ -1718,27 +1716,21 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); - lockdep_on(); - local_irq_restore(flags); /* If called from the scheduler, we can not call up(). 
*/ - if (in_sched) - return printed_len; - - /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to console - */ - preempt_disable(); - /* - * Try to acquire and then immediately release the console semaphore. - * The release will print out buffers and wake up /dev/kmsg and syslog() - * users. - */ - if (console_trylock_for_printk()) - console_unlock(); - preempt_enable(); + if (!in_sched) { + /* + * Try to acquire and then immediately release the console + * semaphore. The release will print out buffers and wake up + * /dev/kmsg and syslog() users. + */ + if (console_trylock_for_printk(this_cpu)) + console_unlock(); + } + lockdep_on(); +out_restore_irqs: + local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit); -- cgit v1.2.3 From 8844aad89ed61545b4db6a3467e1b21ca1c49460 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 30 Jun 2014 16:24:44 -0600 Subject: genirq: Fix memory leak when calling irq_free_hwirqs() irq_free_hwirqs() always calls irq_free_descs() with a cnt == 0 which makes it a no-op since the interrupt count to free is decremented in itself. Fixes: 7b6ef1262549f6afc5c881aaef80beb8fd15f908 Signed-off-by: Keith Busch Acked-by: David Rientjes Link: http://lkml.kernel.org/r/1404167084-8070-1-git-send-email-keith.busch@intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7339e42a85ab..1487a123db5c 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); */ void irq_free_hwirqs(unsigned int from, int cnt) { - int i; + int i, j; - for (i = from; cnt > 0; i++, cnt--) { + for (i = from, j = cnt; j > 0; i++, j--) { irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); arch_teardown_hwirq(i); } -- cgit v1.2.3 From 5a6024f1604eef119cf3a6fa413fe0261a81a8f3 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 7 Jul 2014 09:56:48 -0400 Subject: workqueue: zero cpumask of wq_numa_possible_cpumask on init When hot-adding and onlining CPU, kernel panic occurs, showing following call trace. BUG: unable to handle kernel paging request at 0000000000001d08 IP: [] __alloc_pages_nodemask+0x9d/0xb10 PGD 0 Oops: 0000 [#1] SMP ... Call Trace: [] ? cpumask_next_and+0x35/0x50 [] ? find_busiest_group+0x113/0x8f0 [] ? deactivate_slab+0x349/0x3c0 [] new_slab+0x91/0x300 [] __slab_alloc+0x2bb/0x482 [] ? copy_process.part.25+0xfc/0x14c0 [] ? load_balance+0x218/0x890 [] ? sched_clock+0x9/0x10 [] ? trace_clock_local+0x9/0x10 [] kmem_cache_alloc_node+0x8c/0x200 [] copy_process.part.25+0xfc/0x14c0 [] ? trace_buffer_unlock_commit+0x4d/0x60 [] ? kthread_create_on_node+0x140/0x140 [] do_fork+0xbc/0x360 [] kernel_thread+0x26/0x30 [] kthreadd+0x2c2/0x300 [] ? kthread_create_on_cpu+0x60/0x60 [] ret_from_fork+0x7c/0xb0 [] ? kthread_create_on_cpu+0x60/0x60 In my investigation, I found the root cause is wq_numa_possible_cpumask. All entries of wq_numa_possible_cpumask is allocated by alloc_cpumask_var_node(). And these entries are used without initializing. So these entries have wrong value. When hot-adding and onlining CPU, wq_update_unbound_numa() is called. wq_update_unbound_numa() calls alloc_unbound_pwq(). And alloc_unbound_pwq() calls get_unbound_pool(). 
In get_unbound_pool(), worker_pool->node is set as follows: /* if cpumask is contained inside a NUMA node, we belong to that node */ if (wq_numa_enabled) { for_each_node(node) { if (cpumask_subset(pool->attrs->cpumask, wq_numa_possible_cpumask[node])) { pool->node = node; break; } } } But wq_numa_possible_cpumask[node] does not contain the correct cpumask, so the wrong node is selected and the kernel panics. With this patch, all entries of wq_numa_possible_cpumask are allocated with zalloc_cpumask_var_node() so that they start out zeroed, and the panic no longer occurs. Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Fixes: bce903809ab3 ("workqueue: add wq_numa_tbl_len and wq_numa_possible_cpumask[]") --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6f5f9c7323f4..35974ac69600 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4880,7 +4880,7 @@ static void __init wq_numa_init(void) BUG_ON(!tbl); for_each_node(node) - BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, + BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node_online(node) ? node : NUMA_NO_NODE)); for_each_possible_cpu(cpu) { -- cgit v1.2.3
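The one-character change above is easy to read past; spelled out, the init path now guarantees that every per-node mask starts out zeroed before the bits for possible CPUs are set. A sketch of the relevant part of wq_numa_init() follows; the lines not visible in the diff context (the per-CPU loop body and the final assignment) are filled in by assumption.

static void __init wq_numa_init(void)
{
	cpumask_var_t *tbl;
	int node, cpu;

	/* ... feature checks and the allocation of tbl elided ... */

	for_each_node(node)
		/*
		 * zalloc_cpumask_var_node() returns a zeroed mask.  The loop
		 * below only sets bits for possible CPUs, so every bit it does
		 * not touch must already be 0; with the plain alloc_ variant
		 * those bits were whatever happened to be in the allocation,
		 * and the cpumask_subset() test in get_unbound_pool() could
		 * then pick the wrong node.
		 */
		BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
				node_online(node) ? node : NUMA_NO_NODE));

	for_each_possible_cpu(cpu) {
		node = cpu_to_node(cpu);		/* assumed loop body */
		cpumask_set_cpu(cpu, tbl[node]);
	}

	wq_numa_possible_cpumask = tbl;			/* assumed */
}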
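Looking back at the irq_free_hwirqs() fix a few commits above, the point of introducing the extra counter j is that cnt itself has to survive the teardown loop so that irq_free_descs() finally receives a non-zero count. A sketch of the fixed function, assembled from that diff and its changelog (an illustration, not the verbatim source):

void irq_free_hwirqs(unsigned int from, int cnt)
{
	int i, j;

	/* tear down each hw irq, leaving cnt untouched ... */
	for (i = from, j = cnt; j > 0; i++, j--) {
		irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
		arch_teardown_hwirq(i);
	}

	/*
	 * ... so this call no longer degenerates into
	 * irq_free_descs(from, 0), which freed nothing and leaked the
	 * descriptors.
	 */
	irq_free_descs(from, cnt);
}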