From fbd44a607a1a5019bc32c3615cead8c5ee8f89c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 May 2013 20:22:36 +0200 Subject: tick: Use zalloc_cpumask_var for allocating offstack cpumasks commit b352bc1cbc (tick: Convert broadcast cpu bitmaps to cpumask_var_t) broke CONFIG_CPUMASK_OFFSTACK in a very subtle way. Instead of allocating the cpumasks with zalloc_cpumask_var it uses alloc_cpumask_var, so we can get random data there, which of course confuses the logic completely and causes random failures. Reported-and-tested-by: Dave Jones Reported-and-tested-by: Yinghai Lu Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305032015060.2990@ionos Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 61d00a8cdf2f..d70cdc42c829 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -785,11 +785,11 @@ bool tick_broadcast_oneshot_available(void) void __init tick_broadcast_init(void) { - alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); - alloc_cpumask_var(&tmpmask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT - alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); - alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); - alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); #endif } -- cgit v1.2.3 From 524eff183f51d080a83b348d0ea97c08b3607b9a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 May 2013 18:27:17 +0200 Subject: perf: Fix EXIT event notification The perf_event_task_ctx() function needs to be called with preemption disabled, since it's checking for currently scheduled cpu against event cpu. We disable preemption for task related perf event context if there's one defined, leaving up to the chance which cpu it gets scheduled in. 
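As a minimal sketch of that pattern (the helper and data names below are illustrative stand-ins, not the exact kernel signatures), the per-task context pass is simply wrapped in a preempt-disabled section so the CPU cannot change while event->cpu is compared against the local CPU:

static void deliver_task_events(struct perf_event_context *task_ctx, void *data)
{
	rcu_read_lock();
	/* ... per-PMU CPU contexts are walked here via get_cpu_ptr() ... */
	if (task_ctx) {
		preempt_disable();	/* pin the current CPU for the cpu check */
		emit_ctx_events(task_ctx, data);	/* stand-in for perf_event_task_ctx() */
		preempt_enable();
	}
	rcu_read_unlock();
}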
Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Stephane Eranian Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1367857638-27631-2-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6b41c1899a8b..38b68a05c3c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4474,7 +4474,7 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, static void perf_event_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; + struct perf_event_context *ctx, *task_ctx = task_event->task_ctx; struct pmu *pmu; int ctxn; @@ -4485,20 +4485,22 @@ static void perf_event_task_event(struct perf_task_event *task_event) goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); - ctx = task_event->task_ctx; - if (!ctx) { - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_task_ctx(ctx, task_event); - } + if (task_ctx) + goto next; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_task_ctx(ctx, task_event); next: put_cpu_ptr(pmu->pmu_cpu_context); } - if (task_event->task_ctx) - perf_event_task_ctx(task_event->task_ctx, task_event); + if (task_ctx) { + preempt_disable(); + perf_event_task_ctx(task_ctx, task_event); + preempt_enable(); + } rcu_read_unlock(); } -- cgit v1.2.3 From 52d857a8784a09576215c71cebf368d61c12a754 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 May 2013 18:27:18 +0200 Subject: perf: Factor out auxiliary events notification Add perf_event_aux() function to send out all types of auxiliary events - mmap, task, comm events. For each type there's match and output functions defined and used as callbacks during perf_event_aux processing. This way we can centralize the pmu/context iterating and event matching logic. 
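In condensed form (a sketch of the scheme the diff below introduces, not additional code), each event type only supplies a match and an output callback and makes a single perf_event_aux() call; for comm events, for example:

typedef int  (perf_event_aux_match_cb)(struct perf_event *event, void *data);
typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);

/* per-type policy: does this event want comm records? */
static int perf_event_comm_match(struct perf_event *event, void *data)
{
	return event->attr.comm;
}

/* called from perf_event_comm_event(); the shared iterator walks every
 * pmu/cpu/task context exactly once and applies match + output */
perf_event_aux(perf_event_comm_match, perf_event_comm_output, comm_event, NULL);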
Also since lot of the code was duplicated, this patch reduces the .text size about 2kB on my setup: snipped output from 'objdump -x kernel/events/core.o' before: Idx Name Size 0 .text 0000d313 after: Idx Name Size 0 .text 0000cad3 Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1367857638-27631-3-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 242 +++++++++++++++++++-------------------------------- 1 file changed, 89 insertions(+), 153 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 38b68a05c3c6..9dc297faf7c0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4394,6 +4394,64 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } +typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); +typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); + +static void +perf_event_aux_ctx(struct perf_event_context *ctx, + perf_event_aux_match_cb match, + perf_event_aux_output_cb output, + void *data) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + if (match(event, data)) + output(event, data); + } +} + +static void +perf_event_aux(perf_event_aux_match_cb match, + perf_event_aux_output_cb output, + void *data, + struct perf_event_context *task_ctx) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int ctxn; + + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->unique_pmu != pmu) + goto next; + perf_event_aux_ctx(&cpuctx->ctx, match, output, data); + if (task_ctx) + goto next; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_aux_ctx(ctx, match, output, data); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + + if (task_ctx) { + preempt_disable(); + perf_event_aux_ctx(task_ctx, match, output, data); + preempt_enable(); + } + rcu_read_unlock(); +} + /* * task tracking -- fork/exit * @@ -4416,8 +4474,9 @@ struct perf_task_event { }; static void perf_event_task_output(struct perf_event *event, - struct perf_task_event *task_event) + void *data) { + struct perf_task_event *task_event = data; struct perf_output_handle handle; struct perf_sample_data sample; struct task_struct *task = task_event->task; @@ -4445,64 +4504,11 @@ out: task_event->event_id.header.size = size; } -static int perf_event_task_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm || event->attr.mmap || - event->attr.mmap_data || event->attr.task) - return 1; - - return 0; -} - -static void perf_event_task_ctx(struct perf_event_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_task_match(event)) - perf_event_task_output(event, task_event); - } -} - -static void perf_event_task_event(struct perf_task_event *task_event) +static int perf_event_task_match(struct perf_event *event, + void *data 
__maybe_unused) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx, *task_ctx = task_event->task_ctx; - struct pmu *pmu; - int ctxn; - - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_task_ctx(&cpuctx->ctx, task_event); - - if (task_ctx) - goto next; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_task_ctx(ctx, task_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - if (task_ctx) { - preempt_disable(); - perf_event_task_ctx(task_ctx, task_event); - preempt_enable(); - } - - rcu_read_unlock(); + return event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task; } static void perf_event_task(struct task_struct *task, @@ -4533,7 +4539,10 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_task_event(&task_event); + perf_event_aux(perf_event_task_match, + perf_event_task_output, + &task_event, + task_ctx); } void perf_event_fork(struct task_struct *task) @@ -4559,8 +4568,9 @@ struct perf_comm_event { }; static void perf_event_comm_output(struct perf_event *event, - struct perf_comm_event *comm_event) + void *data) { + struct perf_comm_event *comm_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = comm_event->event_id.header.size; @@ -4587,39 +4597,16 @@ out: comm_event->event_id.header.size = size; } -static int perf_event_comm_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm) - return 1; - - return 0; -} - -static void perf_event_comm_ctx(struct perf_event_context *ctx, - struct perf_comm_event *comm_event) +static int perf_event_comm_match(struct perf_event *event, + void *data __maybe_unused) { - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_comm_match(event)) - perf_event_comm_output(event, comm_event); - } + return event->attr.comm; } static void perf_event_comm_event(struct perf_comm_event *comm_event) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; char comm[TASK_COMM_LEN]; unsigned int size; - struct pmu *pmu; - int ctxn; memset(comm, 0, sizeof(comm)); strlcpy(comm, comm_event->task->comm, sizeof(comm)); @@ -4629,24 +4616,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_comm_ctx(&cpuctx->ctx, comm_event); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); + perf_event_aux(perf_event_comm_match, + perf_event_comm_output, + comm_event, + NULL); } void perf_event_comm(struct task_struct *task) @@ -4708,8 +4682,9 @@ struct perf_mmap_event { }; static void perf_event_mmap_output(struct perf_event *event, - struct perf_mmap_event *mmap_event) + void *data) { + struct perf_mmap_event *mmap_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = 
mmap_event->event_id.header.size; @@ -4736,46 +4711,24 @@ out: } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event, - int executable) + void *data) { - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if ((!executable && event->attr.mmap_data) || - (executable && event->attr.mmap)) - return 1; - - return 0; -} - -static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event, - int executable) -{ - struct perf_event *event; + struct perf_mmap_event *mmap_event = data; + struct vm_area_struct *vma = mmap_event->vma; + int executable = vma->vm_flags & VM_EXEC; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event, executable)) - perf_event_mmap_output(event, mmap_event); - } + return (!executable && event->attr.mmap_data) || + (executable && event->attr.mmap); } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; unsigned int size; char tmp[16]; char *buf = NULL; const char *name; - struct pmu *pmu; - int ctxn; memset(tmp, 0, sizeof(tmp)); @@ -4831,27 +4784,10 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, - vma->vm_flags & VM_EXEC); - - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) { - perf_event_mmap_ctx(ctx, mmap_event, - vma->vm_flags & VM_EXEC); - } -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); + perf_event_aux(perf_event_mmap_match, + perf_event_mmap_output, + mmap_event, + NULL); kfree(buf); } -- cgit v1.2.3 From d3251859168b0b12841e1b90d6d768ab478dc23d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 May 2013 11:10:17 -0700 Subject: workqueue: workqueue_congested() shouldn't translate WORK_CPU_UNBOUND into node number df2d5ae499 ("workqueue: map an unbound workqueues to multiple per-node pool_workqueues") made unbound workqueues to map to multiple per-node pool_workqueues and accordingly updated workqueue_contested() so that, for unbound workqueues, it maps the specified @cpu to the NUMA node number to obtain the matching pool_workqueue to query the congested state. Before this change, workqueue_congested() ignored @cpu for unbound workqueues as there was only one pool_workqueue and some users (fscache) called it with WORK_CPU_UNBOUND. After the commit, this causes the following oops as WORK_CPU_UNBOUND gets translated to garbage by cpu_to_node(). 
BUG: unable to handle kernel paging request at ffff8803598d98b8 IP: [] unbound_pwq_by_node+0xa1/0xfa PGD 2421067 PUD 0 Oops: 0000 [#1] SMP CPU: 1 PID: 2689 Comm: cat Tainted: GF 3.9.0-fsdevel+ #4 task: ffff88003d801040 ti: ffff880025806000 task.ti: ffff880025806000 RIP: 0010:[] [] unbound_pwq_by_node+0xa1/0xfa RSP: 0018:ffff880025807ad8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff8800388a2400 RCX: 0000000000000003 RDX: ffff880025807fd8 RSI: ffffffff81a31420 RDI: ffff88003d8016e0 RBP: ffff880025807ae8 R08: ffff88003d801730 R09: ffffffffa00b4898 R10: ffffffff81044217 R11: ffff88003d801040 R12: 0000000064206e97 R13: ffff880036059d98 R14: ffff880038cc8080 R15: ffff880038cc82d0 FS: 00007f21afd9c740(0000) GS:ffff88003d100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff8803598d98b8 CR3: 000000003df49000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff8800388a2400 0000000000000002 ffff880025807b18 ffffffff810442ce ffffffff81044217 ffff880000000002 ffff8800371b4080 ffff88003d112ec0 ffff880025807b38 ffffffffa00810b0 ffff880036059d88 ffff880036059be8 Call Trace: [] workqueue_congested+0xb7/0x12c [] fscache_enqueue_object+0xb2/0xe8 [fscache] [] __fscache_acquire_cookie+0x3b9/0x56c [fscache] [] nfs_fscache_set_inode_cookie+0xee/0x132 [nfs] [] do_open+0x9/0xd [nfs] [] do_dentry_open+0x175/0x24b [] finish_open+0x41/0x51 Fix it by using smp_processor_id() if @cpu is WORK_CPU_UNBOUND. Signed-off-by: Tejun Heo Reported-by: David Howells Tested-and-Acked-by: David Howells --- kernel/workqueue.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4aa9f5bc6b2d..1ae602809efb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4311,6 +4311,12 @@ bool current_is_workqueue_rescuer(void) * no synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * + * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. + * Note that both per-cpu and unbound workqueues may be associated with + * multiple pool_workqueues which have separate congested states. A + * workqueue being congested on one CPU doesn't mean the workqueue is also + * contested on other CPUs / NUMA nodes. + * * RETURNS: * %true if congested, %false otherwise. */ @@ -4321,6 +4327,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) rcu_read_lock_sched(); + if (cpu == WORK_CPU_UNBOUND) + cpu = smp_processor_id(); + if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else -- cgit v1.2.3 From 4b0c0f294f60abcdd20994a8341a95c8ac5eeb96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 May 2013 15:02:50 +0200 Subject: tick: Cleanup NOHZ per cpu data on cpu down Prarit reported a crash on CPU offline/online. The reason is that on CPU down the NOHZ related per cpu data of the dead cpu is not cleaned up. If at cpu online an interrupt happens before the per cpu tick device is registered the irq_enter() check potentially sees stale data and dereferences a NULL pointer. Cleanup the data after the cpu is dead. 
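A rough sketch of the idea (the actual fix is the one-line memset in the diff below): once the CPU is dead, its per-cpu tick_sched state is wiped so that an early interrupt after a later online sees a clean, inactive structure instead of stale pointers.

void tick_cancel_sched_timer(int cpu)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);

	/* stop the (possibly still armed) sched timer of the dead CPU */
	hrtimer_cancel(&ts->sched_timer);

	/* clear everything, not just nohz_mode, so irq_enter() on the
	 * freshly onlined CPU cannot trip over leftover state */
	memset(ts, 0, sizeof(*ts));
}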
Reported-by: Prarit Bhargava Cc: stable@vger.kernel.org Cc: Mike Galbraith Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305031451561.2886@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 225f8bf19095..0eed1db2d792 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -904,7 +904,7 @@ void tick_cancel_sched_timer(int cpu) hrtimer_cancel(&ts->sched_timer); # endif - ts->nohz_mode = NOHZ_MODE_INACTIVE; + memset(ts, 0, sizeof(*ts)); } #endif -- cgit v1.2.3 From f7ea0fd639c2c48d3c61b6eec75362be290c6874 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 May 2013 21:40:27 +0200 Subject: tick: Don't invoke tick_nohz_stop_sched_tick() if the cpu is offline commit 5b39939a4 (nohz: Move ts->idle_calls incrementation into strict idle logic) moved code out of tick_nohz_stop_sched_tick() and missed to bail out when the cpu is offline. That's causing subsequent failures as an offline CPU is supposed to die and not to fiddle with nohz magic. Return false in can_stop_idle_tick() if the cpu is offline. Reported-and-tested-by: Jiri Kosina Reported-and-tested-by: Prarit Bhargava Cc: Frederic Weisbecker Cc: Borislav Petkov Cc: Tony Luck Cc: x86@kernel.org Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305132138160.2863@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0eed1db2d792..012142187db9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -469,6 +469,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; + return false; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) -- cgit v1.2.3 From b47430d3adbedbfdb5979ba4874f5dadf94f16b1 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 14 May 2013 04:01:27 +0530 Subject: rcu/idle: Wrap cpu-idle poll mode within rcu_idle_enter/exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bjørn Mork reported the following warning when running powertop. [ 49.289034] ------------[ cut here ]------------ [ 49.289055] WARNING: at kernel/rcutree.c:502 rcu_eqs_exit_common.isra.48+0x3d/0x125() [ 49.289244] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.10.0-bisect-rcu-warn+ #107 [ 49.289251] ffffffff8157d8c8 ffffffff81801e28 ffffffff8137e4e3 ffffffff81801e68 [ 49.289260] ffffffff8103094f ffffffff81801e68 0000000000000000 ffff88023afcd9b0 [ 49.289268] 0000000000000000 0140000000000000 ffff88023bee7700 ffffffff81801e78 [ 49.289276] Call Trace: [ 49.289285] [] dump_stack+0x19/0x1b [ 49.289293] [] warn_slowpath_common+0x62/0x7b [ 49.289300] [] warn_slowpath_null+0x15/0x17 [ 49.289306] [] rcu_eqs_exit_common.isra.48+0x3d/0x125 [ 49.289314] [] ? trace_hardirqs_off_caller+0x37/0xa6 [ 49.289320] [] rcu_idle_exit+0x85/0xa8 [ 49.289327] [] trace_cpu_idle_rcuidle+0xae/0xff [ 49.289334] [] cpu_startup_entry+0x72/0x115 [ 49.289341] [] rest_init+0x149/0x150 [ 49.289347] [] ? csum_partial_copy_generic+0x16c/0x16c [ 49.289355] [] start_kernel+0x3f0/0x3fd [ 49.289362] [] ? 
repair_env_string+0x5a/0x5a [ 49.289368] [] x86_64_start_reservations+0x2a/0x2c [ 49.289375] [] x86_64_start_kernel+0xcd/0xd1 [ 49.289379] ---[ end trace 07a1cc95e29e9036 ]--- The warning is that 'rdtp->dynticks' has an unexpected value, which roughly translates to - the calls to rcu_idle_enter() and rcu_idle_exit() were not made in the correct order, or otherwise messed up. And Bjørn's painstaking debugging indicated that this happens when the idle loop enters the poll mode. Looking at the poll function cpu_idle_poll(), and the implementation of trace_cpu_idle_rcuidle(), the problem becomes very clear: cpu_idle_poll() lacks calls to rcu_idle_enter/exit(), and trace_cpu_idle_rcuidle() calls them in the reverse order - first rcu_idle_exit(), and then rcu_idle_enter(). Hence the even/odd alternative sequencing of rdtp->dynticks goes for a toss. And powertop readily triggers this because powertop uses the idle-tracing infrastructure extensively. So, to fix this, wrap the code in cpu_idle_poll() within rcu_idle_enter/exit(), so that it blends properly with the calls inside trace_cpu_idle_rcuidle() and thus get the function ordering right. Reported-and-tested-by: Bjørn Mork Cc: Paul McKenney Cc: Steven Rostedt Cc: Dipankar Sarma Signed-off-by: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/519169BF.4080208@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- kernel/cpu/idle.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 8b86c0c68edf..d5585f5e038e 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -40,11 +40,13 @@ __setup("hlt", cpu_idle_nopoll_setup); static inline int cpu_idle_poll(void) { + rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); return 1; } -- cgit v1.2.3 From 42a5cf46cd56f46267d2a9fcf2655f4078cd3042 Mon Sep 17 00:00:00 2001 From: Tirupathi Reddy Date: Tue, 14 May 2013 13:59:02 +0530 Subject: timer: Don't reinitialize the cpu base lock during CPU_UP_PREPARE An inactive timer's base can refer to a offline cpu's base. In the current code, cpu_base's lock is blindly reinitialized each time a CPU is brought up. If a CPU is brought online during the period that another thread is trying to modify an inactive timer on that CPU with holding its timer base lock, then the lock will be reinitialized under its feet. This leads to following SPIN_BUG(). 
<0> BUG: spinlock already unlocked on CPU#3, kworker/u:3/1466 <0> lock: 0xe3ebe000, .magic: dead4ead, .owner: kworker/u:3/1466, .owner_cpu: 1 <4> [] (unwind_backtrace+0x0/0x11c) from [] (do_raw_spin_unlock+0x40/0xcc) <4> [] (do_raw_spin_unlock+0x40/0xcc) from [] (_raw_spin_unlock+0x8/0x30) <4> [] (_raw_spin_unlock+0x8/0x30) from [] (mod_timer+0x294/0x310) <4> [] (mod_timer+0x294/0x310) from [] (queue_delayed_work_on+0x104/0x120) <4> [] (queue_delayed_work_on+0x104/0x120) from [] (sdhci_msm_bus_voting+0x88/0x9c) <4> [] (sdhci_msm_bus_voting+0x88/0x9c) from [] (sdhci_disable+0x40/0x48) <4> [] (sdhci_disable+0x40/0x48) from [] (mmc_release_host+0x4c/0xb0) <4> [] (mmc_release_host+0x4c/0xb0) from [] (mmc_sd_detect+0x90/0xfc) <4> [] (mmc_sd_detect+0x90/0xfc) from [] (mmc_rescan+0x7c/0x2c4) <4> [] (mmc_rescan+0x7c/0x2c4) from [] (process_one_work+0x27c/0x484) <4> [] (process_one_work+0x27c/0x484) from [] (worker_thread+0x210/0x3b0) <4> [] (worker_thread+0x210/0x3b0) from [] (kthread+0x80/0x8c) <4> [] (kthread+0x80/0x8c) from [] (kernel_thread_exit+0x0/0x8) As an example, this particular crash occurred when CPU #3 is executing mod_timer() on an inactive timer whose base is refered to offlined CPU #2. The code locked the timer_base corresponding to CPU #2. Before it could proceed, CPU #2 came online and reinitialized the spinlock corresponding to its base. Thus now CPU #3 held a lock which was reinitialized. When CPU #3 finally ended up unlocking the old cpu_base corresponding to CPU #2, we hit the above SPIN_BUG(). CPU #0 CPU #3 CPU #2 ------ ------- ------- ..... ...... mod_timer() lock_timer_base spin_lock_irqsave(&base->lock) cpu_up(2) ..... ...... init_timers_cpu() .... ..... spin_lock_init(&base->lock) ..... spin_unlock_irqrestore(&base->lock) ...... Allocation of per_cpu timer vector bases is done only once under "tvec_base_done[]" check. In the current code, spinlock_initialization of base->lock isn't under this check. When a CPU is up each time the base lock is reinitialized. Move base spinlock initialization under the check. Signed-off-by: Tirupathi Reddy Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1368520142-4136-1-git-send-email-tirupath@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 09bca8ce9771..7376589adc28 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1539,12 +1539,12 @@ static int __cpuinit init_timers_cpu(int cpu) boot_done = 1; base = &boot_tvec_bases; } + spin_lock_init(&base->lock); tvec_base_done[cpu] = 1; } else { base = per_cpu(tvec_bases, cpu); } - spin_lock_init(&base->lock); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); -- cgit v1.2.3 From 6faf72834d9d0c0dc6632604eaeffb621e87fcf9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 May 2013 06:53:37 -0700 Subject: rcu: Fix comparison sense in rcu_needs_cpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c0f4dfd4f (rcu: Make RCU_FAST_NO_HZ take advantage of numbered callbacks) introduced a bug that can result in excessively long grace periods. This bug reverse the senes of the "if" statement checking for lazy callbacks, so that RCU takes a lazy approach when there are in fact non-lazy callbacks. This can result in excessive boot, suspend, and resume times. This commit therefore fixes the sense of this "if" statement. 
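Spelled out as a sketch of the intended logic (the lazy-branch expression is reconstructed from that era of rcutree_plugin.h and may not be verbatim): the short, rounded delay is used whenever any non-lazy callback is pending, and the long lazy delay only when rdtp->all_lazy is set; the broken check had the two swapped.

/* Request timer delay depending on laziness, and round. */
if (!rdtp->all_lazy) {
	/* at least one non-lazy callback: wake up soon */
	*dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
} else {
	/* everything pending is lazy: the much longer deferral is fine */
	*dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
}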
Reported-by: Borislav Petkov Reported-by: Bjørn Mork Reported-by: Joerg Roedel Signed-off-by: Paul E. McKenney Tested-by: Bjørn Mork Tested-by: Joerg Roedel --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 170814dc418f..6d939a645da1 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1667,7 +1667,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) rdtp->last_accelerate = jiffies; /* Request timer delay depending on laziness, and round. */ - if (rdtp->all_lazy) { + if (!rdtp->all_lazy) { *dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { -- cgit v1.2.3 From 8f174b1175a10903ade40f36eb6c896412877ca0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 1 May 2013 00:07:00 +0900 Subject: workqueue: correct handling of the pool spin_lock When we fail to mutex_trylock(), we release the pool spin_lock and do mutex_lock(). After that, we should regrab the pool spin_lock, but, regrabbing is missed in current code. So correct it. Cc: Lai Jiangshan Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1ae602809efb..286847b90225 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2059,6 +2059,7 @@ static bool manage_workers(struct worker *worker) if (unlikely(!mutex_trylock(&pool->manager_mutex))) { spin_unlock_irq(&pool->lock); mutex_lock(&pool->manager_mutex); + spin_lock_irq(&pool->lock); ret = true; } -- cgit v1.2.3 From ad7b1f841f8a54c6d61ff181451f55b68175e15a Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Mon, 6 May 2013 17:44:55 -0400 Subject: workqueue: Make schedule_work() available again to non GPL modules Commit 8425e3d5bdbe ("workqueue: inline trivial wrappers") changed schedule_work() and schedule_delayed_work() to inline wrappers, but these rely on some symbols that are EXPORT_SYMBOL_GPL, while the original functions were EXPORT_SYMBOL. This has the effect of changing the licensing requirement for these functions and making them unavailable to non GPL modules. Make them available again by removing the restriction on the required symbols. 
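The mechanism behind the licensing change, roughly (the wrapper below is how schedule_work() looks as a static inline in workqueue.h of that era): once the wrapper is inlined, every caller references system_wq and the queue_work_on() path directly from its own module, so those underlying symbols, not the wrapper, decide whether a non-GPL module can link.

/* static inline wrapper: the symbol references move into the caller */
static inline bool schedule_work(struct work_struct *work)
{
	return queue_work(system_wq, work);	/* needs system_wq and queue_work_on() exported */
}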
Signed-off-by: Marc Dionne Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 286847b90225..02916f421385 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -296,7 +296,7 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; struct workqueue_struct *system_wq __read_mostly; -EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; EXPORT_SYMBOL_GPL(system_highpri_wq); struct workqueue_struct *system_long_wq __read_mostly; @@ -1411,7 +1411,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(queue_work_on); +EXPORT_SYMBOL(queue_work_on); void delayed_work_timer_fn(unsigned long __data) { @@ -1485,7 +1485,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(queue_delayed_work_on); +EXPORT_SYMBOL(queue_delayed_work_on); /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU -- cgit v1.2.3 From b4f711ee03d28f776fd2324fd0bd999cc428e4d2 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 24 Apr 2013 11:32:56 -0700 Subject: time: Revert ALWAYS_USE_PERSISTENT_CLOCK compile time optimizaitons Kay Sievers noted that the ALWAYS_USE_PERSISTENT_CLOCK config, which enables some minor compile time optimization to avoid uncessary code in mostly the suspend/resume path could cause problems for userland. In particular, the dependency for RTC_HCTOSYS on !ALWAYS_USE_PERSISTENT_CLOCK, which avoids setting the time twice and simplifies suspend/resume, has the side effect of causing the /sys/class/rtc/rtcN/hctosys flag to always be zero, and this flag is commonly used by udev to setup the /dev/rtc symlink to /dev/rtcN, which can cause pain for older applications. While the udev rules could use some work to be less fragile, breaking userland should strongly be avoided. Additionally the compile time optimizations are fairly minor, and the code being optimized is likely to be reworked in the future, so lets revert this change. Reported-by: Kay Sievers Signed-off-by: John Stultz Cc: stable #3.9 Cc: Feng Tang Cc: Jason Gunthorpe Link: http://lkml.kernel.org/r/1366828376-18124-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/Kconfig | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd7..b69692250af4 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,11 +12,6 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool -# Platforms has a persistent clock -config ALWAYS_USE_PERSISTENT_CLOCK - bool - default n - # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool -- cgit v1.2.3 From 615ee5443ff9bedd356dc6865f3e9c276ce434ea Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 26 Mar 2013 11:35:16 -0400 Subject: rcu: Don't allocate bootmem from rcu_init() When rcu_init() is called we already have slab working, allocating bootmem at that point results in warnings and an allocation from slab. This commit therefore changes alloc_bootmem_cpumask_var() to alloc_cpumask_var() in rcu_bootup_announce_oddness(), which is called from rcu_init(). Signed-off-by: Sasha Levin Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett Tested-by: Robin Holt [paulmck: convert to zalloc_cpumask_var(), as suggested by Yinghai Lu.] --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6d939a645da1..3db5a375d8dd 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -88,7 +88,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_NOCB_CPU #ifndef CONFIG_RCU_NOCB_CPU_NONE if (!have_rcu_nocb_mask) { - alloc_bootmem_cpumask_var(&rcu_nocb_mask); + zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); have_rcu_nocb_mask = true; } #ifdef CONFIG_RCU_NOCB_CPU_ZERO -- cgit v1.2.3 From 60705c89460fdc7227f2d153b68b3f34814738a4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 May 2013 15:40:48 -0400 Subject: tracing: Fix leaks of filter preds Special preds are created when folding a series of preds that can be done in serial. These are allocated in an ops field of the pred structure. But they were never freed, causing memory leaks. This was discovered using the kmemleak checker: unreferenced object 0xffff8800797fd5e0 (size 32): comm "swapper/0", pid 1, jiffies 4294690605 (age 104.608s) hex dump (first 32 bytes): 00 00 01 00 03 00 05 00 07 00 09 00 0b 00 0d 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x73/0x98 [] kmemleak_alloc_recursive.constprop.42+0x16/0x18 [] __kmalloc+0xd7/0x125 [] kcalloc.constprop.24+0x2d/0x2f [] fold_pred_tree_cb+0xa9/0xf4 [] walk_pred_tree+0x47/0xcc [] replace_preds.isra.20+0x6f8/0x72f [] create_filter+0x4e/0x8b [] ftrace_test_event_filter+0x5a/0x155 [] do_one_initcall+0xa0/0x137 [] kernel_init_freeable+0x14d/0x1dc [] kernel_init+0xe/0xdb [] ret_from_fork+0x7c/0xb0 [] 0xffffffffffffffff Cc: Tom Zanussi Cc: stable@vger.kernel.org # 2.6.39+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index a6361178de5a..e1b653f7e1ca 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -750,7 +750,11 @@ static int filter_set_pred(struct event_filter *filter, static void __free_preds(struct event_filter *filter) { + int i; + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + kfree(filter->preds[i].ops); kfree(filter->preds); filter->preds = NULL; } -- cgit v1.2.3 From c02c7e65d9b13670e34bc523744cf4f6e99c198a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:34 +0900 Subject: tracing/kprobes: Use rcu_dereference_raw for tp->files Use rcu_dereference_raw() for accessing tp->files. Because the write-side uses rcu_assign_pointer() for memory barrier, the read-side also has to use rcu_dereference_raw() with read memory barrier. 
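The pairing being described, in sketch form (tp->files is the trace_probe file array; the surrounding locking is elided): the updater publishes the new array with rcu_assign_pointer(), and every reader, whether serialized by probe_enable_lock or by the disabled preemption around the kprobe handler, loads it through rcu_dereference_raw() so the matching read/dependency barrier is issued.

/* update side, serialized by probe_enable_lock */
rcu_assign_pointer(tp->files, new);	/* write barrier before publication */

/* read side: no rcu_read_lock() needed here, but the load still has to
 * go through the RCU accessor to pair with the assignment above */
struct ftrace_event_file **file = rcu_dereference_raw(tp->files);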
Link: http://lkml.kernel.org/r/20130513115834.6545.17022.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 47 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 636d45fe69b3..0a3d8d5c483d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -185,9 +185,14 @@ static struct trace_probe *find_trace_probe(const char *event, static int trace_probe_nr_files(struct trace_probe *tp) { - struct ftrace_event_file **file = tp->files; + struct ftrace_event_file **file; int ret = 0; + /* + * Since all tp->files updater is protected by probe_enable_lock, + * we don't need to lock an rcu_read_lock. + */ + file = rcu_dereference_raw(tp->files); if (file) while (*(file++)) ret++; @@ -209,9 +214,10 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) mutex_lock(&probe_enable_lock); if (file) { - struct ftrace_event_file **new, **old = tp->files; + struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); + old = rcu_dereference_raw(tp->files); /* 1 is for new one and 1 is for stopper */ new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), GFP_KERNEL); @@ -251,11 +257,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) static int trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) { + struct ftrace_event_file **files; int i; - if (tp->files) { - for (i = 0; tp->files[i]; i++) - if (tp->files[i] == file) + /* + * Since all tp->files updater is protected by probe_enable_lock, + * we don't need to lock an rcu_read_lock. + */ + files = rcu_dereference_raw(tp->files); + if (files) { + for (i = 0; files[i]; i++) + if (files[i] == file) return i; } @@ -274,10 +286,11 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) mutex_lock(&probe_enable_lock); if (file) { - struct ftrace_event_file **new, **old = tp->files; + struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); int i, j; + old = rcu_dereference_raw(tp->files); if (n == 0 || trace_probe_file_index(tp, file) < 0) { ret = -EINVAL; goto out_unlock; @@ -872,9 +885,16 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, static __kprobes void kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) { - struct ftrace_event_file **file = tp->files; + /* + * Note: preempt is already disabled around the kprobe handler. + * However, we still need an smp_read_barrier_depends() corresponding + * to smp_wmb() in rcu_assign_pointer() to access the pointer. + */ + struct ftrace_event_file **file = rcu_dereference_raw(tp->files); + + if (unlikely(!file)) + return; - /* Note: preempt is already disabled around the kprobe handler */ while (*file) { __kprobe_trace_func(tp, regs, *file); file++; @@ -925,9 +945,16 @@ static __kprobes void kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct ftrace_event_file **file = tp->files; + /* + * Note: preempt is already disabled around the kprobe handler. + * However, we still need an smp_read_barrier_depends() corresponding + * to smp_wmb() in rcu_assign_pointer() to access the pointer. 
+ */ + struct ftrace_event_file **file = rcu_dereference_raw(tp->files); + + if (unlikely(!file)) + return; - /* Note: preempt is already disabled around the kprobe handler */ while (*file) { __kretprobe_trace_func(tp, ri, regs, *file); file++; -- cgit v1.2.3 From 3d1fc7b0880c4db612a3d3211a808659e28af588 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:37 +0900 Subject: tracing/kprobes: Fix a sparse warning for incorrect type in assignment Fix a sparse warning about the rcu operated pointer is defined without __rcu address space. Link: http://lkml.kernel.org/r/20130513115837.6545.23322.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0a3d8d5c483d..81c5109b9d00 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,7 +35,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_class class; struct ftrace_event_call call; - struct ftrace_event_file **files; + struct ftrace_event_file * __rcu *files; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; -- cgit v1.2.3 From b62fdd97fcae17e483b005bafd13fadbd9840672 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:39 +0900 Subject: tracing/kprobes: Make print_*probe_event static According to sparse warning, print_*probe_event static because those functions are not directly called from outside. Link: http://lkml.kernel.org/r/20130513115839.6545.83067.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 81c5109b9d00..9f46e98ba8f2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -962,7 +962,7 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, } /* Event entry printers */ -enum print_line_t +static enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -998,7 +998,7 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -enum print_line_t +static enum print_line_t print_kretprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { -- cgit v1.2.3 From 1be0c25da56e860992af972a60321563ca2cfcd1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 May 2013 14:24:24 -0700 Subject: workqueue: don't perform NUMA-aware allocations on offline nodes in wq_numa_init() wq_numa_init() builds per-node cpumasks which are later used to make unbound workqueues NUMA-aware. The cpumasks are allocated using alloc_cpumask_var_node() for all possible nodes. Unfortunately, on machines with off-line nodes, this leads to NUMA-aware allocations on existing bug offline nodes, which in turn triggers BUG in the memory allocation code. Fix it by using NUMA_NO_NODE for cpumask allocations for offline nodes. kernel BUG at include/linux/gfp.h:323! 
invalid opcode: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0+ #1 Hardware name: ProLiant BL465c G7, BIOS A19 12/10/2011 task: ffff880234608000 ti: ffff880234602000 task.ti: ffff880234602000 RIP: 0010:[] [] new_slab+0x2ad/0x340 RSP: 0000:ffff880234603bf8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff880237404b40 RCX: 00000000000000d0 RDX: 0000000000000001 RSI: 0000000000000003 RDI: 00000000002052d0 RBP: ffff880234603c28 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000001 R11: ffffffff812e3aa8 R12: 0000000000000001 R13: ffff8802378161c0 R14: 0000000000030027 R15: 00000000000040d0 FS: 0000000000000000(0000) GS:ffff880237800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff88043fdff000 CR3: 00000000018d5000 CR4: 00000000000007f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff880234603c28 0000000000000001 00000000000000d0 ffff8802378161c0 ffff880237404b40 ffff880237404b40 ffff880234603d28 ffffffff815edba1 ffff880237816140 0000000000000000 ffff88023740e1c0 Call Trace: [] __slab_alloc+0x330/0x4f2 [] kmem_cache_alloc_node_trace+0xa5/0x200 [] alloc_cpumask_var_node+0x28/0x90 [] wq_numa_init+0x10d/0x1be [] init_workqueues+0x64/0x341 [] do_one_initcall+0xea/0x1a0 [] kernel_init_freeable+0xb7/0x1ec [] kernel_init+0xe/0xf0 [] ret_from_fork+0x7c/0xb0 Code: 45 84 ac 00 00 00 f0 41 80 4d 00 40 e9 f6 fe ff ff 66 0f 1f 84 00 00 00 00 00 e8 eb 4b ff ff 49 89 c5 e9 05 fe ff ff <0f> 0b 4c 8b 73 38 44 89 ff 81 cf 00 00 20 00 4c 89 f6 48 c1 ee Signed-off-by: Tejun Heo Reported-and-Tested-by: Lingzhu Xiang --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02916f421385..ee8e29a2320c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4905,7 +4905,8 @@ static void __init wq_numa_init(void) BUG_ON(!tbl); for_each_node(node) - BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node)); + BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, + node_online(node) ? node : NUMA_NO_NODE)); for_each_possible_cpu(cpu) { node = cpu_to_node(cpu); -- cgit v1.2.3 From 6ed0106667d76589cb648c27edb4f4ffbf9d59ca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 16 May 2013 20:48:49 +0900 Subject: tracing: Return -EBUSY when event_enable_func() fails to get module Since try_module_get() returns false( = 0) when it fails to pindown a module, event_enable_func() returns 0 which means "succeed". This can cause a kernel panic when the entry is removed, because the event is already released. This fixes the bug by returning -EBUSY, because the reason why it fails is that the module is being removed at that time. 
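This is the classic bool-versus-errno pitfall; in sketch form (condensing the fix shown in the diff below): try_module_get() reports failure as 0, so its result has to be translated into a negative error code rather than stored straight into the function's return value, where 0 would read as success.

/* Don't let event modules unload while the probe is registered */
if (!try_module_get(file->event_call->mod)) {
	ret = -EBUSY;		/* module is already being removed */
	goto out_free;
}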
Link: http://lkml.kernel.org/r/20130516114848.13508.97899.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7a0cf68027cc..27963e2bf4bf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2072,8 +2072,10 @@ event_enable_func(struct ftrace_hash *hash, out_reg: /* Don't let event modules unload while probe registered */ ret = try_module_get(file->event_call->mod); - if (!ret) + if (!ret) { + ret = -EBUSY; goto out_free; + } ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) -- cgit v1.2.3 From 264b83c07a84223f0efd0d1db9ccc66d6f88288f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 16 May 2013 17:43:55 +0200 Subject: usermodehelper: check subprocess_info->path != NULL argv_split(empty_or_all_spaces) happily succeeds, it simply returns argc == 0 and argv[0] == NULL. Change call_usermodehelper_exec() to check sub_info->path != NULL to avoid the crash. This is the minimal fix, todo: - perhaps we should change argv_split() to return NULL or change the callers. - kill or justify ->path[0] check - narrow the scope of helper_lock() Signed-off-by: Oleg Nesterov Acked-By: Lucas De Marchi Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 1296e72e4161..8241906c4b61 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -569,6 +569,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) int retval = 0; helper_lock(); + if (!sub_info->path) { + retval = -EINVAL; + goto out; + } + if (sub_info->path[0] == '\0') goto out; -- cgit v1.2.3 From 06c9494c0e9bdfcaa14d6d2b096a0ff7abe8494f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 May 2013 20:33:01 +0100 Subject: kmemleak: Scan all allocated, writeable and not executable module sections Instead of just picking data sections by name (names that start with .data, .bss or .ref.data), use the section flags and scan all sections that are allocated, writable and not executable. Which should cover all sections of a module that might reference data. 
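What that flag combination selects, as a rough sketch (section names are examples): only allocated, writable, non-executable sections can be modified at run time, so only they can come to hold pointers to objects allocated after the module was loaded.

/* scanned:  .data, .bss, .ref.data, ...  SHF_ALLOC|SHF_WRITE, no SHF_EXECINSTR
 * skipped:  .rodata                      not writable at run time
 * skipped:  .text, .init.text            executable, holds code not pointers */
if (!(shdr->sh_flags & SHF_ALLOC) ||
    !(shdr->sh_flags & SHF_WRITE) ||
     (shdr->sh_flags & SHF_EXECINSTR))
	continue;	/* not a kmemleak scan candidate */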
Signed-off-by: Steven Rostedt [catalin.marinas@arm.com: removed unused 'name' variable] [catalin.marinas@arm.com: collapsed 'if' blocks] Signed-off-by: Catalin Marinas Acked-by: Rusty Russell --- kernel/module.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b049939177f6..06f496a5d182 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2431,10 +2431,10 @@ static void kmemleak_load_module(const struct module *mod, kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); for (i = 1; i < info->hdr->e_shnum; i++) { - const char *name = info->secstrings + info->sechdrs[i].sh_name; - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - if (!strstarts(name, ".data") && !strstarts(name, ".bss")) + /* Scan all writable sections that's not executable */ + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || + !(info->sechdrs[i].sh_flags & SHF_WRITE) || + (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) continue; kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, -- cgit v1.2.3 From 89c837351db0b9b52fd572ec8b0445a42e59b75c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 May 2013 20:46:23 +0100 Subject: kmemleak: No need for scanning specific module sections As kmemleak now scans all module sections that are allocated, writable and non executable, there's no need to scan individual sections that might reference data. Signed-off-by: Steven Rostedt Signed-off-by: Catalin Marinas Acked-by: Rusty Russell --- kernel/module.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 06f496a5d182..cab4bce49c23 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2769,24 +2769,11 @@ static void find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * - mod->num_trace_events, GFP_KERNEL); #endif #ifdef CONFIG_TRACING mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", sizeof(*mod->trace_bprintk_fmt_start), &mod->num_trace_bprintk_fmt); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. 
- */ - kmemleak_scan_area(mod->trace_bprintk_fmt_start, - sizeof(*mod->trace_bprintk_fmt_start) * - mod->num_trace_bprintk_fmt, GFP_KERNEL); #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ -- cgit v1.2.3 From 387b8b3e37cb1c257fb607787f73815c30d22859 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 24 May 2013 15:55:25 -0700 Subject: auditfilter.c: fix kernel-doc warnings Fix kernel-doc warnings in kernel/auditfilter.c: Warning(kernel/auditfilter.c:1029): Excess function parameter 'loginuid' description in 'audit_receive_filter' Warning(kernel/auditfilter.c:1029): Excess function parameter 'sessionid' description in 'audit_receive_filter' Warning(kernel/auditfilter.c:1029): Excess function parameter 'sid' description in 'audit_receive_filter' Signed-off-by: Randy Dunlap Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 83a2970295d1..6bd4a90d1991 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1021,9 +1021,6 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data - * @loginuid: loginuid of sender - * @sessionid: sessionid for netlink audit message - * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) { -- cgit v1.2.3