From f01114cb59d670e9b4f2c335930dd57db96e9360 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 May 2011 12:26:55 +0200 Subject: sched: Fix cross-cpu clock sync on remote wakeups Markus reported that commit 317f394160e ("sched: Move the second half of ttwu() to the remote cpu") caused some accounting funnies on his AMD Phenom II X4, such as weird 'top' results. It turns out that this is due to non-synced TSC and the queued remote wakeups stopped coupeling the two relevant cpu clocks, which leads to wakeups seeing time jumps, which in turn lead to skewed runtime stats. Add an explicit call to sched_clock_cpu() to couple the per-cpu clocks to restore the normal flow of time. Reported-and-tested-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306835745.2353.3.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cbb3a0eee58e..49cc70b152cf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2600,6 +2600,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; } -- cgit v1.2.3 From f339b9dc1f03591761d5d930800db24bc0eda1e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 May 2011 10:49:20 +0200 Subject: sched: Fix schedstat.nr_wakeups_migrate While looking over the code I found that with the ttwu rework the nr_wakeups_migrate test broke since we now switch cpus prior to calling ttwu_stat(), hence the test is always true. Cure this by passing the migration state in wake_flags. Also move the whole test under CONFIG_SMP, its hard to migrate tasks on UP :-) Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-pwwxl7gdqs5676f1d4cx6pj7@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 49cc70b152cf..2fe98ed474da 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2447,6 +2447,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } rcu_read_unlock(); } + + if (wake_flags & WF_MIGRATED) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + #endif /* CONFIG_SMP */ schedstat_inc(rq, ttwu_count); @@ -2455,9 +2459,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_SYNC) schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (cpu != task_cpu(p)) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - #endif /* CONFIG_SCHEDSTATS */ } @@ -2675,8 +2676,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (task_cpu(p) != cpu) + if (task_cpu(p) != cpu) { + wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); + } #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); -- cgit v1.2.3 From 74c355fbdfedd3820046dba4f537876cea54c207 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 May 2011 16:48:06 +0200 Subject: perf, cgroups: Fix up for new API Ben changed the cgroup API in commit f780bdb7c1c (cgroups: add per-thread subsystem callbacks) in an incompatible way, but forgot to convert the perf cgroup bits. Avoid compile warnings and runtime splats and convert perf too ;-) Acked-by: Ben Blum Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306767651.1200.2990.camel@twins Signed-off-by: Ingo Molnar --- kernel/events/core.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c09767f7db3e..8a15944bf9d2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7394,26 +7394,12 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_move(struct task_struct *task) +static void +perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) { task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task, - bool threadgroup) -{ - perf_cgroup_move(task); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - perf_cgroup_move(c); - } - rcu_read_unlock(); - } -} - static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task) { @@ -7425,7 +7411,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, if (!(task->flags & PF_EXITING)) return; - perf_cgroup_move(task); + perf_cgroup_attach_task(cgrp, task); } struct cgroup_subsys perf_subsys = { @@ -7434,6 +7420,6 @@ struct cgroup_subsys perf_subsys = { .create = perf_cgroup_create, .destroy = perf_cgroup_destroy, .exit = perf_cgroup_exit, - .attach = perf_cgroup_attach, + .attach_task = perf_cgroup_attach_task, }; #endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From 1b054b67d3bfc6dca9f634c104780f3f24ff3eec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jun 2011 11:13:33 +0200 Subject: clockevents: Handle empty cpumask gracefully For UP it's stupid to request an initialized cpumask for the clock event devices. Though we need the mask set even on UP to avoid a horrible ifdeffery especially in the broadcast code. For SMP we can at least try to survive with a warning and set the cpumask of the cpu we're running on. That gives a decent chance to bring the machine up and retrieve the debug info. Signed-off-by: Thomas Gleixner Cc: Linus Walleij Cc: Russell King - ARM Linux Cc: Stephen Boyd --- kernel/time/clockevents.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c027d4f602f1..e4c699dfa4e8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev) unsigned long flags; BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - BUG_ON(!dev->cpumask); + if (!dev->cpumask) { + WARN_ON(num_possible_cpus() > 1); + dev->cpumask = cpumask_of(smp_processor_id()); + } raw_spin_lock_irqsave(&clockevents_lock, flags); -- cgit v1.2.3 From ef26f20cd117eb3c185038ed7cbf7b235575751d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 31 May 2011 08:56:10 +0200 Subject: genirq: Print threaded handler in spurious debug output In forced threaded mode (or with an explicit threaded handler) we only see the primary handler, but not the threaded handler. Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/1306824972-27067-1-git-send-email-sebastian@breakpoint.cc Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dfbd550401b2..c9a78ba30b6f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -201,10 +201,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; while (action) { - printk(KERN_ERR "[<%p>]", action->handler); - print_symbol(" (%s)", - (unsigned long)action->handler); - printk("\n"); + printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + if (action->thread_fn) + printk(KERN_CONT " threaded [<%p>] %pf", + action->thread_fn, action->thread_fn); + printk(KERN_CONT "\n"); action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3 From 3a43e05f4d0600e906fa09f4a65d749288c44592 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 31 May 2011 08:56:11 +0200 Subject: irq: Handle spurios irq detection for threaded irqs The detection of spurios interrupts is currently limited to first level handler. In force-threaded mode we never notice if the threaded irq does not feel responsible. This patch catches the return value of the threaded handler and forwards it to the spurious detector. If the primary handler returns only IRQ_WAKE_THREAD then the spourious detector ignores it because it gets called again from the threaded handler. [ tglx: Report the erroneous return value early and bail out ] Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/1306824972-27067-2-git-send-email-sebastian@breakpoint.cc Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 6 ------ kernel/irq/manage.c | 24 ++++++++++++++++++------ kernel/irq/spurious.c | 22 ++++++++++++++++++---- 3 files changed, 36 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 90cb55f6d7eb..470d08c82bbe 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,12 +132,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) switch (res) { case IRQ_WAKE_THREAD: - /* - * Set result to handled so the spurious check - * does not trigger. - */ - res = IRQ_HANDLED; - /* * Catch drivers which return WAKE_THREAD but * did not set up a thread function diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f7ce0021e1c4..d64bafb1afd0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -723,13 +723,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } * context. So we need to disable bh here to avoid deadlocks and other * side effects. */ -static void +static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { + irqreturn_t ret; + local_bh_disable(); - action->thread_fn(action->irq, action->dev_id); + ret = action->thread_fn(action->irq, action->dev_id); irq_finalize_oneshot(desc, action, false); local_bh_enable(); + return ret; } /* @@ -737,10 +740,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) * preemtible - many of them need to sleep and wait for slow busses to * complete. */ -static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) +static irqreturn_t irq_thread_fn(struct irq_desc *desc, + struct irqaction *action) { - action->thread_fn(action->irq, action->dev_id); + irqreturn_t ret; + + ret = action->thread_fn(action->irq, action->dev_id); irq_finalize_oneshot(desc, action, false); + return ret; } /* @@ -753,7 +760,8 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + irqreturn_t (*handler_fn)(struct irq_desc *desc, + struct irqaction *action); int wake; if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, @@ -783,8 +791,12 @@ static int irq_thread(void *data) desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { + irqreturn_t action_ret; + raw_spin_unlock_irq(&desc->lock); - handler_fn(desc, action); + action_ret = handler_fn(desc, action); + if (!noirqdebug) + note_interrupt(action->irq, desc, action_ret); } wake = atomic_dec_and_test(&desc->threads_active); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c9a78ba30b6f..aa57d5da18c1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -167,6 +167,13 @@ out: jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } +static inline int bad_action_ret(irqreturn_t action_ret) +{ + if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) + return 0; + return 1; +} + /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic @@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *action; unsigned long flags; - if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { + if (bad_action_ret(action_ret)) { printk(KERN_ERR "irq event %d: bogus return value %x\n", irq, action_ret); } else { @@ -263,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (desc->istate & IRQS_POLL_INPROGRESS) return; - if (unlikely(action_ret != IRQ_HANDLED)) { + /* we get here again via the threaded handler */ + if (action_ret == IRQ_WAKE_THREAD) + return; + + if (bad_action_ret(action_ret)) { + report_bad_irq(irq, desc, action_ret); + return; + } + + if (unlikely(action_ret == IRQ_NONE)) { /* * If we are seeing only the odd spurious IRQ caused by * bus asynchronicity then don't eventually trigger an error, @@ -275,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, else desc->irqs_unhandled++; desc->last_unhandled = jiffies; - if (unlikely(action_ret != IRQ_NONE)) - report_bad_irq(irq, desc, action_ret); } if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { -- cgit v1.2.3 From e7fbad300a7a6432238f086e3c9a61538a905858 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 31 May 2011 18:14:39 +0200 Subject: genirq: Fix descriptor init on non-sparse IRQs The genirq changes are initializing descriptors for sparse IRQs quite differently from how non-sparse (stacked?) IRQs are initialized, with the effect that on my platform all IRQs are default-disabled on sparse IRQs and default-enabled if non-sparse IRQs are used, crashing some GPIO driver. Fix this by refactoring the non-sparse IRQs to use the same descriptor init function as the sparse IRQs. Signed-off: Linus Walleij Link: http://lkml.kernel.org/r/1306858479-16622-1-git-send-email-linus.walleij@stericsson.com Cc: stable@kernel.org # 2.6.39 Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 886e80347b32..c7ddc87d6c87 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -257,13 +257,11 @@ int __init early_irq_init(void) count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { - desc[i].irq_data.irq = i; - desc[i].irq_data.chip = &no_irq_chip; desc[i].kstat_irqs = alloc_percpu(unsigned int); - irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); - alloc_masks(desc + i, GFP_KERNEL, node); - desc_smp_init(desc + i, node); + alloc_masks(&desc[i], GFP_KERNEL, node); + raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + desc_set_defaults(i, &desc[i], node); } return arch_early_irq_init(); } -- cgit v1.2.3 From c5182b8867e189e14a8df5dbfcba1c73f286e061 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 2 Jun 2011 18:55:13 +0100 Subject: genirq: Ensure we locate the passed IRQ in irq_alloc_descs() When irq_alloc_descs() is called with no base IRQ specified then it will search for a range of IRQs starting from a specified base address. In the case where an IRQ is specified it still does this search in order to ensure that none of the requested range is already allocated and it still uses the from parameter to specify the base for the search. This means that in the case where a base is specified but from is zero (which is reasonable as any IRQ number is in the range specified by a zero from) the function will get confused and try to allocate the first suitably sized block of free IRQs it finds. Instead use a specified IRQ as the base address for the search, and insist that any from that is specified can support that IRQ. Signed-off-by: Mark Brown Link: http://lkml.kernel.org/r/1307037313-15733-1-git-send-email-broonie@opensource.wolfsonmicro.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index c7ddc87d6c87..4c60a50e66b2 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -344,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) if (!cnt) return -EINVAL; + if (irq >= 0) { + if (from > irq) + return -EINVAL; + from = irq; + } + mutex_lock(&sparse_irq_lock); start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, -- cgit v1.2.3 From 1c3cc11602111d1318c2a5743bd2e88c82813927 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 21 May 2011 12:58:28 +0200 Subject: timers: Consider slack value in mod_timer() There is an optimization which does not update the timer if the timer was pending and the expiration time was unchanged. Since commit 3bbb9ec9 ("timers: Introduce the concept of timer slack for legacy timers") this optimization is no longer applied for timers where the expiration time got extended due to the slack value. So we need to check again after the expiration time might have been updated. [ tglx: Made it a single check by applying slack first and sorting out the slack = 0 value (all timeouts < 256 jiffies) early ] Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/20110521105828.GA29442@Chamillionaire.breakpoint.cc Signed-off-by: Thomas Gleixner --- kernel/timer.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index fd6198692b57..8cff36119e4d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) unsigned long expires_limit, mask; int bit; - expires_limit = expires; - if (timer->slack >= 0) { expires_limit = expires + timer->slack; } else { - unsigned long now = jiffies; + long delta = expires - jiffies; + + if (delta < 256) + return expires; - /* No slack, if already expired else auto slack 0.4% */ - if (time_after(expires, now)) - expires_limit = expires + (expires - now)/256; + expires_limit = expires + delta / 256; } mask = expires ^ expires_limit; if (mask == 0) @@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) */ int mod_timer(struct timer_list *timer, unsigned long expires) { + expires = apply_slack(timer, expires); + /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; - expires = apply_slack(timer, expires); - return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); -- cgit v1.2.3 From aa4a221875873d2a1f9656cb7fd7e545e952b4fa Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 3 Jun 2011 17:54:40 -0400 Subject: perf: Comment /proc/sys/kernel/perf_event_paranoid to be part of user ABI Turns out that distro packages use this file as an indicator of the perf event subsystem - this is easier to check for from scripts than the existence of the system call. This is easy enough to keep around for the kernel, so add a comment to make sure it stays so. Signed-off-by: Vince Weaver Cc: David Ahern Cc: Peter Zijlstra Cc: paulus@samba.org Cc: acme@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1106031751170.29381@cl320.eecs.utk.edu Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4fc92445a29c..f175d98bd355 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -938,6 +938,12 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_PERF_EVENTS + /* + * User-space scripts rely on the existence of this file + * as a feature check for perf_events being enabled. + * + * So it's an ABI, do not remove! + */ { .procname = "perf_event_paranoid", .data = &sysctl_perf_event_paranoid, -- cgit v1.2.3 From 0aff1c0cef13b34c17e81a502336fad738151c37 Mon Sep 17 00:00:00 2001 From: GuoWen Li Date: Wed, 1 Jun 2011 19:18:47 +0800 Subject: ftrace: Fix possible undefined return code kernel/trace/ftrace.c: In function 'ftrace_regex_write.clone.15': kernel/trace/ftrace.c:2743:6: warning: 'ret' may be used uninitialized in this function Signed-off-by: GuoWen Li Link: http://lkml.kernel.org/r/201106011918.47939.guowen.li.linux@gmail.com Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1ee417fcbfa5..204b3eb4e690 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2740,7 +2740,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash, { char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret; + int ret = -EINVAL; func = strsep(&next, ":"); -- cgit v1.2.3 From f2513cde93f0957d5dc6c09bc24b0cccd27d8e1d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Jun 2011 12:32:43 +0200 Subject: lockdep: Fix lock_is_held() on recursion The main lock_is_held() user is lockdep_assert_held(), avoid false assertions in lockdep_off() sections by unconditionally reporting the lock is taken. [ the reason this is important is a lockdep_assert_held() in ttwu() which triggers a warning under lockdep_off() as in printk() which can trigger another wakeup and lock up due to spinlock recursion, as reported and heroically debugged by Arne Jansen ] Reported-and-tested-by: Arne Jansen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Link: http://lkml.kernel.org/r/1307398759.2497.966.camel@laptop Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 63437d065ac8..298c9276dfdb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3426,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock) int ret = 0; if (unlikely(current->lockdep_recursion)) - return ret; + return 1; /* avoid false negative lockdep_assert_held() */ raw_local_irq_save(flags); check_flags(flags); -- cgit v1.2.3 From 6c6c54e1807faf116724451ef2bd14993780470a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jun 2011 17:37:07 +0200 Subject: sched: Fix/clarify set_task_cpu() locking rules Sergey reported a CONFIG_PROVE_RCU warning in push_rt_task where set_task_cpu() was called with both relevant rq->locks held, which should be sufficient for running tasks since holding its rq->lock will serialize against sched_move_task(). Update the comments and fix the task_group() lockdep test. Reported-and-tested-by: Sergey Senozhatsky Cc: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1307115427.2353.3456.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2fe98ed474da..3f2e502d609b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -605,10 +605,10 @@ static inline int cpu_of(struct rq *rq) /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification - * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() - * holds that lock for each task it moves into the cgroup. Therefore - * by holding that lock, we pin the task to the current cgroup. + * We use task_subsys_state_check() and extend the RCU verification with + * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each + * task it moves into the cgroup. Therefore by holding either of those locks, + * we pin the task to the current cgroup. */ static inline struct task_group *task_group(struct task_struct *p) { @@ -616,7 +616,8 @@ static inline struct task_group *task_group(struct task_struct *p) struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock)); + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); return autogroup_task_group(p, tg); @@ -2200,6 +2201,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP + /* + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * sched_move_task() holds both and thus holding either pins the cgroup, + * see set_task_rq(). + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). + */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || lockdep_is_held(&task_rq(p)->lock))); #endif -- cgit v1.2.3 From 265a5b7ee3eb21a4d0e53e17d59ba6eada91af39 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Jun 2011 22:35:13 -0400 Subject: kprobes/trace: Fix kprobe selftest for gcc 4.6 With gcc 4.6, the self test kprobe function: kprobe_trace_selftest_target() is optimized such that kallsyms does not list it. The kprobes test uses this function to insert a probe and test it. But it will fail the test if the function is not listed in kallsyms. Adding a __used annotation keeps the symbol in the kallsyms table. Suggested-by: David Daney Cc: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f925c45f0afa..27d13b36b8be 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1870,8 +1870,12 @@ fs_initcall(init_kprobe_trace); #ifdef CONFIG_FTRACE_STARTUP_TEST -static int kprobe_trace_selftest_target(int a1, int a2, int a3, - int a4, int a5, int a6) +/* + * The "__used" keeps gcc from removing the function symbol + * from the kallsyms table. + */ +static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) { return a1 + a2 + a3 + a4 + a5 + a6; } -- cgit v1.2.3 From a4f18ed11a4ddf327dd91cd19e237278600ad327 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 7 Jun 2011 09:26:46 -0400 Subject: ftrace: Revert 8ab2b7efd ftrace: Remove unnecessary disabling of irqs Revert the commit that removed the disabling of interrupts around the initial modifying of mcount callers to nops, and update the comment. The original comment was outdated and stated that the interrupts were being disabled to prevent kstop machine, which was required with the old ftrace daemon, but was no longer the case. What the comment failed to mention was that interrupts needed to be disabled to keep interrupts from preempting the modifying of the code and then executing the code that was partially modified. Revert the commit and update the comment. Reported-by: Richard W.M. Jones Tested-by: Richard W.M. Jones Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 204b3eb4e690..908038f57440 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3330,6 +3330,7 @@ static int ftrace_process_locs(struct module *mod, { unsigned long *p; unsigned long addr; + unsigned long flags; mutex_lock(&ftrace_lock); p = start; @@ -3346,7 +3347,13 @@ static int ftrace_process_locs(struct module *mod, ftrace_record_ip(addr); } + /* + * Disable interrupts to prevent interrupts from executing + * code that is being modified. + */ + local_irq_save(flags); ftrace_update_code(mod); + local_irq_restore(flags); mutex_unlock(&ftrace_lock); return 0; -- cgit v1.2.3 From 13863a66c9c8a663665445cf05d68de96ff31830 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 9 Jun 2011 23:14:58 +0200 Subject: genirq: Prevent potential NULL dereference in irq_set_irq_wake() In kernel/irq/manage.c::irq_set_irq_wake() we call irq_get_desc_buslock() which may return NULL, but the code dereferences the result unconditionally. irq_set_irq_wake() has lots of callers - I checked a few and I couldn't find anything that guarantees that they won't call it with some input that will cause irq_get_desc_buslock() to return NULL, so I think it's a good thing to test and -EINVAL was the most sane error code in this situation that I could think of. Not all callers test the return value of irq_set_irq_wake(), but those that do take != 0 to mean error as far as I can see, so they should be fine. I guess those that don't test actually should, but that's a different issue. Signed-off-by: Jesper Juhl Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1106092300360.17868@swampdragon.chaosbits.net Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d64bafb1afd0..0a7840aeb0fb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -491,6 +491,9 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); int ret = 0; + if (!desc) + return -EINVAL; + /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ -- cgit v1.2.3