From c8446b75be6f85b3d40066922876cb7adc948afb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 30 Oct 2012 13:33:54 +0100 Subject: irq_work: Fix racy IRQ_WORK_BUSY flag setting The IRQ_WORK_BUSY flag is set right before we execute the work. Once this flag value is set, the work enters a claimable state again. So if we have specific data to compute in our work, we ensure it's either handled by another CPU or locally by enqueuing the work again. This state machine is guanranteed by atomic operations on the flags. So when we set IRQ_WORK_BUSY without using an xchg-like operation, we break this guarantee as in the following summarized scenario: CPU 1 CPU 2 ----- ----- (flags = 0) old_flags = flags; (flags = 0) cmpxchg(flags, old_flags, old_flags | IRQ_WORK_FLAGS) (flags = 3) [...] flags = IRQ_WORK_BUSY (flags = 2) func() (sees flags = 3) cmpxchg(flags, old_flags, old_flags | IRQ_WORK_FLAGS) (give up) cmpxchg(flags, 2, 0); (flags = 0) CPU 1 claims a work and executes it, so it sets IRQ_WORK_BUSY and the work is again in a claimable state. Now CPU 2 has new data to process and try to claim that work but it may see a stale value of the flags and think the work is still pending somewhere that will handle our data. This is because CPU 1 doesn't set IRQ_WORK_BUSY atomically. As a result, the data expected to be handle by CPU 2 won't get handled. To fix this, use xchg() to set IRQ_WORK_BUSY, this way we ensure the CPU 2 will see the correct value with cmpxchg() using the expected ordering. Changelog-heavily-inspired-by: Steven Rostedt Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andrew Morton Cc: Paul Gortmaker Cc: Anish Kumar --- kernel/irq_work.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 1588e3b2871b..57be1a6cd8da 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -119,8 +119,11 @@ void irq_work_run(void) /* * Clear the PENDING bit, after this point the @work * can be re-used. + * Make it immediately visible so that other CPUs trying + * to claim that work don't rely on us to handle their data + * while we are in the middle of the func. */ - work->flags = IRQ_WORK_BUSY; + xchg(&work->flags, IRQ_WORK_BUSY); work->func(work); /* * Clear the BUSY bit and return to the free state if -- cgit v1.2.3 From e0bbe2d80c415bd4063d894ec2ccb336788af814 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 27 Oct 2012 15:21:36 +0200 Subject: irq_work: Fix racy check on work pending flag Work claiming wants to be SMP-safe. And by the time we try to claim a work, if it is already executing concurrently on another CPU, we want to succeed the claiming and queue the work again because the other CPU may have missed the data we wanted to handle in our work if it's about to complete there. This scenario is summarized below: CPU 1 CPU 2 ----- ----- (flags = 0) cmpxchg(flags, 0, IRQ_WORK_FLAGS) (flags = 3) [...] xchg(flags, IRQ_WORK_BUSY) (flags = 2) func() if (flags & IRQ_WORK_PENDING) (not true) cmpxchg(flags, flags, IRQ_WORK_FLAGS) (flags = 3) [...] cmpxchg(flags, IRQ_WORK_BUSY, 0); (fail, pending on CPU 2) This state machine is synchronized using [cmp]xchg() on the flags. As such, the early IRQ_WORK_PENDING check in CPU 2 above is racy. By the time we check it, we may be dealing with a stale value because we aren't using an atomic accessor. As a result, CPU 2 may "see" that the work is still pending on another CPU while it may be actually completing the work function exection already, leaving our data unprocessed. To fix this, we start by speculating about the value we wish to be in the work->flags but we only make any conclusion after the value returned by the cmpxchg() call that either claims the work or let the current owner handle the pending work for us. Changelog-heavily-inspired-by: Steven Rostedt Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andrew Morton Cc: Paul Gortmaker Cc: Anish Kumar --- kernel/irq_work.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 57be1a6cd8da..64eddd59ed83 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -34,15 +34,21 @@ static DEFINE_PER_CPU(struct llist_head, irq_work_list); */ static bool irq_work_claim(struct irq_work *work) { - unsigned long flags, nflags; + unsigned long flags, oflags, nflags; + /* + * Start with our best wish as a premise but only trust any + * flag value after cmpxchg() result. + */ + flags = work->flags & ~IRQ_WORK_PENDING; for (;;) { - flags = work->flags; - if (flags & IRQ_WORK_PENDING) - return false; nflags = flags | IRQ_WORK_FLAGS; - if (cmpxchg(&work->flags, flags, nflags) == flags) + oflags = cmpxchg(&work->flags, flags, nflags); + if (oflags == flags) break; + if (oflags & IRQ_WORK_PENDING) + return false; + flags = oflags; cpu_relax(); } -- cgit v1.2.3 From 33a5f6261a61af28f7b4c86f9f958da0f206c914 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Oct 2012 17:52:56 +0200 Subject: nohz: Add API to check tick state We need some quick way to check if the CPU has stopped its tick. This will be useful to implement the printk tick using the irq work subsystem. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andrew Morton Cc: Paul Gortmaker --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..9e945aa090ac 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -28,7 +28,7 @@ /* * Per cpu nohz control structure */ -static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); +DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); /* * The time, when the last jiffy update happened. Protected by xtime_lock. -- cgit v1.2.3 From 00b42959106a9ca1c2899e591ae4e9a83ad6af05 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 7 Nov 2012 21:03:07 +0100 Subject: irq_work: Don't stop the tick with pending works Don't stop the tick if we have pending irq works on the queue, otherwise if the arch can't raise self-IPIs, we may not find an opportunity to execute the pending works for a while. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andrew Morton Cc: Paul Gortmaker --- kernel/irq_work.c | 11 +++++++++++ kernel/time/tick-sched.c | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 64eddd59ed83..b3c113a14727 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -99,6 +99,17 @@ bool irq_work_queue(struct irq_work *work) } EXPORT_SYMBOL_GPL(irq_work_queue); +bool irq_work_needs_cpu(void) +{ + struct llist_head *this_list; + + this_list = &__get_cpu_var(irq_work_list); + if (llist_empty(this_list)) + return false; + + return true; +} + /* * Run the irq_work entries on this cpu. Requires to be ran from hardirq * context with local IRQs disabled. diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9e945aa090ac..f249e8c3e58e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -289,7 +290,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu)) { + arch_needs_cpu(cpu) || irq_work_needs_cpu()) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { -- cgit v1.2.3 From c0e980a4bd7fc5c9b748f2f0209d2a48c0fdf0ab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Nov 2012 11:34:21 -0500 Subject: irq_work: Flush work on CPU_DYING In order not to offline a CPU with pending irq works, flush the queue from CPU_DYING. The notifier is called by stop_machine on the CPU that is going down. The code will not be called from irq context (so things like get_irq_regs() wont work) but I'm not sure what the requirements are for irq_work in that regard (Peter?). But irqs are disabled and the CPU is about to go offline. Might as well flush the work. Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andrew Morton Cc: Paul Gortmaker Signed-off-by: Frederic Weisbecker --- kernel/irq_work.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index b3c113a14727..4ed17490f629 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include /* @@ -110,11 +112,7 @@ bool irq_work_needs_cpu(void) return true; } -/* - * Run the irq_work entries on this cpu. Requires to be ran from hardirq - * context with local IRQs disabled. - */ -void irq_work_run(void) +static void __irq_work_run(void) { struct irq_work *work; struct llist_head *this_list; @@ -124,7 +122,6 @@ void irq_work_run(void) if (llist_empty(this_list)) return; - BUG_ON(!in_irq()); BUG_ON(!irqs_disabled()); llnode = llist_del_all(this_list); @@ -149,6 +146,16 @@ void irq_work_run(void) (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); } } + +/* + * Run the irq_work entries on this cpu. Requires to be ran from hardirq + * context with local IRQs disabled. + */ +void irq_work_run(void) +{ + BUG_ON(!in_irq()); + __irq_work_run(); +} EXPORT_SYMBOL_GPL(irq_work_run); /* @@ -163,3 +170,35 @@ void irq_work_sync(struct irq_work *work) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); + +#ifdef CONFIG_HOTPLUG_CPU +static int irq_work_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_DYING: + /* Called from stop_machine */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) + break; + __irq_work_run(); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block cpu_notify; + +static __init int irq_work_init_cpu_notifier(void) +{ + cpu_notify.notifier_call = irq_work_cpu_notify; + cpu_notify.priority = 0; + register_cpu_notifier(&cpu_notify); + return 0; +} +device_initcall(irq_work_init_cpu_notifier); + +#endif /* CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From 8aa2accee41f7045dc904fa41d4475b2f6ffae3e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Nov 2012 12:52:44 -0500 Subject: irq_work: Warn if there's still work on cpu_down If we are in nohz and there's still irq_work to be done when the idle task is about to go offline, give a nasty warning. Everything should have been flushed from the CPU_DYING notifier already. Further attempts to enqueue an irq_work are buggy because irqs are disabled by __cpu_disable(). The best we can do is to report the issue to the user. Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andrew Morton Cc: Paul Gortmaker Signed-off-by: Frederic Weisbecker --- kernel/irq_work.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 4ed17490f629..480f74715ba9 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -109,6 +109,9 @@ bool irq_work_needs_cpu(void) if (llist_empty(this_list)) return false; + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + return true; } -- cgit v1.2.3 From bc6679aef673f9dcb8f718528fc3df49ff661af9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 19 Oct 2012 16:43:41 -0400 Subject: irq_work: Make self-IPIs optable On irq work initialization, let the user choose to define it as "lazy" or not. "Lazy" means that we don't want to send an IPI (provided the arch can anyway) when we enqueue this work but we rather prefer to wait for the next timer tick to execute our work if possible. This is going to be a benefit for non-urgent enqueuers (like printk in the future) that may prefer not to raise an IPI storm in case of frequent enqueuing on short periods of time. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andrew Morton Cc: Paul Gortmaker --- kernel/irq_work.c | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 480f74715ba9..7f3a59bc8e3d 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -12,24 +12,15 @@ #include #include #include +#include +#include #include #include #include -/* - * An entry can be in one of four states: - * - * free NULL, 0 -> {claimed} : free to be used - * claimed NULL, 3 -> {pending} : claimed to be enqueued - * pending next, 3 -> {busy} : queued, pending callback - * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed - */ - -#define IRQ_WORK_PENDING 1UL -#define IRQ_WORK_BUSY 2UL -#define IRQ_WORK_FLAGS 3UL static DEFINE_PER_CPU(struct llist_head, irq_work_list); +static DEFINE_PER_CPU(int, irq_work_raised); /* * Claim the entry so that no one else will poke at it. @@ -69,14 +60,19 @@ void __weak arch_irq_work_raise(void) */ static void __irq_work_queue(struct irq_work *work) { - bool empty; - preempt_disable(); - empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); - /* The list was empty, raise self-interrupt to start processing. */ - if (empty) - arch_irq_work_raise(); + llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); + + /* + * If the work is not "lazy" or the tick is stopped, raise the irq + * work interrupt (if supported by the arch), otherwise, just wait + * for the next tick. + */ + if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { + if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) + arch_irq_work_raise(); + } preempt_enable(); } @@ -117,10 +113,19 @@ bool irq_work_needs_cpu(void) static void __irq_work_run(void) { + unsigned long flags; struct irq_work *work; struct llist_head *this_list; struct llist_node *llnode; + + /* + * Reset the "raised" state right before we check the list because + * an NMI may enqueue after we find the list empty from the runner. + */ + __this_cpu_write(irq_work_raised, 0); + barrier(); + this_list = &__get_cpu_var(irq_work_list); if (llist_empty(this_list)) return; @@ -140,13 +145,15 @@ static void __irq_work_run(void) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - xchg(&work->flags, IRQ_WORK_BUSY); + flags = work->flags & ~IRQ_WORK_PENDING; + xchg(&work->flags, flags); + work->func(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. */ - (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); + (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } -- cgit v1.2.3 From 74876a98a87a115254b3a66a14b27320b7f0acaa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Oct 2012 18:00:23 +0200 Subject: printk: Wake up klogd using irq_work klogd is woken up asynchronously from the tick in order to do it safely. However if printk is called when the tick is stopped, the reader won't be woken up until the next interrupt, which might not fire for a while. As a result, the user may miss some message. To fix this, lets implement the printk tick using a lazy irq work. This subsystem takes care of the timer tick state and can fix up accordingly. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andrew Morton Cc: Paul Gortmaker --- kernel/printk.c | 36 ++++++++++++++++++++---------------- kernel/time/tick-sched.c | 2 +- kernel/timer.c | 1 - 3 files changed, 21 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2d607f4d1797..c9104feba5ec 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -42,6 +42,7 @@ #include #include #include +#include #include @@ -1955,30 +1956,32 @@ int is_console_locked(void) static DEFINE_PER_CPU(int, printk_pending); static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); -void printk_tick(void) +static void wake_up_klogd_work_func(struct irq_work *irq_work) { - if (__this_cpu_read(printk_pending)) { - int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); + int pending = __this_cpu_xchg(printk_pending, 0); + + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); } -} -int printk_needs_cpu(int cpu) -{ - if (cpu_is_offline(cpu)) - printk_tick(); - return __this_cpu_read(printk_pending); + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); } +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { + .func = wake_up_klogd_work_func, + .flags = IRQ_WORK_LAZY, +}; + void wake_up_klogd(void) { - if (waitqueue_active(&log_wait)) + preempt_disable(); + if (waitqueue_active(&log_wait)) { this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + } + preempt_enable(); } static void console_cont_flush(char *text, size_t size) @@ -2458,6 +2461,7 @@ int printk_sched(const char *fmt, ...) va_end(args); __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); local_irq_restore(flags); return r; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f249e8c3e58e..822d7572bf2d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -289,7 +289,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || arch_needs_cpu(cpu) || irq_work_needs_cpu()) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; diff --git a/kernel/timer.c b/kernel/timer.c index 367d00858482..ff3b5165737b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1351,7 +1351,6 @@ void update_process_times(int user_tick) account_process_tick(p, user_tick); run_local_timers(); rcu_check_callbacks(cpu, user_tick); - printk_tick(); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_run(); -- cgit v1.2.3 From e716efde75267eab919cdb2bef5b2cb77f305326 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 Nov 2012 10:08:44 +0100 Subject: genirq: Avoid deadlock in spurious handling commit 52553ddf(genirq: fix regression in irqfixup, irqpoll) introduced a potential deadlock by calling the action handler with the irq descriptor lock held. Remove the call and let the handling code run even for an interrupt where only a single action is registered. That matches the goal of the above commit and avoids the deadlock. Document the confusing action = desc->action reload in the handling loop while at it. Reported-and-tested-by: "Wang, Warner" Tested-by: Edward Donovan Cc: "Wang, Song-Bo (Stoney)" Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 611cd6003c45..7b5f012bde9d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) /* * All handlers must agree on IRQF_SHARED, so we test just the - * first. Check for action->next as well. + * first. */ action = desc->action; if (!action || !(action->flags & IRQF_SHARED) || - (action->flags & __IRQF_TIMER) || - (action->handler(irq, action->dev_id) == IRQ_HANDLED) || - !action->next) + (action->flags & __IRQF_TIMER)) goto out; /* Already running on another processor */ @@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) do { if (handle_irq_event(desc) == IRQ_HANDLED) ret = IRQ_HANDLED; + /* Make sure that there is still a valid action */ action = desc->action; } while ((desc->istate & IRQS_PENDING) && action); desc->istate &= ~IRQS_POLL_INPROGRESS; -- cgit v1.2.3 From 42f8570f437b65aaf3ef176a38ad7d7fc5847d8b Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 17 Dec 2012 10:01:23 -0500 Subject: workqueue: use new hashtable implementation Switch workqueues to use the new hashtable implementation. This reduces the amount of generic unrelated code in the workqueues. This patch depends on d9b482c ("hashtable: introduce a small and naive hashtable") which was merged in v3.6. Acked-by: Tejun Heo Signed-off-by: Sasha Levin Signed-off-by: Tejun Heo --- kernel/workqueue.c | 86 ++++++++++-------------------------------------------- 1 file changed, 15 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fbc6576a83c3..acd417be8199 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "workqueue_sched.h" @@ -82,8 +83,6 @@ enum { NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ - BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, - BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ @@ -180,7 +179,7 @@ struct global_cwq { unsigned int flags; /* L: GCWQ_* flags */ /* workers are chained either in busy_hash or pool idle_list */ - struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; + DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ struct worker_pool pools[NR_WORKER_POOLS]; @@ -285,8 +284,7 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) #define for_each_busy_worker(worker, i, pos, gcwq) \ - for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ - hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) + hash_for_each(gcwq->busy_hash, i, pos, worker, hentry) static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, unsigned int sw) @@ -858,63 +856,6 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) atomic_inc(get_pool_nr_running(pool)); } -/** - * busy_worker_head - return the busy hash head for a work - * @gcwq: gcwq of interest - * @work: work to be hashed - * - * Return hash head of @gcwq for @work. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to the hash head. - */ -static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, - struct work_struct *work) -{ - const int base_shift = ilog2(sizeof(struct work_struct)); - unsigned long v = (unsigned long)work; - - /* simple shift and fold hash, do we need something better? */ - v >>= base_shift; - v += v >> BUSY_WORKER_HASH_ORDER; - v &= BUSY_WORKER_HASH_MASK; - - return &gcwq->busy_hash[v]; -} - -/** - * __find_worker_executing_work - find worker which is executing a work - * @gcwq: gcwq of interest - * @bwh: hash head as returned by busy_worker_head() - * @work: work to find worker for - * - * Find a worker which is executing @work on @gcwq. @bwh should be - * the hash head obtained by calling busy_worker_head() with the same - * work. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to worker which is executing @work if found, NULL - * otherwise. - */ -static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, - struct hlist_head *bwh, - struct work_struct *work) -{ - struct worker *worker; - struct hlist_node *tmp; - - hlist_for_each_entry(worker, tmp, bwh, hentry) - if (worker->current_work == work) - return worker; - return NULL; -} - /** * find_worker_executing_work - find worker which is executing a work * @gcwq: gcwq of interest @@ -934,8 +875,14 @@ static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, static struct worker *find_worker_executing_work(struct global_cwq *gcwq, struct work_struct *work) { - return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), - work); + struct worker *worker; + struct hlist_node *tmp; + + hash_for_each_possible(gcwq->busy_hash, worker, tmp, hentry, (unsigned long)work) + if (worker->current_work == work) + return worker; + + return NULL; } /** @@ -2166,7 +2113,6 @@ __acquires(&gcwq->lock) struct cpu_workqueue_struct *cwq = get_work_cwq(work); struct worker_pool *pool = worker->pool; struct global_cwq *gcwq = pool->gcwq; - struct hlist_head *bwh = busy_worker_head(gcwq, work); bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; work_func_t f = work->func; int work_color; @@ -2198,7 +2144,7 @@ __acquires(&gcwq->lock) * already processing the work. If so, defer the work to the * currently executing one. */ - collision = __find_worker_executing_work(gcwq, bwh, work); + collision = find_worker_executing_work(gcwq, work); if (unlikely(collision)) { move_linked_works(work, &collision->scheduled, NULL); return; @@ -2206,7 +2152,7 @@ __acquires(&gcwq->lock) /* claim and dequeue */ debug_work_deactivate(work); - hlist_add_head(&worker->hentry, bwh); + hash_add(gcwq->busy_hash, &worker->hentry, (unsigned long)worker); worker->current_work = work; worker->current_cwq = cwq; work_color = get_work_color(work); @@ -2264,7 +2210,7 @@ __acquires(&gcwq->lock) worker_clr_flags(worker, WORKER_CPU_INTENSIVE); /* we're done with it, release */ - hlist_del_init(&worker->hentry); + hash_del(&worker->hentry); worker->current_work = NULL; worker->current_cwq = NULL; cwq_dec_nr_in_flight(cwq, work_color); @@ -3831,7 +3777,6 @@ out_unlock: static int __init init_workqueues(void) { unsigned int cpu; - int i; /* make sure we have enough bits for OFFQ CPU number */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < @@ -3849,8 +3794,7 @@ static int __init init_workqueues(void) gcwq->cpu = cpu; gcwq->flags |= GCWQ_DISASSOCIATED; - for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) - INIT_HLIST_HEAD(&gcwq->busy_hash[i]); + hash_init(gcwq->busy_hash); for_each_worker_pool(pool, gcwq) { pool->gcwq = gcwq; -- cgit v1.2.3 From a2c1c57be8d9fd5b716113c8991d3d702eeacf77 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Dec 2012 10:35:02 -0800 Subject: workqueue: consider work function when searching for busy work items To avoid executing the same work item concurrenlty, workqueue hashes currently busy workers according to their current work items and looks up the the table when it wants to execute a new work item. If there already is a worker which is executing the new work item, the new item is queued to the found worker so that it gets executed only after the current execution finishes. Unfortunately, a work item may be freed while being executed and thus recycled for different purposes. If it gets recycled for a different work item and queued while the previous execution is still in progress, workqueue may make the new work item wait for the old one although the two aren't really related in any way. In extreme cases, this false dependency may lead to deadlock although it's extremely unlikely given that there aren't too many self-freeing work item users and they usually don't wait for other work items. To alleviate the problem, record the current work function in each busy worker and match it together with the work item address in find_worker_executing_work(). While this isn't complete, it ensures that unrelated work items don't interact with each other and in the very unlikely case where a twisted wq user triggers it, it's always onto itself making the culprit easy to spot. Signed-off-by: Tejun Heo Reported-by: Andrey Isakov Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=51701 Cc: stable@vger.kernel.org --- kernel/workqueue.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index acd417be8199..d24a41101838 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -137,6 +137,7 @@ struct worker { }; struct work_struct *current_work; /* L: work being processed */ + work_func_t current_func; /* L: current_work's fn */ struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ @@ -861,9 +862,27 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * @gcwq: gcwq of interest * @work: work to find worker for * - * Find a worker which is executing @work on @gcwq. This function is - * identical to __find_worker_executing_work() except that this - * function calculates @bwh itself. + * Find a worker which is executing @work on @gcwq by searching + * @gcwq->busy_hash which is keyed by the address of @work. For a worker + * to match, its current execution should match the address of @work and + * its work function. This is to avoid unwanted dependency between + * unrelated work executions through a work item being recycled while still + * being executed. + * + * This is a bit tricky. A work item may be freed once its execution + * starts and nothing prevents the freed area from being recycled for + * another work item. If the same work item address ends up being reused + * before the original execution finishes, workqueue will identify the + * recycled work item as currently executing and make it wait until the + * current execution finishes, introducing an unwanted dependency. + * + * This function checks the work item address, work function and workqueue + * to avoid false positives. Note that this isn't complete as one may + * construct a work function which can introduce dependency onto itself + * through a recycled work item. Well, if somebody wants to shoot oneself + * in the foot that badly, there's only so much we can do, and if such + * deadlock actually occurs, it should be easy to locate the culprit work + * function. * * CONTEXT: * spin_lock_irq(gcwq->lock). @@ -878,8 +897,10 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, struct worker *worker; struct hlist_node *tmp; - hash_for_each_possible(gcwq->busy_hash, worker, tmp, hentry, (unsigned long)work) - if (worker->current_work == work) + hash_for_each_possible(gcwq->busy_hash, worker, tmp, hentry, + (unsigned long)work) + if (worker->current_work == work && + worker->current_func == work->func) return worker; return NULL; @@ -2114,7 +2135,6 @@ __acquires(&gcwq->lock) struct worker_pool *pool = worker->pool; struct global_cwq *gcwq = pool->gcwq; bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; - work_func_t f = work->func; int work_color; struct worker *collision; #ifdef CONFIG_LOCKDEP @@ -2154,6 +2174,7 @@ __acquires(&gcwq->lock) debug_work_deactivate(work); hash_add(gcwq->busy_hash, &worker->hentry, (unsigned long)worker); worker->current_work = work; + worker->current_func = work->func; worker->current_cwq = cwq; work_color = get_work_color(work); @@ -2186,7 +2207,7 @@ __acquires(&gcwq->lock) lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); - f(work); + worker->current_func(work); /* * While we must be careful to not use "work" after this, the trace * point will only record its address. @@ -2198,7 +2219,8 @@ __acquires(&gcwq->lock) if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" " last function: %pf\n", - current->comm, preempt_count(), task_pid_nr(current), f); + current->comm, preempt_count(), task_pid_nr(current), + worker->current_func); debug_show_held_locks(current); dump_stack(); } @@ -2212,6 +2234,7 @@ __acquires(&gcwq->lock) /* we're done with it, release */ hash_del(&worker->hentry); worker->current_work = NULL; + worker->current_func = NULL; worker->current_cwq = NULL; cwq_dec_nr_in_flight(cwq, work_color); } -- cgit v1.2.3 From 023f27d3d6fcc9048754d879fe5e7d63402a5b16 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Dec 2012 11:24:06 -0800 Subject: workqueue: fix find_worker_executing_work() brekage from hashtable conversion 42f8570f43 ("workqueue: use new hashtable implementation") incorrectly made busy workers hashed by the pointer value of worker instead of work. This broke find_worker_executing_work() which in turn broke a lot of fundamental operations of workqueue - non-reentrancy and flushing among others. The flush malfunction triggered warning in disk event code in Fengguang's automated test. write_dev_root_ (3265) used greatest stack depth: 2704 bytes left ------------[ cut here ]------------ WARNING: at /c/kernel-tests/src/stable/block/genhd.c:1574 disk_clear_events+0x\ cf/0x108() Hardware name: Bochs Modules linked in: Pid: 3328, comm: ata_id Not tainted 3.7.0-01930-gbff6343 #1167 Call Trace: [] warn_slowpath_common+0x83/0x9c [] warn_slowpath_null+0x1a/0x1c [] disk_clear_events+0xcf/0x108 [] check_disk_change+0x27/0x59 [] cdrom_open+0x49/0x68b [] idecd_open+0x88/0xb7 [] __blkdev_get+0x102/0x3ec [] blkdev_get+0x18f/0x30f [] blkdev_open+0x75/0x80 [] do_dentry_open+0x1ea/0x295 [] finish_open+0x35/0x41 [] do_last+0x878/0xa25 [] path_openat+0xc6/0x333 [] do_filp_open+0x38/0x86 [] do_sys_open+0x6c/0xf9 [] sys_open+0x21/0x23 [] system_call_fastpath+0x16/0x1b Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Cc: Sasha Levin --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d24a41101838..7967f3476393 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2172,7 +2172,7 @@ __acquires(&gcwq->lock) /* claim and dequeue */ debug_work_deactivate(work); - hash_add(gcwq->busy_hash, &worker->hentry, (unsigned long)worker); + hash_add(gcwq->busy_hash, &worker->hentry, (unsigned long)work); worker->current_work = work; worker->current_func = work->func; worker->current_cwq = cwq; -- cgit v1.2.3 From 7b1f62076bba10786d2118006ae68ac120cd6c56 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Wed, 7 Nov 2012 17:58:54 -0700 Subject: time: convert arch_gettimeoffset to a pointer Currently, whenever CONFIG_ARCH_USES_GETTIMEOFFSET is enabled, each arch core provides a single implementation of arch_gettimeoffset(). In many cases, different sub-architectures, different machines, or different timer providers exist, and so the arch ends up implementing arch_gettimeoffset() as a call-through-pointer anyway. Examples are ARM, Cris, M68K, and it's arguable that the remaining architectures, M32R and Blackfin, should be doing this anyway. Modify arch_gettimeoffset so that it itself is a function pointer, which the arch initializes. This will allow later changes to move the initialization of this function into individual machine support or timer drivers. This is particularly useful for code in drivers/clocksource which should rely on an arch-independant mechanism to register their implementation of arch_gettimeoffset(). This patch also converts the Cris architecture to set arch_gettimeoffset directly to the final implementation in time_init(), because Cris already had separate time_init() functions per sub-architecture. M68K and ARM are converted to set arch_gettimeoffset to the final implementation in later patches, because they already have function pointers in place for this purpose. Cc: Russell King Cc: Mike Frysinger Cc: Mikael Starvik Cc: Hirokazu Takata Cc: Thomas Gleixner Acked-by: Geert Uytterhoeven Acked-by: Jesper Nilsson Acked-by: John Stultz Signed-off-by: Stephen Warren --- kernel/time/timekeeping.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cbc6acb0db3f..8ed934601587 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -135,6 +135,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) } /* Timekeeper helper functions. */ + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +u32 (*arch_gettimeoffset)(void); + +u32 get_arch_timeoffset(void) +{ + if (likely(arch_gettimeoffset)) + return arch_gettimeoffset(); + return 0; +} +#else +static inline u32 get_arch_timeoffset(void) { return 0; } +#endif + static inline s64 timekeeping_get_ns(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; @@ -151,8 +165,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) nsec = cycle_delta * tk->mult + tk->xtime_nsec; nsec >>= tk->shift; - /* If arch requires, add in gettimeoffset() */ - return nsec + arch_gettimeoffset(); + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + get_arch_timeoffset(); } static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) @@ -171,8 +185,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) /* convert delta to nanoseconds. */ nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - /* If arch requires, add in gettimeoffset() */ - return nsec + arch_gettimeoffset(); + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + get_arch_timeoffset(); } static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); @@ -254,8 +268,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->xtime_nsec += cycle_delta * tk->mult; - /* If arch requires, add in gettimeoffset() */ - tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; + /* If arch requires, add in get_arch_timeoffset() */ + tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; tk_normalize_xtime(tk); -- cgit v1.2.3 From 923c7538236564c46ee80c253a416705321f13e3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Dec 2012 11:39:12 +0800 Subject: userns: Allow unprivileged reboot In a container with its own pid namespace and user namespace, rebooting the system won't reboot the host, but terminate all the processes in it and thus have the container shutdown, so it's safe. Signed-off-by: Li Zefan Signed-off-by: Eric W. Biederman --- kernel/sys.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 265b37690421..24d1ef56cd95 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -433,11 +433,12 @@ static DEFINE_MUTEX(reboot_mutex); SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { + struct pid_namespace *pid_ns = task_active_pid_ns(current); char buffer[256]; int ret = 0; /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) + if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) return -EPERM; /* For safety, we require "magic" arguments. */ @@ -453,7 +454,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, * pid_namespace, the command is handled by reboot_pid_ns() which will * call do_exit(). */ - ret = reboot_pid_ns(task_active_pid_ns(current), cmd); + ret = reboot_pid_ns(pid_ns, cmd); if (ret) return ret; -- cgit v1.2.3 From 12a9d2fef1d35770d3cdc2cd1faabb83c45bc0fa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:49:33 -0800 Subject: cgroup: implement cgroup_rightmost_descendant() Implement cgroup_rightmost_descendant() which returns the right most descendant of the specified cgroup. This can be used to skip the cgroup's subtree while iterating with cgroup_for_each_descendant_pre(). Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Li Zefan --- kernel/cgroup.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4855892798fd..6643f7053454 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3017,6 +3017,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, } EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +/** + * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup + * @pos: cgroup of interest + * + * Return the rightmost descendant of @pos. If there's no descendant, + * @pos is returned. This can be used during pre-order traversal to skip + * subtree of @pos. + */ +struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +{ + struct cgroup *last, *tmp; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + do { + last = pos; + /* ->prev isn't RCU safe, walk ->next till the end */ + pos = NULL; + list_for_each_entry_rcu(tmp, &last->children, sibling) + pos = tmp; + } while (pos); + + return last; +} +EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); + static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) { struct cgroup *last; -- cgit v1.2.3 From 01c889cf4f2e16c9a21376f8583f4a2872d7d1da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: remove unused cpuset_unlock() Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7bb63eea6eb8..854b8bfbc15c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2411,17 +2411,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) return 0; } -/** - * cpuset_unlock - release lock on cpuset changes - * - * Undo the lock taken in a previous cpuset_lock() call. - */ - -void cpuset_unlock(void) -{ - mutex_unlock(&callback_mutex); -} - /** * cpuset_mem_spread_node() - On which node to begin search for a file page * cpuset_slab_spread_node() - On which node to begin search for a slab page -- cgit v1.2.3 From 0772324ae669f787b42fdb9fc5ac2c3d1c0df509 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: remove fast exit path from remove_tasks_in_empty_cpuset() The function isn't that hot, the overhead of missing the fast exit is low, the test itself depends heavily on cgroup internals, and it's gonna be a hindrance when trying to decouple cpuset locking from cgroup core. Remove the fast exit path. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 854b8bfbc15c..5372b6f5e5b3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1967,14 +1967,6 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) { struct cpuset *parent; - /* - * The cgroup's css_sets list is in use if there are tasks - * in the cpuset; the list is empty if there are none; - * the cs->css.refcnt seems always 0. - */ - if (list_empty(&cs->css.cgroup->css_sets)) - return; - /* * Find its next-highest non-empty parent, (top cpuset * has online cpus, so can't be empty). -- cgit v1.2.3 From c8f699bb56aeae951e02fe2a46c9ada022535770 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: introduce ->css_on/offline() Add cpuset_css_on/offline() and rearrange css init/exit such that, * Allocation and clearing to the default values happen in css_alloc(). Allocation now uses kzalloc(). * Config inheritance and registration happen in css_online(). * css_offline() undoes what css_online() did. * css_free() frees. This doesn't introduce any visible behavior changes. This will help cleaning up locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 66 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5372b6f5e5b3..1d7a611ff771 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1790,15 +1790,12 @@ static struct cftype files[] = { static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) { - struct cgroup *parent_cg = cont->parent; - struct cgroup *tmp_cg; - struct cpuset *parent, *cs; + struct cpuset *cs; - if (!parent_cg) + if (!cont->parent) return &top_cpuset.css; - parent = cgroup_cs(parent_cg); - cs = kmalloc(sizeof(*cs), GFP_KERNEL); + cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { @@ -1806,22 +1803,34 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) return ERR_PTR(-ENOMEM); } - cs->flags = 0; - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; + cs->parent = cgroup_cs(cont->parent); + + return &cs->css; +} + +static int cpuset_css_online(struct cgroup *cgrp) +{ + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *parent = cs->parent; + struct cgroup *tmp_cg; + + if (!parent) + return 0; + + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); - cs->parent = parent; number_of_cpusets++; - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) - goto skip_clone; + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) + return 0; /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is @@ -1836,19 +1845,34 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) * changed to grant parent->cpus_allowed-sibling_cpus_exclusive * (and likewise for mems) to the new cgroup. */ - list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { + list_for_each_entry(tmp_cg, &cgrp->parent->children, sibling) { struct cpuset *tmp_cs = cgroup_cs(tmp_cg); if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) - goto skip_clone; + return 0; } mutex_lock(&callback_mutex); cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); mutex_unlock(&callback_mutex); -skip_clone: - return &cs->css; + + return 0; +} + +static void cpuset_css_offline(struct cgroup *cgrp) +{ + struct cpuset *cs = cgroup_cs(cgrp); + + /* css_offline is called w/o cgroup_mutex, grab it */ + cgroup_lock(); + + if (is_sched_load_balance(cs)) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + + number_of_cpusets--; + + cgroup_unlock(); } /* @@ -1861,10 +1885,6 @@ static void cpuset_css_free(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); - if (is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - number_of_cpusets--; free_cpumask_var(cs->cpus_allowed); kfree(cs); } @@ -1872,6 +1892,8 @@ static void cpuset_css_free(struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", .css_alloc = cpuset_css_alloc, + .css_online = cpuset_css_online, + .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .attach = cpuset_attach, -- cgit v1.2.3 From efeb77b2f13deb0503e65ad2b243495b6de75173 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: introduce CS_ONLINE Add CS_ONLINE which is set from css_online() and cleared from css_offline(). This will enable using generic cgroup iterator while allowing decoupling cpuset from cgroup internal locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1d7a611ff771..e857887bd246 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -138,6 +138,7 @@ static inline bool task_has_mempolicy(struct task_struct *task) /* bits in struct cpuset flags field */ typedef enum { + CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -154,6 +155,11 @@ enum hotplug_event { }; /* convenient tests for these bits */ +static inline bool is_cpuset_online(const struct cpuset *cs) +{ + return test_bit(CS_ONLINE, &cs->flags); +} + static inline int is_cpu_exclusive(const struct cpuset *cs) { return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); @@ -190,7 +196,8 @@ static inline int is_spread_slab(const struct cpuset *cs) } static struct cpuset top_cpuset = { - .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), + .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | + (1 << CS_MEM_EXCLUSIVE)), }; /* @@ -1822,6 +1829,7 @@ static int cpuset_css_online(struct cgroup *cgrp) if (!parent) return 0; + set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -1871,6 +1879,7 @@ static void cpuset_css_offline(struct cgroup *cgrp) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); number_of_cpusets--; + clear_bit(CS_ONLINE, &cs->flags); cgroup_unlock(); } -- cgit v1.2.3 From ae8086ce15fdab2b57599d7a3242a114ba4b8597 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: introduce cpuset_for_each_child() Instead of iterating cgroup->children directly, introduce and use cpuset_for_each_child() which wraps cgroup_for_each_child() and performs online check. As it uses the generic iterator, it requires RCU read locking too. As cpuset is currently protected by cgroup_mutex, non-online cpusets aren't visible to all the iterations and this patch currently doesn't make any functional difference. This will be used to de-couple cpuset locking from cgroup core. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 85 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e857887bd246..4b054b9faf3d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -200,6 +200,19 @@ static struct cpuset top_cpuset = { (1 << CS_MEM_EXCLUSIVE)), }; +/** + * cpuset_for_each_child - traverse online children of a cpuset + * @child_cs: loop cursor pointing to the current child + * @pos_cgrp: used for iteration + * @parent_cs: target cpuset to walk children of + * + * Walk @child_cs through the online children of @parent_cs. Must be used + * with RCU read locked. + */ +#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ + cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ + if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) + /* * There are two global mutexes guarding cpuset structures. The first * is the main control groups cgroup_mutex, accessed via @@ -419,48 +432,55 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) { struct cgroup *cont; struct cpuset *c, *par; + int ret; + + rcu_read_lock(); /* Each of our child cpusets must be a subset of us */ - list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { - if (!is_cpuset_subset(cgroup_cs(cont), trial)) - return -EBUSY; - } + ret = -EBUSY; + cpuset_for_each_child(c, cont, cur) + if (!is_cpuset_subset(c, trial)) + goto out; /* Remaining checks don't apply to root cpuset */ + ret = 0; if (cur == &top_cpuset) - return 0; + goto out; par = cur->parent; /* We must be a subset of our parent cpuset */ + ret = -EACCES; if (!is_cpuset_subset(trial, par)) - return -EACCES; + goto out; /* * If either I or some sibling (!= me) is exclusive, we can't * overlap */ - list_for_each_entry(cont, &par->css.cgroup->children, sibling) { - c = cgroup_cs(cont); + ret = -EINVAL; + cpuset_for_each_child(c, cont, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - return -EINVAL; + goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) - return -EINVAL; + goto out; } /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ - if (cgroup_task_count(cur->css.cgroup)) { - if (cpumask_empty(trial->cpus_allowed) || - nodes_empty(trial->mems_allowed)) { - return -ENOSPC; - } - } + ret = -ENOSPC; + if (cgroup_task_count(cur->css.cgroup) && + (cpumask_empty(trial->cpus_allowed) || + nodes_empty(trial->mems_allowed))) + goto out; - return 0; + ret = 0; +out: + rcu_read_unlock(); + return ret; } #ifdef CONFIG_SMP @@ -501,10 +521,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) if (is_sched_load_balance(cp)) update_domain_attr(dattr, cp); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); + rcu_read_lock(); + cpuset_for_each_child(child, cont, cp) list_add_tail(&child->stack_list, &q); - } + rcu_read_unlock(); } } @@ -623,10 +643,10 @@ static int generate_sched_domains(cpumask_var_t **domains, continue; } - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); + rcu_read_lock(); + cpuset_for_each_child(child, cont, cp) list_add_tail(&child->stack_list, &q); - } + rcu_read_unlock(); } for (i = 0; i < csn; i++) @@ -1824,7 +1844,8 @@ static int cpuset_css_online(struct cgroup *cgrp) { struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *parent = cs->parent; - struct cgroup *tmp_cg; + struct cpuset *tmp_cs; + struct cgroup *pos_cg; if (!parent) return 0; @@ -1853,12 +1874,14 @@ static int cpuset_css_online(struct cgroup *cgrp) * changed to grant parent->cpus_allowed-sibling_cpus_exclusive * (and likewise for mems) to the new cgroup. */ - list_for_each_entry(tmp_cg, &cgrp->parent->children, sibling) { - struct cpuset *tmp_cs = cgroup_cs(tmp_cg); - - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) + rcu_read_lock(); + cpuset_for_each_child(tmp_cs, pos_cg, parent) { + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { + rcu_read_unlock(); return 0; + } } + rcu_read_unlock(); mutex_lock(&callback_mutex); cs->mems_allowed = parent->mems_allowed; @@ -2027,10 +2050,10 @@ static struct cpuset *cpuset_next(struct list_head *queue) cp = list_first_entry(queue, struct cpuset, stack_list); list_del(queue->next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); + rcu_read_lock(); + cpuset_for_each_child(child, cont, cp) list_add_tail(&child->stack_list, queue); - } + rcu_read_unlock(); return cp; } -- cgit v1.2.3 From 4e4c9a140fc2ecf5e086922ccd2022bdabe509b6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: cleanup cpuset[_can]_attach() cpuset_can_attach() prepare global variables cpus_attach and cpuset_attach_nodemask_{to|from} which are used by cpuset_attach(). There is no reason to prepare in cpuset_can_attach(). The same information can be accessed from cpuset_attach(). Move the prepartion logic from cpuset_can_attach() to cpuset_attach() and make the global variables static ones inside cpuset_attach(). With this change, there's no reason to keep cpuset_attach_nodemask_{from|to} global. Move them inside cpuset_attach(). Unfortunately, we need to keep cpus_attach global as it can't be allocated from cpuset_attach(). v2: cpus_attach not converted to cpumask_t as per Li Zefan and Rusty Russell. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Rusty Russell --- kernel/cpuset.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4b054b9faf3d..c5edc6b3eb28 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1395,15 +1395,6 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* - * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in can_attach, and they must - * persist until attach. - */ -static cpumask_var_t cpus_attach; -static nodemask_t cpuset_attach_nodemask_from; -static nodemask_t cpuset_attach_nodemask_to; - /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { @@ -1430,19 +1421,21 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) return ret; } - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); - - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - return 0; } +/* + * Protected by cgroup_mutex. cpus_attach is used only by cpuset_attach() + * but we can't allocate it dynamically there. Define it global and + * allocate from cpuset_init(). + */ +static cpumask_var_t cpus_attach; + static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { + /* static bufs protected by cgroup_mutex */ + static nodemask_t cpuset_attach_nodemask_from; + static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); @@ -1450,6 +1443,14 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + /* prepare for attach */ + if (cs == &top_cpuset) + cpumask_copy(cpus_attach, cpu_possible_mask); + else + guarantee_online_cpus(cs, cpus_attach); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + cgroup_taskset_for_each(task, cgrp, tset) { /* * can_attach beforehand should guarantee that this doesn't -- cgit v1.2.3 From deb7aa308ea264b374d1db970174f5728a2faa27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: reorganize CPU / memory hotplug handling Reorganize hotplug path to prepare for async hotplug handling. * Both CPU and memory hotplug handlings are collected into a single function - cpuset_handle_hotplug(). It doesn't take any argument but compares the current setttings of top_cpuset against what's actually available to determine what happened. This function directly updates top_cpuset. If there are CPUs or memory nodes which are taken down, cpuset_propagate_hotplug() in invoked on all !root cpusets. * cpuset_propagate_hotplug() is responsible for updating the specified cpuset so that it doesn't include any resource which isn't available to top_cpuset. If no CPU or memory is left after update, all tasks are moved to the nearest ancestor with both resources. * update_tasks_cpumask() and update_tasks_nodemask() are now always called after cpus or mems masks are updated even if the cpuset doesn't have any task. This is for brevity and not expected to have any measureable effect. * cpu_active_mask and N_HIGH_MEMORY are read exactly once per cpuset_handle_hotplug() invocation, all cpusets share the same view of what resources are available, and cpuset_handle_hotplug() can handle multiple resources going up and down. These properties will allow async operation. The reorganization, while drastic, is equivalent and shouldn't cause any behavior difference. This will enable making hotplug handling async and remove get_online_cpus() -> cgroup_mutex nesting. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 221 ++++++++++++++++++++++++++------------------------------ 1 file changed, 104 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c5edc6b3eb28..3d448e646a4a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -148,12 +148,6 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; -/* the type of hotplug event */ -enum hotplug_event { - CPUSET_CPU_OFFLINE, - CPUSET_MEM_OFFLINE, -}; - /* convenient tests for these bits */ static inline bool is_cpuset_online(const struct cpuset *cs) { @@ -2059,116 +2053,131 @@ static struct cpuset *cpuset_next(struct list_head *queue) return cp; } - -/* - * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory - * online/offline) and update the cpusets accordingly. - * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such - * cpuset must be moved to a parent cpuset. - * - * Called with cgroup_mutex held. We take callback_mutex to modify - * cpus_allowed and mems_allowed. +/** + * cpuset_propagate_hotplug - propagate CPU/memory hotplug to a cpuset + * @cs: cpuset in interest * - * This walk processes the tree from top to bottom, completing one layer - * before dropping down to the next. It always processes a node before - * any of its children. + * Compare @cs's cpu and mem masks against top_cpuset and if some have gone + * offline, update @cs accordingly. If @cs ends up with no CPU or memory, + * all its tasks are moved to the nearest ancestor with both resources. * - * In the case of memory hot-unplug, it will remove nodes from N_MEMORY - * if all present pages from a node are offlined. + * Should be called with cgroup_mutex held. */ -static void -scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) +static void cpuset_propagate_hotplug(struct cpuset *cs) { - LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - static nodemask_t oldmems; /* protected by cgroup_mutex */ - - list_add_tail((struct list_head *)&root->stack_list, &queue); - - switch (event) { - case CPUSET_CPU_OFFLINE: - while ((cp = cpuset_next(&queue)) != NULL) { + static cpumask_t off_cpus; + static nodemask_t off_mems, tmp_mems; - /* Continue past cpusets with all cpus online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) - continue; + WARN_ON_ONCE(!cgroup_lock_is_held()); - /* Remove offline cpus from this cpuset. */ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - mutex_unlock(&callback_mutex); + cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); + nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed)) - remove_tasks_in_empty_cpuset(cp); - else - update_tasks_cpumask(cp, NULL); - } - break; - - case CPUSET_MEM_OFFLINE: - while ((cp = cpuset_next(&queue)) != NULL) { - - /* Continue past cpusets with all mems online */ - if (nodes_subset(cp->mems_allowed, - node_states[N_MEMORY])) - continue; - - oldmems = cp->mems_allowed; - - /* Remove offline mems from this cpuset. */ - mutex_lock(&callback_mutex); - nodes_and(cp->mems_allowed, cp->mems_allowed, - node_states[N_MEMORY]); - mutex_unlock(&callback_mutex); + /* remove offline cpus from @cs */ + if (!cpumask_empty(&off_cpus)) { + mutex_lock(&callback_mutex); + cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); + mutex_unlock(&callback_mutex); + update_tasks_cpumask(cs, NULL); + } - /* Move tasks from the empty cpuset to a parent */ - if (nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else - update_tasks_nodemask(cp, &oldmems, NULL); - } + /* remove offline mems from @cs */ + if (!nodes_empty(off_mems)) { + tmp_mems = cs->mems_allowed; + mutex_lock(&callback_mutex); + nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); + mutex_unlock(&callback_mutex); + update_tasks_nodemask(cs, &tmp_mems, NULL); } + + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + remove_tasks_in_empty_cpuset(cs); } -/* - * The top_cpuset tracks what CPUs and Memory Nodes are online, - * period. This is necessary in order to make cpusets transparent - * (of no affect) on systems that are actively using CPU hotplug - * but making no active use of cpusets. - * - * The only exception to this is suspend/resume, where we don't - * modify cpusets at all. +/** + * cpuset_handle_hotplug - handle CPU/memory hot[un]plug * - * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_active_mask on each CPU hotplug (cpuhp) event. + * This function is called after either CPU or memory configuration has + * changed and updates cpuset accordingly. The top_cpuset is always + * synchronized to cpu_active_mask and N_MEMORY, which is necessary in + * order to make cpusets transparent (of no affect) on systems that are + * actively using CPU hotplug but making no active use of cpusets. * - * Called within get_online_cpus(). Needs to call cgroup_lock() - * before calling generate_sched_domains(). + * Non-root cpusets are only affected by offlining. If any CPUs or memory + * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all + * descendants. * - * @cpu_online: Indicates whether this is a CPU online event (true) or - * a CPU offline event (false). + * Note that CPU offlining during suspend is ignored. We don't modify + * cpusets across suspend/resume cycles at all. */ -void cpuset_update_active_cpus(bool cpu_online) +static void cpuset_handle_hotplug(void) { - struct sched_domain_attr *attr; - cpumask_var_t *doms; - int ndoms; + static cpumask_t new_cpus, tmp_cpus; + static nodemask_t new_mems, tmp_mems; + bool cpus_updated, mems_updated; + bool cpus_offlined, mems_offlined; cgroup_lock(); - mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - mutex_unlock(&callback_mutex); - if (!cpu_online) - scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); + /* fetch the available cpus/mems and find out which changed how */ + cpumask_copy(&new_cpus, cpu_active_mask); + new_mems = node_states[N_MEMORY]; + + cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); + cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, + &new_cpus); + + mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); + nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); + mems_offlined = !nodes_empty(tmp_mems); + + /* synchronize cpus_allowed to cpu_active_mask */ + if (cpus_updated) { + mutex_lock(&callback_mutex); + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + mutex_unlock(&callback_mutex); + /* we don't mess with cpumasks of tasks in top_cpuset */ + } + + /* synchronize mems_allowed to N_MEMORY */ + if (mems_updated) { + tmp_mems = top_cpuset.mems_allowed; + mutex_lock(&callback_mutex); + top_cpuset.mems_allowed = new_mems; + mutex_unlock(&callback_mutex); + update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); + } + + /* if cpus or mems went down, we need to propagate to descendants */ + if (cpus_offlined || mems_offlined) { + struct cpuset *cs; + LIST_HEAD(queue); + + list_add_tail(&top_cpuset.stack_list, &queue); + while ((cs = cpuset_next(&queue))) + if (cs != &top_cpuset) + cpuset_propagate_hotplug(cs); + } - ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); - /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); + /* rebuild sched domains if cpus_allowed has changed */ + if (cpus_updated) { + struct sched_domain_attr *attr; + cpumask_var_t *doms; + int ndoms; + + cgroup_lock(); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + partition_sched_domains(ndoms, doms, attr); + } +} + +void cpuset_update_active_cpus(bool cpu_online) +{ + cpuset_handle_hotplug(); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2180,29 +2189,7 @@ void cpuset_update_active_cpus(bool cpu_online) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - static nodemask_t oldmems; /* protected by cgroup_mutex */ - - cgroup_lock(); - switch (action) { - case MEM_ONLINE: - oldmems = top_cpuset.mems_allowed; - mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = node_states[N_MEMORY]; - mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, &oldmems, NULL); - break; - case MEM_OFFLINE: - /* - * needn't update top_cpuset.mems_allowed explicitly because - * scan_cpusets_upon_hotplug() will update it. - */ - scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); - break; - default: - break; - } - cgroup_unlock(); - + cpuset_handle_hotplug(); return NOTIFY_OK; } #endif -- cgit v1.2.3 From 3a5a6d0c2b0391e159fa5bf1dddb9bf1f35178a0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: don't nest cgroup_mutex inside get_online_cpus() CPU / memory hotplug path currently grabs cgroup_mutex from hotplug event notifications. We want to separate cpuset locking from cgroup core and make cgroup_mutex outer to hotplug synchronization so that, among other things, mechanisms which depend on get_online_cpus() can be used from cgroup callbacks. In general, we want to keep cgroup_mutex the outermost lock to minimize locking interactions among different controllers. Convert cpuset_handle_hotplug() to cpuset_hotplug_workfn() and schedule it from the hotplug notifications. As the function can already handle multiple mixed events without any input, converting it to a work function is mostly trivial; however, one complication is that cpuset_update_active_cpus() needs to update sched domains synchronously to reflect an offlined cpu to avoid confusing the scheduler. This is worked around by falling back to the the default single sched domain synchronously before scheduling the actual hotplug work. This makes sched domain rebuilt twice per CPU hotplug event but the operation isn't that heavy and a lot of the second operation would be noop for systems w/ single sched domain, which is the common case. This decouples cpuset hotplug handling from the notification callbacks and there can be an arbitrary delay between the actual event and updates to cpusets. Scheduler and mm can handle it fine but moving tasks out of an empty cpuset may race against writes to the cpuset restoring execution resources which can lead to confusing behavior. Flush hotplug work item from cpuset_write_resmask() to avoid such confusions. v2: Synchronous sched domain rebuilding using the fallback sched domain added. This fixes various issues caused by confused scheduler putting tasks on a dead CPU, including the one reported by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d448e646a4a..658eb1a32084 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -259,6 +259,13 @@ static char cpuset_name[CPUSET_NAME_LEN]; static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); +/* + * CPU / memory hotplug is handled asynchronously. + */ +static void cpuset_hotplug_workfn(struct work_struct *work); + +static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); + /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we @@ -1565,6 +1572,19 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *trialcs; + /* + * CPU or memory hotunplug may leave @cs w/o any execution + * resources, in which case the hotplug code asynchronously updates + * configuration and transfers all tasks to the nearest ancestor + * which can execute. + * + * As writes to "cpus" or "mems" may restore @cs's execution + * resources, wait for the previously scheduled operations before + * proceeding, so that we don't end up keep removing tasks added + * after execution capability is restored. + */ + flush_work(&cpuset_hotplug_work); + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2095,7 +2115,7 @@ static void cpuset_propagate_hotplug(struct cpuset *cs) } /** - * cpuset_handle_hotplug - handle CPU/memory hot[un]plug + * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2110,7 +2130,7 @@ static void cpuset_propagate_hotplug(struct cpuset *cs) * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_handle_hotplug(void) +static void cpuset_hotplug_workfn(struct work_struct *work) { static cpumask_t new_cpus, tmp_cpus; static nodemask_t new_mems, tmp_mems; @@ -2177,7 +2197,18 @@ static void cpuset_handle_hotplug(void) void cpuset_update_active_cpus(bool cpu_online) { - cpuset_handle_hotplug(); + /* + * We're inside cpu hotplug critical region which usually nests + * inside cgroup synchronization. Bounce actual hotplug processing + * to a work item to avoid reverse locking order. + * + * We still need to do partition_sched_domains() synchronously; + * otherwise, the scheduler will get confused and put tasks to the + * dead CPU. Fall back to the default single domain. + * cpuset_hotplug_workfn() will rebuild it as necessary. + */ + partition_sched_domains(1, NULL, NULL); + schedule_work(&cpuset_hotplug_work); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2189,7 +2220,7 @@ void cpuset_update_active_cpus(bool cpu_online) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - cpuset_handle_hotplug(); + schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } #endif -- cgit v1.2.3 From 699140ba838dd3fa2c5cce474e14f194b09f91aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: drop async_rebuild_sched_domains() In general, we want to make cgroup_mutex one of the outermost locks and be able to use get_online_cpus() and friends from cgroup methods. With cpuset hotplug made async, get_online_cpus() can now be nested inside cgroup_mutex. Currently, cpuset avoids nesting get_online_cpus() inside cgroup_mutex by bouncing sched_domain rebuilding to a work item. As such nesting is allowed now, remove the workqueue bouncing code and always rebuild sched_domains synchronously. This also nests sched_domains_mutex inside cgroup_mutex, which is intended and should be okay. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 76 ++++++++++++--------------------------------------------- 1 file changed, 16 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 658eb1a32084..74e412f908db 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -60,14 +60,6 @@ #include #include -/* - * Workqueue for cpuset related tasks. - * - * Using kevent workqueue may cause deadlock when memory_migrate - * is set. So we create a separate workqueue thread for cpuset. - */ -static struct workqueue_struct *cpuset_wq; - /* * Tracks how many cpusets are currently defined in system. * When there is only one cpuset (the root cpuset) we can @@ -753,25 +745,25 @@ done: /* * Rebuild scheduler domains. * - * Call with neither cgroup_mutex held nor within get_online_cpus(). - * Takes both cgroup_mutex and get_online_cpus(). + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. * - * Cannot be directly called from cpuset code handling changes - * to the cpuset pseudo-filesystem, because it cannot be called - * from code that already holds cgroup_mutex. + * Call with cgroup_mutex held. Takes get_online_cpus(). */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; + WARN_ON_ONCE(!cgroup_lock_is_held()); get_online_cpus(); /* Generate domain masks and attrs */ - cgroup_lock(); ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); @@ -779,7 +771,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) put_online_cpus(); } #else /* !CONFIG_SMP */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void) { } @@ -791,44 +783,11 @@ static int generate_sched_domains(cpumask_var_t **domains, } #endif /* CONFIG_SMP */ -static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); - -/* - * Rebuild scheduler domains, asynchronously via workqueue. - * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. - * - * The rebuild_sched_domains() and partition_sched_domains() - * routines must nest cgroup_lock() inside get_online_cpus(), - * but such cpuset changes as these must nest that locking the - * other way, holding cgroup_lock() for much of the code. - * - * So in order to avoid an ABBA deadlock, the cpuset code handling - * these user changes delegates the actual sched domain rebuilding - * to a separate workqueue thread, which ends up processing the - * above do_rebuild_sched_domains() function. - */ -static void async_rebuild_sched_domains(void) -{ - queue_work(cpuset_wq, &rebuild_sched_domains_work); -} - -/* - * Accomplishes the same scheduler domain rebuild as the above - * async_rebuild_sched_domains(), however it directly calls the - * rebuild routine synchronously rather than calling it via an - * asynchronous work thread. - * - * This can only be called from code that is not holding - * cgroup_mutex (not nested in a cgroup_lock() call.) - */ void rebuild_sched_domains(void) { - do_rebuild_sched_domains(NULL); + cgroup_lock(); + rebuild_sched_domains_locked(); + cgroup_unlock(); } /** @@ -948,7 +907,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, heap_free(&heap); if (is_load_balanced) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); return 0; } @@ -1196,7 +1155,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); } return 0; @@ -1288,7 +1247,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); if (spread_flag_changed) update_tasks_flags(cs, &heap); @@ -1925,7 +1884,7 @@ static void cpuset_css_offline(struct cgroup *cgrp) /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call async_rebuild_sched_domains(). + * will call rebuild_sched_domains_locked(). */ static void cpuset_css_free(struct cgroup *cont) @@ -2237,9 +2196,6 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; hotplug_memory_notifier(cpuset_track_online_nodes, 10); - - cpuset_wq = create_singlethread_workqueue("cpuset"); - BUG_ON(!cpuset_wq); } /** -- cgit v1.2.3 From 8d03394877ecdf87e1d694664c460747b8e05aa1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: make CPU / memory hotplug propagation asynchronous cpuset_hotplug_workfn() has been invoking cpuset_propagate_hotplug() directly to propagate hotplug updates to !root cpusets; however, this has the following problems. * cpuset locking is scheduled to be decoupled from cgroup_mutex, cgroup_mutex will be unexported, and cgroup_attach_task() will do cgroup locking internally, so propagation can't synchronously move tasks to a parent cgroup while walking the hierarchy. * We can't use cgroup generic tree iterator because propagation to each cpuset may sleep. With propagation done asynchronously, we can lose the rather ugly cpuset specific iteration. Convert cpuset_propagate_hotplug() to cpuset_propagate_hotplug_workfn() and execute it from newly added cpuset->hotplug_work. The work items are run on an ordered workqueue, so the propagation order is preserved. cpuset_hotplug_workfn() schedules all propagations while holding cgroup_mutex and waits for completion without cgroup_mutex. Each in-flight propagation holds a reference to the cpuset->css. This patch doesn't cause any functional difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 74e412f908db..a7bb547786d7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -99,6 +99,8 @@ struct cpuset { /* used for walking a cpuset hierarchy */ struct list_head stack_list; + + struct work_struct hotplug_work; }; /* Retrieve the cpuset for a cgroup */ @@ -254,7 +256,10 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock); /* * CPU / memory hotplug is handled asynchronously. */ +static struct workqueue_struct *cpuset_propagate_hotplug_wq; + static void cpuset_hotplug_workfn(struct work_struct *work); +static void cpuset_propagate_hotplug_workfn(struct work_struct *work); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); @@ -1808,6 +1813,7 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); + INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; cs->parent = cgroup_cs(cont->parent); @@ -2033,21 +2039,20 @@ static struct cpuset *cpuset_next(struct list_head *queue) } /** - * cpuset_propagate_hotplug - propagate CPU/memory hotplug to a cpuset + * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset * @cs: cpuset in interest * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. - * - * Should be called with cgroup_mutex held. */ -static void cpuset_propagate_hotplug(struct cpuset *cs) +static void cpuset_propagate_hotplug_workfn(struct work_struct *work) { static cpumask_t off_cpus; static nodemask_t off_mems, tmp_mems; + struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); - WARN_ON_ONCE(!cgroup_lock_is_held()); + cgroup_lock(); cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); @@ -2071,6 +2076,36 @@ static void cpuset_propagate_hotplug(struct cpuset *cs) if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) remove_tasks_in_empty_cpuset(cs); + + cgroup_unlock(); + + /* the following may free @cs, should be the last operation */ + css_put(&cs->css); +} + +/** + * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset + * @cs: cpuset of interest + * + * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and + * memory masks according to top_cpuset. + */ +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) +{ + /* + * Pin @cs. The refcnt will be released when the work item + * finishes executing. + */ + if (!css_tryget(&cs->css)) + return; + + /* + * Queue @cs->hotplug_work. If already pending, lose the css ref. + * cpuset_propagate_hotplug_wq is ordered and propagation will + * happen in the order this function is called. + */ + if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) + css_put(&cs->css); } /** @@ -2135,11 +2170,14 @@ static void cpuset_hotplug_workfn(struct work_struct *work) list_add_tail(&top_cpuset.stack_list, &queue); while ((cs = cpuset_next(&queue))) if (cs != &top_cpuset) - cpuset_propagate_hotplug(cs); + schedule_cpuset_propagate_hotplug(cs); } cgroup_unlock(); + /* wait for propagations to finish */ + flush_workqueue(cpuset_propagate_hotplug_wq); + /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated) { struct sched_domain_attr *attr; @@ -2196,6 +2234,10 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; hotplug_memory_notifier(cpuset_track_online_nodes, 10); + + cpuset_propagate_hotplug_wq = + alloc_ordered_workqueue("cpuset_hotplug", 0); + BUG_ON(!cpuset_propagate_hotplug_wq); } /** -- cgit v1.2.3 From 452477fa68c6d8ef80adebd05194c1c157ad9a53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:07 -0800 Subject: cpuset: pin down cpus and mems while a task is being attached cpuset is scheduled to be decoupled from cgroup_lock which will make configuration updates race with task migration. Any config update will be allowed to happen between ->can_attach() and ->attach(). If such config update removes either all cpus or mems, by the time ->attach() is called, the condition verified by ->can_attach(), that the cpuset is capable of hosting the tasks, is no longer true. This patch adds cpuset->attach_in_progress which is incremented from ->can_attach() and decremented when the attach operation finishes either successfully or not. validate_change() treats cpusets w/ non-zero ->attach_in_progress like cpusets w/ tasks and refuses to remove all cpus or mems from it. This currently doesn't make any functional difference as everything is protected by cgroup_mutex but enables decoupling the locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a7bb547786d7..4334576f5d6a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -91,6 +91,12 @@ struct cpuset { struct fmeter fmeter; /* memory_pressure filter */ + /* + * Tasks are being attached to this cpuset. Used to prevent + * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). + */ + int attach_in_progress; + /* partition number for rebuild_sched_domains() */ int pn; @@ -468,9 +474,12 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) goto out; } - /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ + /* + * Cpusets with tasks - existing or newly being attached - can't + * have empty cpus_allowed or mems_allowed. + */ ret = -ENOSPC; - if (cgroup_task_count(cur->css.cgroup) && + if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && (cpumask_empty(trial->cpus_allowed) || nodes_empty(trial->mems_allowed))) goto out; @@ -1386,9 +1395,21 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) return ret; } + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. + */ + cs->attach_in_progress++; + return 0; } +static void cpuset_cancel_attach(struct cgroup *cgrp, + struct cgroup_taskset *tset) +{ + cgroup_cs(cgrp)->attach_in_progress--; +} + /* * Protected by cgroup_mutex. cpus_attach is used only by cpuset_attach() * but we can't allocate it dynamically there. Define it global and @@ -1441,6 +1462,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) &cpuset_attach_nodemask_to); mmput(mm); } + + cs->attach_in_progress--; } /* The various types of files and directories in a cpuset file system */ @@ -1908,6 +1931,7 @@ struct cgroup_subsys cpuset_subsys = { .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, + .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .subsys_id = cpuset_subsys_id, .base_cftypes = files, -- cgit v1.2.3 From 02bb586372a71595203b3ff19a9be48eaa076f6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:08 -0800 Subject: cpuset: schedule hotplug propagation from cpuset_attach() if the cpuset is empty cpuset is scheduled to be decoupled from cgroup_lock which will make hotplug handling race with task migration. cpus or mems will be allowed to go offline between ->can_attach() and ->attach(). If hotplug takes down all cpus or mems of a cpuset while attach is in progress, ->attach() may end up putting tasks into an empty cpuset. This patchset makes ->attach() schedule hotplug propagation if the cpuset is empty after attaching is complete. This will move the tasks to the nearest ancestor which can execute and the end result would be as if hotplug handling happened after the tasks finished attaching. cpuset_write_resmask() now also flushes cpuset_propagate_hotplug_wq to wait for propagations scheduled directly by cpuset_attach(). This currently doesn't make any functional difference as everything is protected by cgroup_mutex but enables decoupling the locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4334576f5d6a..644281003f5d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -266,6 +266,7 @@ static struct workqueue_struct *cpuset_propagate_hotplug_wq; static void cpuset_hotplug_workfn(struct work_struct *work); static void cpuset_propagate_hotplug_workfn(struct work_struct *work); +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); @@ -1464,6 +1465,14 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) } cs->attach_in_progress--; + + /* + * We may have raced with CPU/memory hotunplug. Trigger hotplug + * propagation if @cs doesn't have any CPU or memory. It will move + * the newly added tasks to the nearest parent which can execute. + */ + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + schedule_cpuset_propagate_hotplug(cs); } /* The various types of files and directories in a cpuset file system */ @@ -1569,8 +1578,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. + * + * Flushing cpuset_hotplug_work is enough to synchronize against + * hotplug hanlding; however, cpuset_attach() may schedule + * propagation work directly. Flush the workqueue too. */ flush_work(&cpuset_hotplug_work); + flush_workqueue(cpuset_propagate_hotplug_wq); if (!cgroup_lock_live_group(cgrp)) return -ENODEV; -- cgit v1.2.3 From 5d21cc2db040d01f8c19b8602f6987813e1176b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:08 -0800 Subject: cpuset: replace cgroup_mutex locking with cpuset internal locking Supposedly for historical reasons, cpuset depends on cgroup core for locking. It depends on cgroup_mutex in cgroup callbacks and grabs cgroup_mutex from other places where it wants to be synchronized. This is majorly messy and highly prone to introducing circular locking dependency especially because cgroup_mutex is supposed to be one of the outermost locks. As previous patches already plugged possible races which may happen by decoupling from cgroup_mutex, replacing cgroup_mutex with cpuset specific cpuset_mutex is mostly straight-forward. Introduce cpuset_mutex, replace all occurrences of cgroup_mutex with it, and add cpuset_mutex locking to places which inherited cgroup_mutex from cgroup core. The only complication is from cpuset wanting to initiate task migration when a cpuset loses all cpus or memory nodes. Task migration may go through full cgroup and all subsystem locking and should be initiated without holding any cpuset specific lock; however, a previous patch already made hotplug handled asynchronously and moving the task migration part outside other locks is easy. cpuset_propagate_hotplug_workfn() now invokes remove_tasks_in_empty_cpuset() without holding any lock. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 188 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 107 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 644281003f5d..5e348ae37ce9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -208,23 +208,20 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) /* - * There are two global mutexes guarding cpuset structures. The first - * is the main control groups cgroup_mutex, accessed via - * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific - * callback_mutex, below. They can nest. It is ok to first take - * cgroup_mutex, then nest callback_mutex. We also require taking - * task_lock() when dereferencing a task's cpuset pointer. See "The - * task_lock() exception", at the end of this comment. - * - * A task must hold both mutexes to modify cpusets. If a task - * holds cgroup_mutex, then it blocks others wanting that mutex, - * ensuring that it is the only task able to also acquire callback_mutex - * and be able to modify cpusets. It can perform various checks on - * the cpuset structure first, knowing nothing will change. It can - * also allocate memory while just holding cgroup_mutex. While it is - * performing these checks, various callback routines can briefly - * acquire callback_mutex to query cpusets. Once it is ready to make - * the changes, it takes callback_mutex, blocking everyone else. + * There are two global mutexes guarding cpuset structures - cpuset_mutex + * and callback_mutex. The latter may nest inside the former. We also + * require taking task_lock() when dereferencing a task's cpuset pointer. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold both mutexes to modify cpusets. If a task holds + * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * is the only task able to also acquire callback_mutex and be able to + * modify cpusets. It can perform various checks on the cpuset structure + * first, knowing nothing will change. It can also allocate memory while + * just holding cpuset_mutex. While it is performing these checks, various + * callback routines can briefly acquire callback_mutex to query cpusets. + * Once it is ready to make the changes, it takes callback_mutex, blocking + * everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_mutex, as that would risk double tripping on callback_mutex @@ -246,6 +243,7 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ +static DEFINE_MUTEX(cpuset_mutex); static DEFINE_MUTEX(callback_mutex); /* @@ -351,7 +349,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Called with callback_mutex/cgroup_mutex held + * Called with callback_mutex/cpuset_mutex held */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -371,7 +369,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cgroup_mutex. + * are only set if the other's are set. Call holding cpuset_mutex. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -420,7 +418,7 @@ static void free_trial_cpuset(struct cpuset *trial) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cgroup_mutex held. + * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -555,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cgroup_lock held. + * Must be called with cpuset_mutex held. * * The three key local variables below are: * q - a linked-list queue of cpuset pointers, used to implement a @@ -766,7 +764,7 @@ done: * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cgroup_mutex held. Takes get_online_cpus(). + * Call with cpuset_mutex held. Takes get_online_cpus(). */ static void rebuild_sched_domains_locked(void) { @@ -774,7 +772,7 @@ static void rebuild_sched_domains_locked(void) cpumask_var_t *doms; int ndoms; - WARN_ON_ONCE(!cgroup_lock_is_held()); + lockdep_assert_held(&cpuset_mutex); get_online_cpus(); /* Generate domain masks and attrs */ @@ -800,9 +798,9 @@ static int generate_sched_domains(cpumask_var_t **domains, void rebuild_sched_domains(void) { - cgroup_lock(); + mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); } /** @@ -810,7 +808,7 @@ void rebuild_sched_domains(void) * @tsk: task to test * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner * - * Call with cgroup_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_mutex during call. * Called for each task in a cgroup by cgroup_scan_tasks(). * Return nonzero if this tasks's cpus_allowed mask should be changed (in other * words, if its mask is not equal to its cpuset's mask). @@ -831,7 +829,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, * cpus_allowed mask needs to be changed. * * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point. */ static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) @@ -844,7 +842,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -934,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cgroup_mutex, so current's cpuset won't change + * Call holding cpuset_mutex, so current's cpuset won't change * during this call, as manage_mutex holds off any cpuset_attach() * calls. Therefore we don't need to take task_lock around the * call to guarantee_online_mems(), as we know no one is changing @@ -1009,7 +1007,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, /* * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cgroup_mutex held. + * memory_migrate flag is set. Called with cpuset_mutex held. */ static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) @@ -1018,7 +1016,7 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - static nodemask_t newmems; /* protected by cgroup_mutex */ + static nodemask_t newmems; /* protected by cpuset_mutex */ cs = cgroup_cs(scan->cg); guarantee_online_mems(cs, &newmems); @@ -1045,7 +1043,7 @@ static void *cpuset_being_rebound; * @oldmem: old mems_allowed of cpuset cs * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 * if @heap != NULL. */ @@ -1067,7 +1065,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cgroup_mutex, we know that no other rebind effort + * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1086,7 +1084,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cgroup_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1184,7 +1182,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * Called by cgroup_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point. */ static void cpuset_change_flag(struct task_struct *tsk, struct cgroup_scanner *scan) @@ -1197,7 +1195,7 @@ static void cpuset_change_flag(struct task_struct *tsk, * @cs: the cpuset in which each task's spread flags needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -1222,7 +1220,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cgroup_mutex held. + * Call with cpuset_mutex held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1370,15 +1368,18 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; int ret; + mutex_lock(&cpuset_mutex); + + ret = -ENOSPC; if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - return -ENOSPC; + goto out_unlock; cgroup_taskset_for_each(task, cgrp, tset) { /* @@ -1390,10 +1391,12 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * set_cpus_allowed_ptr() on all attached tasks before * cpus_allowed may be changed. */ + ret = -EINVAL; if (task->flags & PF_THREAD_BOUND) - return -EINVAL; - if ((ret = security_task_setscheduler(task))) - return ret; + goto out_unlock; + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; } /* @@ -1401,18 +1404,22 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; - - return 0; + ret = 0; +out_unlock: + mutex_unlock(&cpuset_mutex); + return ret; } static void cpuset_cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { + mutex_lock(&cpuset_mutex); cgroup_cs(cgrp)->attach_in_progress--; + mutex_unlock(&cpuset_mutex); } /* - * Protected by cgroup_mutex. cpus_attach is used only by cpuset_attach() + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ @@ -1420,7 +1427,7 @@ static cpumask_var_t cpus_attach; static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { - /* static bufs protected by cgroup_mutex */ + /* static bufs protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_from; static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; @@ -1430,6 +1437,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + mutex_lock(&cpuset_mutex); + /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1473,6 +1482,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) */ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) schedule_cpuset_propagate_hotplug(cs); + + mutex_unlock(&cpuset_mutex); } /* The various types of files and directories in a cpuset file system */ @@ -1494,12 +1505,13 @@ typedef enum { static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1533,18 +1545,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) retval = -EINVAL; break; } - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: @@ -1554,7 +1568,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) retval = -EINVAL; break; } - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } @@ -1564,9 +1579,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, const char *buf) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *trialcs; + int retval = -ENODEV; /* * CPU or memory hotunplug may leave @cs w/o any execution @@ -1586,13 +1601,14 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, flush_work(&cpuset_hotplug_work); flush_workqueue(cpuset_propagate_hotplug_wq); - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; trialcs = alloc_trial_cpuset(cs); if (!trialcs) { retval = -ENOMEM; - goto out; + goto out_unlock; } switch (cft->private) { @@ -1608,8 +1624,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, } free_trial_cpuset(trialcs); -out: - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } @@ -1867,6 +1883,8 @@ static int cpuset_css_online(struct cgroup *cgrp) if (!parent) return 0; + mutex_lock(&cpuset_mutex); + set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); @@ -1876,7 +1894,7 @@ static int cpuset_css_online(struct cgroup *cgrp) number_of_cpusets++; if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) - return 0; + goto out_unlock; /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is @@ -1895,7 +1913,7 @@ static int cpuset_css_online(struct cgroup *cgrp) cpuset_for_each_child(tmp_cs, pos_cg, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); - return 0; + goto out_unlock; } } rcu_read_unlock(); @@ -1904,7 +1922,8 @@ static int cpuset_css_online(struct cgroup *cgrp) cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); mutex_unlock(&callback_mutex); - +out_unlock: + mutex_unlock(&cpuset_mutex); return 0; } @@ -1912,8 +1931,7 @@ static void cpuset_css_offline(struct cgroup *cgrp) { struct cpuset *cs = cgroup_cs(cgrp); - /* css_offline is called w/o cgroup_mutex, grab it */ - cgroup_lock(); + mutex_lock(&cpuset_mutex); if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -1921,7 +1939,7 @@ static void cpuset_css_offline(struct cgroup *cgrp) number_of_cpusets--; clear_bit(CS_ONLINE, &cs->flags); - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); } /* @@ -1996,7 +2014,9 @@ static void cpuset_do_move_task(struct task_struct *tsk, { struct cgroup *new_cgroup = scan->data; + cgroup_lock(); cgroup_attach_task(new_cgroup, tsk); + cgroup_unlock(); } /** @@ -2004,7 +2024,7 @@ static void cpuset_do_move_task(struct task_struct *tsk, * @from: cpuset in which the tasks currently reside * @to: cpuset to which the tasks will be moved * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * callback_mutex must not be held, as cpuset_attach() will take it. * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, @@ -2031,9 +2051,6 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty * cpuset to its next-highest non-empty parent. - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. */ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) { @@ -2089,8 +2106,9 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work) static cpumask_t off_cpus; static nodemask_t off_mems, tmp_mems; struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); + bool is_empty; - cgroup_lock(); + mutex_lock(&cpuset_mutex); cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); @@ -2112,10 +2130,18 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(cs, &tmp_mems, NULL); } - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - remove_tasks_in_empty_cpuset(cs); + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); + + /* + * If @cs became empty, move tasks to the nearest ancestor with + * execution resources. This is full cgroup operation which will + * also call back into cpuset. Should be done outside any lock. + */ + if (is_empty) + remove_tasks_in_empty_cpuset(cs); /* the following may free @cs, should be the last operation */ css_put(&cs->css); @@ -2169,7 +2195,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) bool cpus_updated, mems_updated; bool cpus_offlined, mems_offlined; - cgroup_lock(); + mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); @@ -2211,7 +2237,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) schedule_cpuset_propagate_hotplug(cs); } - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); /* wait for propagations to finish */ flush_workqueue(cpuset_propagate_hotplug_wq); @@ -2222,9 +2248,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpumask_var_t *doms; int ndoms; - cgroup_lock(); + mutex_lock(&cpuset_mutex); ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); partition_sched_domains(ndoms, doms, attr); } @@ -2650,7 +2676,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc//cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cgroup_mutex, keeping cpuset_attach() from changing it + * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *unused_v) @@ -2673,7 +2699,7 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; retval = -EINVAL; - cgroup_lock(); + mutex_lock(&cpuset_mutex); css = task_subsys_state(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); if (retval < 0) @@ -2681,7 +2707,7 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) seq_puts(m, buf); seq_putc(m, '\n'); out_unlock: - cgroup_unlock(); + mutex_unlock(&cpuset_mutex); put_task_struct(tsk); out_free: kfree(buf); -- cgit v1.2.3 From fc560a26accea9567b7f8f41eefa01e1bb127d74 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:08 -0800 Subject: cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre() Implement cpuset_for_each_descendant_pre() and replace the cpuset-specific tree walking using cpuset->stack_list with it. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Acked-by: Li Zefan --- kernel/cpuset.c | 123 ++++++++++++++++++++++---------------------------------- 1 file changed, 48 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5e348ae37ce9..0ddb48d40e4d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -103,9 +103,6 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* used for walking a cpuset hierarchy */ - struct list_head stack_list; - struct work_struct hotplug_work; }; @@ -207,6 +204,20 @@ static struct cpuset top_cpuset = { cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_cgrp: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs. Must be used + * with RCU read locked. The caller may modify @pos_cgrp by calling + * cgroup_rightmost_descendant() to skip subtree. + */ +#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ + cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ + if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) + /* * There are two global mutexes guarding cpuset structures - cpuset_mutex * and callback_mutex. The latter may nest inside the former. We also @@ -507,31 +518,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) return; } -static void -update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +static void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) { - LIST_HEAD(q); - - list_add(&c->stack_list, &q); - while (!list_empty(&q)) { - struct cpuset *cp; - struct cgroup *cont; - struct cpuset *child; - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); + struct cpuset *cp; + struct cgroup *pos_cgrp; - if (cpumask_empty(cp->cpus_allowed)) + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); continue; + } if (is_sched_load_balance(cp)) update_domain_attr(dattr, cp); - - rcu_read_lock(); - cpuset_for_each_child(child, cont, cp) - list_add_tail(&child->stack_list, &q); - rcu_read_unlock(); } + rcu_read_unlock(); } /* @@ -591,7 +595,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -600,6 +603,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup *pos_cgrp; doms = NULL; dattr = NULL; @@ -627,33 +631,27 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; csn = 0; - list_add(&top_cpuset.stack_list, &q); - while (!list_empty(&q)) { - struct cgroup *cont; - struct cpuset *child; /* scans child cpusets of cp */ - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); - - if (cpumask_empty(cp->cpus_allowed)) - continue; - + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { /* - * All child cpusets contain a subset of the parent's cpus, so - * just skip them, and then we call update_domain_attr_tree() - * to calc relax_domain_level of the corresponding sched - * domain. + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. */ - if (is_sched_load_balance(cp)) { - csa[csn++] = cp; + if (!cpumask_empty(cp->cpus_allowed) && + !is_sched_load_balance(cp)) continue; - } - rcu_read_lock(); - cpuset_for_each_child(child, cont, cp) - list_add_tail(&child->stack_list, &q); - rcu_read_unlock(); - } + if (is_sched_load_balance(cp)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + } + rcu_read_unlock(); for (i = 0; i < csn; i++) csa[i]->pn = i; @@ -2068,31 +2066,6 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) move_member_tasks_to_cpuset(cs, parent); } -/* - * Helper function to traverse cpusets. - * It can be used to walk the cpuset tree from top to bottom, completing - * one layer before dropping down to the next (thus always processing a - * node before any of its children). - */ -static struct cpuset *cpuset_next(struct list_head *queue) -{ - struct cpuset *cp; - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; - - if (list_empty(queue)) - return NULL; - - cp = list_first_entry(queue, struct cpuset, stack_list); - list_del(queue->next); - rcu_read_lock(); - cpuset_for_each_child(child, cont, cp) - list_add_tail(&child->stack_list, queue); - rcu_read_unlock(); - - return cp; -} - /** * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset * @cs: cpuset in interest @@ -2229,12 +2202,12 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* if cpus or mems went down, we need to propagate to descendants */ if (cpus_offlined || mems_offlined) { struct cpuset *cs; - LIST_HEAD(queue); + struct cgroup *pos_cgrp; - list_add_tail(&top_cpuset.stack_list, &queue); - while ((cs = cpuset_next(&queue))) - if (cs != &top_cpuset) - schedule_cpuset_propagate_hotplug(cs); + rcu_read_lock(); + cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) + schedule_cpuset_propagate_hotplug(cs); + rcu_read_unlock(); } mutex_unlock(&cpuset_mutex); -- cgit v1.2.3 From c431069fe4bacc0cd3ca94a8453987140a5b3517 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Jan 2013 08:51:08 -0800 Subject: cpuset: remove cpuset->parent cgroup already tracks the hierarchy. Follow cgroup->parent to find the parent and drop cpuset->parent. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Acked-by: Li Zefan --- kernel/cpuset.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0ddb48d40e4d..6aa5bbb5f33b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -87,8 +87,6 @@ struct cpuset { cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ - struct cpuset *parent; /* my parent */ - struct fmeter fmeter; /* memory_pressure filter */ /* @@ -120,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } +static inline struct cpuset *parent_cs(const struct cpuset *cs) +{ + struct cgroup *pcgrp = cs->css.cgroup->parent; + + if (pcgrp) + return cgroup_cs(pcgrp); + return NULL; +} + #ifdef CONFIG_NUMA static inline bool task_has_mempolicy(struct task_struct *task) { @@ -323,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, struct cpumask *pmask) { while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) - cs = cs->parent; + cs = parent_cs(cs); if (cs) cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); else @@ -348,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) { while (cs && !nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) - cs = cs->parent; + cs = parent_cs(cs); if (cs) nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); @@ -461,7 +468,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) if (cur == &top_cpuset) goto out; - par = cur->parent; + par = parent_cs(cur); /* We must be a subset of our parent cpuset */ ret = -EACCES; @@ -1866,7 +1873,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) fmeter_init(&cs->fmeter); INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; - cs->parent = cgroup_cs(cont->parent); return &cs->css; } @@ -1874,7 +1880,7 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) static int cpuset_css_online(struct cgroup *cgrp) { struct cpuset *cs = cgroup_cs(cgrp); - struct cpuset *parent = cs->parent; + struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; struct cgroup *pos_cg; @@ -2058,10 +2064,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * Find its next-highest non-empty parent, (top cpuset * has online cpus, so can't be empty). */ - parent = cs->parent; + parent = parent_cs(cs); while (cpumask_empty(parent->cpus_allowed) || nodes_empty(parent->mems_allowed)) - parent = parent->parent; + parent = parent_cs(parent); move_member_tasks_to_cpuset(cs, parent); } @@ -2373,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) */ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) { - while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) - cs = cs->parent; + while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) + cs = parent_cs(cs); return cs; } -- cgit v1.2.3 From dc975e94f322e60fa8fcc44dec1820fde4de174c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Nov 2012 11:27:26 -0800 Subject: tracing: Export trace_clock_local() The rcutorture tests need to be able to trace the time of the beginning of an RCU read-side critical section, and thus need access to trace_clock_local(). This commit therefore adds a the needed EXPORT_SYMBOL_GPL(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/trace/trace_clock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 394783531cbb..1bbb1b200cec 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -44,6 +44,7 @@ u64 notrace trace_clock_local(void) return clock; } +EXPORT_SYMBOL_GPL(trace_clock_local); /* * trace_clock(): 'between' trace clock. Not completely serialized, -- cgit v1.2.3 From 52494535103986dbbf689b44d8c2c7efe2132b16 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Nov 2012 16:26:40 -0800 Subject: rcu: Reduce rcutorture tracing Currently, rcutorture traces every read-side access. This can be problematic because even a two-minute rcutorture run on a two-CPU system can generate 28,853,363 reads. Normally, only a failing read is of interest, so this commit traces adjusts rcutorture's tracing to only trace failing reads. The resulting event tracing records the time and the ->completed value captured at the beginning of the RCU read-side critical section, allowing correlation with other event-tracing messages. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett [ paulmck: Add fix to build problem located by Randy Dunlap based on diagnosis by Steven Rostedt. ] --- kernel/rcupdate.c | 9 ++++++--- kernel/rcutorture.c | 31 ++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a2cf76177b44..303359d1ca88 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -404,11 +404,14 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) +void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, unsigned long c) { - trace_rcu_torture_read(rcutorturename, rhp); + trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c); } EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #else -#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ + do { } while (0) #endif diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 31dea01c85fd..a583f1ce713d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -46,6 +46,7 @@ #include #include #include +#include #include MODULE_LICENSE("GPL"); @@ -1028,7 +1029,6 @@ void rcutorture_trace_dump(void) return; if (atomic_xchg(&beenhere, 1) != 0) return; - do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); ftrace_dump(DUMP_ALL); } @@ -1042,13 +1042,16 @@ static void rcu_torture_timer(unsigned long unused) { int idx; int completed; + int completed_end; static DEFINE_RCU_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; int pipe_count; + unsigned long long ts; idx = cur_ops->readlock(); completed = cur_ops->completed(); + ts = trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || @@ -1058,7 +1061,6 @@ static void rcu_torture_timer(unsigned long unused) cur_ops->readunlock(idx); return; } - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); @@ -1071,10 +1073,16 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - if (pipe_count > 1) + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, + completed, completed_end); rcutorture_trace_dump(); + } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = cur_ops->completed() - completed; + completed = completed_end - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1094,11 +1102,13 @@ static int rcu_torture_reader(void *arg) { int completed; + int completed_end; int idx; DEFINE_RCU_RANDOM(rand); struct rcu_torture *p; int pipe_count; struct timer_list t; + unsigned long long ts; VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); @@ -1112,6 +1122,7 @@ rcu_torture_reader(void *arg) } idx = cur_ops->readlock(); completed = cur_ops->completed(); + ts = trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || @@ -1122,7 +1133,6 @@ rcu_torture_reader(void *arg) schedule_timeout_interruptible(HZ); continue; } - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); cur_ops->read_delay(&rand); @@ -1132,10 +1142,17 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - if (pipe_count > 1) + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + unsigned long __maybe_unused ts_rem = + do_div(ts, NSEC_PER_USEC); + + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, completed, completed_end); rcutorture_trace_dump(); + } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = cur_ops->completed() - completed; + completed = completed_end - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; -- cgit v1.2.3 From 62e3cb143fd78a2ee6f44ef0dfe50cdff2119d9a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 20 Nov 2012 09:55:26 -0800 Subject: rcu: Make rcu_is_cpu_rrupt_from_idle helper functions static Both rcutiny and rcutree define a helper function named rcu_is_cpu_rrupt_from_idle(), each used exactly once, later in the same file. This commit therefore declares these helper functions static. Signed-off-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 2 +- kernel/rcutree.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e7dce58f9c2a..9f72a0f9f85a 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); * interrupts don't count, we must be running at the first interrupt * level. */ -int rcu_is_cpu_rrupt_from_idle(void) +static int rcu_is_cpu_rrupt_from_idle(void) { return rcu_dynticks_nesting <= 1; } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e441b77b614e..cceda7602dfd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -727,7 +727,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); * interrupt from idle, return true. The caller must have at least * disabled preemption. */ -int rcu_is_cpu_rrupt_from_idle(void) +static int rcu_is_cpu_rrupt_from_idle(void) { return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } -- cgit v1.2.3 From 1bdc2b7d243dc8b9aadfc8002a69cf911e9e3e72 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Tue, 27 Nov 2012 13:58:27 +0800 Subject: rcu: Use new nesting value for rcu_dyntick trace in rcu_eqs_enter_common This patch uses the real new value of dynticks_nesting instead of 0 in rcu_eqs_enter_common(). Signed-off-by: Li Zhong Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index cceda7602dfd..d145796bd61f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -336,7 +336,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { - trace_rcu_dyntick("Start", oldval, 0); + trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); -- cgit v1.2.3 From 4930521ae10fd28ebc713107fd243c8044a95415 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Nov 2012 13:49:00 -0800 Subject: rcu: Silence compiler array out-of-bounds false positive It turns out that gcc 4.8 warns on array indexes being out of bounds unless it can prove otherwise. It gives this warning on some RCU initialization code. Because this is far from any fastpath, add an explicit check for array bounds and panic if so. This gives the compiler enough information to figure out that the array index is never out of bounds. However, if a similar false positive occurs on a fastpath, it will probably be necessary to tell the compiler to keep its array-index anxieties to itself. ;-) Markus Trippelsdorf Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d145796bd61f..e0d98157fbea 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2938,6 +2938,10 @@ static void __init rcu_init_one(struct rcu_state *rsp, BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + /* Silence gcc 4.8 warning about array index out of range. */ + if (rcu_num_lvls > RCU_NUM_LVLS) + panic("rcu_init_one: rcu_num_lvls overflow"); + /* Initialize the level-tracking arrays. */ for (i = 0; i < rcu_num_lvls; i++) -- cgit v1.2.3 From de5e64378e6be3757a8533f55bbe1153349806dd Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 20 Dec 2012 14:11:28 -0500 Subject: rcutorture: Don't compare ptr with 0 Signed-off-by: Sasha Levin Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 31dea01c85fd..0249800c6112 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1749,7 +1749,7 @@ static int rcu_torture_barrier_init(void) barrier_cbs_wq = kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), GFP_KERNEL); - if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) + if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); -- cgit v1.2.3 From dc35c8934eba959b690921615fcd987e8bc17e4a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Dec 2012 13:52:00 -0800 Subject: rcu: Tag callback lists with corresponding grace-period number Currently, callbacks are advanced each time the corresponding CPU notices a change in its leaf rcu_node structure's ->completed value (this value counts grace-period completions). This approach has worked quite well, but with the advent of RCU_FAST_NO_HZ, we cannot count on a given CPU seeing all the grace-period completions. When a CPU misses a grace-period completion that occurs while it is in dyntick-idle mode, this will delay invocation of its callbacks. In addition, acceleration of callbacks (when RCU realizes that a given callback need only wait until the end of the next grace period, rather than having to wait for a partial grace period followed by a full grace period) must be carried out extremely carefully. Insufficient acceleration will result in unnecessarily long grace-period latencies, while excessive acceleration will result in premature callback invocation. Changes that involve this tradeoff are therefore among the most nerve-wracking changes to RCU. This commit therefore explicitly tags groups of callbacks with the number of the grace period that they are waiting for. This means that callback-advancement and callback-acceleration functions are idempotent, so that excessive acceleration will merely waste a few CPU cycles. This also allows a CPU to take full advantage of any grace periods that have elapsed while it has been in dyntick-idle mode. It should also enable simulataneous simplifications to and optimizations of RCU_FAST_NO_HZ. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++-------- kernel/rcutree.h | 2 + 2 files changed, 169 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e441b77b614e..ac6a75d152ef 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -305,17 +305,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) } /* - * Does the current CPU require a yet-as-unscheduled grace period? + * Does the current CPU require a not-yet-started grace period? + * The caller must have disabled interrupts to prevent races with + * normal callback registry. */ static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - struct rcu_head **ntp; + int i; - ntp = rdp->nxttail[RCU_DONE_TAIL + - (ACCESS_ONCE(rsp->completed) != rdp->completed)]; - return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && - !rcu_gp_in_progress(rsp); + if (rcu_gp_in_progress(rsp)) + return 0; /* No, a grace period is already in progress. */ + if (!rdp->nxttail[RCU_NEXT_TAIL]) + return 0; /* No, this is a no-CBs (or offline) CPU. */ + if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) + return 1; /* Yes, this CPU has newly registered callbacks. */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rdp->nxttail[i - 1] != rdp->nxttail[i] && + ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), + rdp->nxtcompleted[i])) + return 1; /* Yes, CBs for future grace period. */ + return 0; /* No grace period needed. */ } /* @@ -1070,6 +1080,139 @@ static void init_callback_list(struct rcu_data *rdp) init_nocb_callback_list(rdp); } +/* + * Determine the value that ->completed will have at the end of the + * next subsequent grace period. This is used to tag callbacks so that + * a CPU can invoke callbacks in a timely fashion even if that CPU has + * been dyntick-idle for an extended period with callbacks under the + * influence of RCU_FAST_NO_HZ. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static unsigned long rcu_cbs_completed(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + /* + * If RCU is idle, we just wait for the next grace period. + * But we can only be sure that RCU is idle if we are looking + * at the root rcu_node structure -- otherwise, a new grace + * period might have started, but just not yet gotten around + * to initializing the current non-root rcu_node structure. + */ + if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) + return rnp->completed + 1; + + /* + * Otherwise, wait for a possible partial grace period and + * then the subsequent full grace period. + */ + return rnp->completed + 2; +} + +/* + * If there is room, assign a ->completed number to any callbacks on + * this CPU that have not already been assigned. Also accelerate any + * callbacks that were previously assigned a ->completed number that has + * since proven to be too conservative, which can happen if callbacks get + * assigned a ->completed number while RCU is idle, but with reference to + * a non-root rcu_node structure. This function is idempotent, so it does + * not hurt to call it repeatedly. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long c; + int i; + + /* If the CPU has no callbacks, nothing to do. */ + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + return; + + /* + * Starting from the sublist containing the callbacks most + * recently assigned a ->completed number and working down, find the + * first sublist that is not assignable to an upcoming grace period. + * Such a sublist has something in it (first two tests) and has + * a ->completed number assigned that will complete sooner than + * the ->completed number for newly arrived callbacks (last test). + * + * The key point is that any later sublist can be assigned the + * same ->completed number as the newly arrived callbacks, which + * means that the callbacks in any of these later sublist can be + * grouped into a single sublist, whether or not they have already + * been assigned a ->completed number. + */ + c = rcu_cbs_completed(rsp, rnp); + for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) + if (rdp->nxttail[i] != rdp->nxttail[i - 1] && + !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) + break; + + /* + * If there are no sublist for unassigned callbacks, leave. + * At the same time, advance "i" one sublist, so that "i" will + * index into the sublist where all the remaining callbacks should + * be grouped into. + */ + if (++i >= RCU_NEXT_TAIL) + return; + + /* + * Assign all subsequent callbacks' ->completed number to the next + * full grace period and group them all in the sublist initially + * indexed by "i". + */ + for (; i <= RCU_NEXT_TAIL; i++) { + rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtcompleted[i] = c; + } +} + +/* + * Move any callbacks whose grace period has completed to the + * RCU_DONE_TAIL sublist, then compact the remaining sublists and + * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL + * sublist. This function is idempotent, so it does not hurt to + * invoke it repeatedly. As long as it is not invoked -too- often... + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + int i, j; + + /* If the CPU has no callbacks, nothing to do. */ + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + return; + + /* + * Find all callbacks whose ->completed numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) + break; + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; + } + /* Clean up any sublist tail pointers that were misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; + + /* Copy down callbacks to fill in empty sublists. */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) + break; + rdp->nxttail[j] = rdp->nxttail[i]; + rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; + } + + /* Classify any remaining callbacks. */ + rcu_accelerate_cbs(rsp, rnp, rdp); +} + /* * Advance this CPU's callbacks, but only if the current grace period * has ended. This may be called only from the CPU to whom the rdp @@ -1080,12 +1223,15 @@ static void __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* Did another grace period end? */ - if (rdp->completed != rnp->completed) { + if (rdp->completed == rnp->completed) { + + /* No, so just accelerate recent callbacks. */ + rcu_accelerate_cbs(rsp, rnp, rdp); - /* Advance callbacks. No harm if list empty. */ - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + } else { + + /* Advance callbacks. */ + rcu_advance_cbs(rsp, rnp, rdp); /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; @@ -1392,17 +1538,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* * Because there is no grace period in progress right now, * any callbacks we have up to this point will be satisfied - * by the next grace period. So promote all callbacks to be - * handled after the end of the next grace period. If the - * CPU is not yet aware of the end of the previous grace period, - * we need to allow for the callback advancement that will - * occur when it does become aware. Deadlock prevents us from - * making it aware at this point: We cannot acquire a leaf - * rcu_node ->lock while holding the root rcu_node ->lock. + * by the next grace period. So this is a good place to + * assign a grace period number to recently posted callbacks. */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - if (rdp->completed == rsp->completed) - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rcu_accelerate_cbs(rsp, rnp, rdp); rsp->gp_flags = RCU_GP_FLAG_INIT; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ @@ -1527,7 +1666,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rcu_accelerate_cbs(rsp, rnp, rdp); rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ } @@ -1779,7 +1918,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) long bl, count, count_lazy; int i; - /* If no callbacks are ready, just return.*/ + /* If no callbacks are ready, just return. */ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), @@ -2008,19 +2147,19 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); - /* - * Advance callbacks in response to end of earlier grace - * period that some other CPU ended. - */ + /* Handle the end of a grace period that some other CPU ended. */ rcu_process_gp_end(rsp, rdp); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); /* Does this CPU require a not-yet-started grace period? */ + local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); + raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ rcu_start_gp(rsp, flags); /* releases above lock */ + } else { + local_irq_restore(flags); } /* If there are callbacks ready, invoke them. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4b69291b093d..c9f362e9b13d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -282,6 +282,8 @@ struct rcu_data { */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; + unsigned long nxtcompleted[RCU_NEXT_SIZE]; + /* grace periods for sublists. */ long qlen_lazy; /* # of lazy queued callbacks */ long qlen; /* # of queued callbacks, incl lazy */ long qlen_last_fqs_check; -- cgit v1.2.3 From 6d4b418c75a74eea1dd3701e106a9da8c335c451 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Nov 2012 16:55:44 -0800 Subject: rcu: Trace callback acceleration This commit adds event tracing for callback acceleration to allow better tracking of callbacks through the system. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ac6a75d152ef..e9dce4fa76d8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1168,6 +1168,12 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxtcompleted[i] = c; } + + /* Trace depending on how much we were able to accelerate. */ + if (!*rdp->nxttail[RCU_WAIT_TAIL]) + trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); + else + trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); } /* -- cgit v1.2.3 From b6fca72536fb94098acb23c00b4e65d3bc4bb252 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 9 Jan 2013 20:06:28 +0530 Subject: sysctl: Enable IA64 "ignore-unaligned-usertrap" to be used cross-arch IA64 defines /proc/sys/kernel/ignore-unaligned-usertrap to control verbose warnings on unaligned access emulation. Although the exact mechanics of what to do with sysctl (ignore/shout) are arch specific, this change enables the sysctl to be usable cross-arch. Signed-off-by: Vineet Gupta Cc: Fenghua Yu Cc: "Eric W. Biederman" Cc: Serge Hallyn Signed-off-by: Tony Luck --- kernel/sysctl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491e..840fd5eb2309 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -161,10 +161,13 @@ extern int unaligned_enabled; #endif #ifdef CONFIG_IA64 -extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN +extern int no_unaligned_warning; +#endif + #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -911,7 +914,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_doulongvec_minmax, }, #endif -#ifdef CONFIG_IA64 +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, @@ -919,6 +922,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#ifdef CONFIG_IA64 { .procname = "unaligned-dump-stack", .data = &unaligned_dump_stack, -- cgit v1.2.3 From 0ac801fe07374148714b5ef53df90ac5b1673c0c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 10 Jan 2013 11:49:27 +0800 Subject: cgroup: use new hashtable implementation Switch cgroup to use the new hashtable implementation. No functional changes. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 92 ++++++++++++++++++++++++--------------------------------- 1 file changed, 39 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6643f7053454..54b39081ac04 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #include #include @@ -376,22 +376,18 @@ static int css_set_count; * account cgroups in empty hierarchies. */ #define CSS_SET_HASH_BITS 7 -#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) -static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; +static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); -static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) +static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) { int i; - int index; - unsigned long tmp = 0UL; + unsigned long key = 0UL; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) - tmp += (unsigned long)css[i]; - tmp = (tmp >> 16) ^ tmp; + key += (unsigned long)css[i]; + key = (key >> 16) ^ key; - index = hash_long(tmp, CSS_SET_HASH_BITS); - - return &css_set_table[index]; + return key; } /* We don't maintain the lists running through each css_set to its @@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) } /* This css_set is dead. unlink it and release cgroup refcounts */ - hlist_del(&cg->hlist); + hash_del(&cg->hlist); css_set_count--; list_for_each_entry_safe(link, saved_link, &cg->cg_links, @@ -550,9 +546,9 @@ static struct css_set *find_existing_css_set( { int i; struct cgroupfs_root *root = cgrp->root; - struct hlist_head *hhead; struct hlist_node *node; struct css_set *cg; + unsigned long key; /* * Build the set of subsystem state objects that we want to see in the @@ -572,8 +568,8 @@ static struct css_set *find_existing_css_set( } } - hhead = css_set_hash(template); - hlist_for_each_entry(cg, node, hhead, hlist) { + key = css_set_hash(template); + hash_for_each_possible(css_set_table, cg, node, hlist, key) { if (!compare_css_sets(cg, oldcg, cgrp, template)) continue; @@ -657,8 +653,8 @@ static struct css_set *find_css_set( struct list_head tmp_cg_links; - struct hlist_head *hhead; struct cg_cgroup_link *link; + unsigned long key; /* First see if we already have a cgroup group that matches * the desired set */ @@ -704,8 +700,8 @@ static struct css_set *find_css_set( css_set_count++; /* Add this cgroup group to the hash table */ - hhead = css_set_hash(res->subsys); - hlist_add_head(&res->hlist, hhead); + key = css_set_hash(res->subsys); + hash_add(css_set_table, &res->hlist, key); write_unlock(&css_set_lock); @@ -1597,6 +1593,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroupfs_root *existing_root; const struct cred *cred; int i; + struct hlist_node *node; + struct css_set *cg; BUG_ON(sb->s_root != NULL); @@ -1650,14 +1648,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { - struct hlist_head *hhead = &css_set_table[i]; - struct hlist_node *node; - struct css_set *cg; - - hlist_for_each_entry(cg, node, hhead, hlist) - link_css_set(&tmp_cg_links, cg, root_cgrp); - } + hash_for_each(css_set_table, i, node, cg, hlist) + link_css_set(&tmp_cg_links, cg, root_cgrp); write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); @@ -4464,6 +4456,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; int i, ret; + struct hlist_node *node, *tmp; + struct css_set *cg; + unsigned long key; /* check name and function validity */ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || @@ -4529,23 +4524,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * this is all done under the css_set_lock. */ write_lock(&css_set_lock); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { - struct css_set *cg; - struct hlist_node *node, *tmp; - struct hlist_head *bucket = &css_set_table[i], *new_bucket; - - hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { - /* skip entries that we already rehashed */ - if (cg->subsys[ss->subsys_id]) - continue; - /* remove existing entry */ - hlist_del(&cg->hlist); - /* set new value */ - cg->subsys[ss->subsys_id] = css; - /* recompute hash and restore entry */ - new_bucket = css_set_hash(cg->subsys); - hlist_add_head(&cg->hlist, new_bucket); - } + hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) { + /* skip entries that we already rehashed */ + if (cg->subsys[ss->subsys_id]) + continue; + /* remove existing entry */ + hash_del(&cg->hlist); + /* set new value */ + cg->subsys[ss->subsys_id] = css; + /* recompute hash and restore entry */ + key = css_set_hash(cg->subsys); + hash_add(css_set_table, node, key); } write_unlock(&css_set_lock); @@ -4577,7 +4566,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); void cgroup_unload_subsys(struct cgroup_subsys *ss) { struct cg_cgroup_link *link; - struct hlist_head *hhead; BUG_ON(ss->module == NULL); @@ -4611,11 +4599,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) write_lock(&css_set_lock); list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { struct css_set *cg = link->cg; + unsigned long key; - hlist_del(&cg->hlist); + hash_del(&cg->hlist); cg->subsys[ss->subsys_id] = NULL; - hhead = css_set_hash(cg->subsys); - hlist_add_head(&cg->hlist, hhead); + key = css_set_hash(cg->subsys); + hash_add(css_set_table, &cg->hlist, key); } write_unlock(&css_set_lock); @@ -4657,9 +4646,6 @@ int __init cgroup_init_early(void) list_add(&init_css_set_link.cg_link_list, &init_css_set.cg_links); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&css_set_table[i]); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; @@ -4693,7 +4679,7 @@ int __init cgroup_init(void) { int err; int i; - struct hlist_head *hhead; + unsigned long key; err = bdi_init(&cgroup_backing_dev_info); if (err) @@ -4712,8 +4698,8 @@ int __init cgroup_init(void) } /* Add init_css_set to the hash table */ - hhead = css_set_hash(init_css_set.subsys); - hlist_add_head(&init_css_set.hlist, hhead); + key = css_set_hash(init_css_set.subsys); + hash_add(css_set_table, &init_css_set.hlist, key); BUG_ON(!init_root_id(&rootnode)); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); -- cgit v1.2.3 From c10d73671ad30f54692f7f69f0e09e75d3a8926a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Jan 2013 15:26:34 -0800 Subject: softirq: reduce latencies In various network workloads, __do_softirq() latencies can be up to 20 ms if HZ=1000, and 200 ms if HZ=100. This is because we iterate 10 times in the softirq dispatcher, and some actions can consume a lot of cycles. This patch changes the fallback to ksoftirqd condition to : - A time limit of 2 ms. - need_resched() being set on current task When one of this condition is met, we wakeup ksoftirqd for further softirq processing if we still have pending softirqs. Using need_resched() as the only condition can trigger RCU stalls, as we can keep BH disabled for too long. I ran several benchmarks and got no significant difference in throughput, but a very significant reduction of latencies (one order of magnitude) : In following bench, 200 antagonist "netperf -t TCP_RR" are started in background, using all available cpus. Then we start one "netperf -t TCP_RR", bound to the cpu handling the NIC IRQ (hard+soft) Before patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=550110.424 MIN_LATENCY=146858 MAX_LATENCY=997109 P50_LATENCY=305000 P90_LATENCY=550000 P99_LATENCY=710000 MEAN_LATENCY=376989.12 STDDEV_LATENCY=184046.92 After patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=40545.492 MIN_LATENCY=9834 MAX_LATENCY=78366 P50_LATENCY=33583 P90_LATENCY=59000 P99_LATENCY=69000 MEAN_LATENCY=38364.67 STDDEV_LATENCY=12865.26 Signed-off-by: Eric Dumazet Cc: David Miller Cc: Tom Herbert Cc: Ben Hutchings Signed-off-by: David S. Miller --- kernel/softirq.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index ed567babe789..47cb991c6ba4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip) EXPORT_SYMBOL(local_bh_enable_ip); /* - * We restart softirq processing MAX_SOFTIRQ_RESTART times, - * and we fall back to softirqd after that. + * We restart softirq processing for at most 2 ms, + * and if need_resched() is not set. * - * This number has been established via experimentation. + * These limits have been established via experimentation. * The two things to balance is latency against fairness - * we want to handle softirqs as soon as possible, but they * should not be able to lock up the box. */ -#define MAX_SOFTIRQ_RESTART 10 +#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) asmlinkage void __do_softirq(void) { struct softirq_action *h; __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; + unsigned long end = jiffies + MAX_SOFTIRQ_TIME; int cpu; unsigned long old_flags = current->flags; @@ -264,11 +264,12 @@ restart: local_irq_disable(); pending = local_softirq_pending(); - if (pending && --max_restart) - goto restart; + if (pending) { + if (time_before(jiffies, end) && !need_resched()) + goto restart; - if (pending) wakeup_softirqd(); + } lockdep_softirq_exit(); -- cgit v1.2.3 From 750358ee4f6ebafccf71b36fdc96d738706b4e37 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 2 Oct 2012 11:19:30 -0700 Subject: kernel/gcov: remove depends on CONFIG_EXPERIMENTAL The CONFIG_EXPERIMENTAL config item has not carried much meaning for a while now and is almost always enabled by default. As agreed during the Linux kernel summit, remove it from any "depends on" lines in Kconfigs. Cc: WANG Cong Signed-off-by: Kees Cook Acked-by: Peter Oberparleiter --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index a92028196cc1..d4da55d1fb65 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE + depends on SUPERH || S390 || X86 || PPC || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. -- cgit v1.2.3 From 3a366e614d0837d9fc23f78cdb1a1186ebc3387f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Jan 2013 13:06:33 -0800 Subject: block: add missing block_bio_complete() tracepoint bio completion didn't kick block_bio_complete TP. Only dm was explicitly triggering the TP on IO completion. This makes block_bio_complete TP useless for tracers which want to know about bios, and all other bio based drivers skip generating blktrace completion events. This patch makes all bio completions via bio_endio() generate block_bio_complete TP. * Explicit trace_block_bio_complete() invocation removed from dm and the trace point is unexported. * @rq dropped from trace_block_bio_complete(). bios may fly around w/o queue associated. Verifying and accessing the assocaited queue belongs to TP probes. * blktrace now gets both request and bio completions. Make it ignore bio completions if request completion path is happening. This makes all bio based drivers generate blktrace completion events properly and makes the block_bio_complete TP actually useful. v2: With this change, block_bio_complete TP could be invoked on sg commands which have bio's with %NULL bi_bdev. Update TP assignment code to check whether bio->bi_bdev is %NULL before dereferencing. Signed-off-by: Tejun Heo Original-patch-by: Namhyung Kim Cc: Tejun Heo Cc: Steven Rostedt Cc: Alasdair Kergon Cc: dm-devel@redhat.com Cc: Neil Brown Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c0bd0308741c..190d98fbed27 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, struct request *rq) { + struct blk_trace *bt = q->blk_trace; + + /* if control ever passes through here, it's a request based driver */ + if (unlikely(bt && !bt->rq_based)) + bt->rq_based = true; + blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); } @@ -774,10 +780,24 @@ static void blk_add_trace_bio_bounce(void *ignore, blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } -static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio, - int error) +static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) { + struct request_queue *q; + struct blk_trace *bt; + + if (!bio->bi_bdev) + return; + + q = bdev_get_queue(bio->bi_bdev); + bt = q->blk_trace; + + /* + * Request based drivers will generate both rq and bio completions. + * Ignore bio ones. + */ + if (likely(!bt) || bt->rq_based) + return; + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } -- cgit v1.2.3 From 8c1cf6bb02fda79b0a4b9bd121f6be6d4ce7a15a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Jan 2013 13:06:34 -0800 Subject: block: add @req to bio_{front|back}_merge tracepoints bio_{front|back}_merge tracepoints report a bio merging into an existing request but didn't specify which request the bio is being merged into. Add @req to it. This makes it impossible to share the event template with block_bio_queue - split it out. @req isn't used or exported to userland at this point and there is no userland visible behavior change. Later changes will make use of the extra parameter. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 190d98fbed27..fb593f6a687e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -803,6 +803,7 @@ static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) static void blk_add_trace_bio_backmerge(void *ignore, struct request_queue *q, + struct request *rq, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); @@ -810,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore, static void blk_add_trace_bio_frontmerge(void *ignore, struct request_queue *q, + struct request *rq, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); -- cgit v1.2.3 From c35ef95c273c06471818f9245a05ac5a6e3ffa34 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Sat, 12 Jan 2013 11:50:04 +0000 Subject: clockevents: export clockevents_config_and_register for module use clockevents_config_and_register is a handy helper for clockevent drivers, some of which might support module build, so export the symbol. Signed-off-by: Shawn Guo Acked-by: Arnd Bergmann Reviewed-by: Thomas Gleixner Signed-off-by: Olof Johansson --- kernel/time/clockevents.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 30b6de0d977c..c6d6400ee137 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev, clockevents_config(dev, freq); clockevents_register_device(dev); } +EXPORT_SYMBOL_GPL(clockevents_config_and_register); /** * clockevents_update_freq - Update frequency and reprogram a clock event device. -- cgit v1.2.3 From 5d65bc0ca1bceb73204dab943922ba3c83276a8c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 14 Jan 2013 17:23:26 +0800 Subject: cgroup: remove synchronize_rcu() from cgroup_attach_{task|proc}() These 2 syncronize_rcu()s make attaching a task to a cgroup quite slow, and it can't be ignored in some situations. A real case from Colin Cross: Android uses cgroups heavily to manage thread priorities, putting threads in a background group with reduced cpu.shares when they are not visible to the user, and in a foreground group when they are. Some RPCs from foreground threads to background threads will temporarily move the background thread into the foreground group for the duration of the RPC. This results in many calls to cgroup_attach_task. In cgroup_attach_task() it's task->cgroups that is protected by RCU, and put_css_set() calls kfree_rcu() to free it. If we remove this synchronize_rcu(), there can be threads in RCU-read sections accessing their old cgroup via current->cgroups with concurrent rmdir operation, but this is safe. # time for ((i=0; i<50; i++)) { echo $$ > /mnt/sub/tasks; echo $$ > /mnt/tasks; } real 0m2.524s user 0m0.008s sys 0m0.004s With this patch: real 0m0.004s user 0m0.004s sys 0m0.000s tj: These synchronize_rcu()s are utterly confused. synchornize_rcu() necessarily has to come between two operations to guarantee that the changes made by the former operation are visible to all rcu readers before proceeding to the latter operation. Here, synchornize_rcu() are at the end of attach operations with nothing beyond it. Its only effect would be delaying completion of write(2) to sysfs tasks/procs files until all rcu readers see the change, which doesn't mean anything. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Reported-by: Colin Cross --- kernel/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 54b39081ac04..ce27351e9249 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1974,7 +1974,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) ss->attach(cgrp, &tset); } - synchronize_rcu(); out: if (retval) { for_each_subsys(root, ss) { @@ -2143,7 +2142,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* * step 5: success! and cleanup */ - synchronize_rcu(); retval = 0; out_put_css_set_refs: if (retval) { -- cgit v1.2.3 From 130e3695a3edf6bf21464f2826720a79a6afdee0 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 14 Jan 2013 17:24:18 +0800 Subject: cgroup: remove synchronize_rcu() from rebind_subsystems() Nothing's protected by RCU in rebind_subsystems(), and I can't think of a reason why it is needed. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ce27351e9249..a893985601e9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1079,7 +1079,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; - synchronize_rcu(); return 0; } -- cgit v1.2.3 From 27e89ae5d6e94e30231ee89a7736f62d84ba4c6f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 15 Jan 2013 14:10:57 +0800 Subject: cpuset: fix RCU lockdep splat 5d21cc2db040d01f8c19b8602f6987813e1176b4 ("cpuset: replace cgroup_mutex locking with cpuset internal locking") incorrectly converted proc_cpuset_show() from cgroup_lock() to cpuset_mutex. proc_cpuset_show() is accessing cgroup hierarchy proper to determine cgroup path which can't be protected by cpuset_mutex. This triggered the following RCU warning. =============================== [ INFO: suspicious RCU usage. ] 3.8.0-rc3-next-20130114-sasha-00016-ga107525-dirty #262 Tainted: G W ------------------------------- include/linux/cgroup.h:534 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 2 locks held by trinity/7514: #0: (&p->lock){+.+.+.}, at: [] seq_read+0x3a/0x3e0 #1: (cpuset_mutex){+.+...}, at: [] proc_cpuset_show+0x84/0x190 stack backtrace: Pid: 7514, comm: trinity Tainted: G W +3.8.0-rc3-next-20130114-sasha-00016-ga107525-dirty #262 Call Trace: [] lockdep_rcu_suspicious+0x10b/0x120 [] proc_cpuset_show+0x111/0x190 [] seq_read+0x1b7/0x3e0 [] ? seq_lseek+0x110/0x110 [] do_loop_readv_writev+0x4b/0x90 [] do_readv_writev+0xf6/0x1d0 [] vfs_readv+0x3e/0x60 [] sys_readv+0x50/0xd0 [] tracesys+0xe1/0xe6 The operation can be performed under RCU read lock. Replace cpuset_mutex locking with RCU read locking. tj: Rewrote patch description. Reported-by: Sasha Levin Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6aa5bbb5f33b..1a675e446ef2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2678,15 +2678,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; retval = -EINVAL; - mutex_lock(&cpuset_mutex); + rcu_read_lock(); css = task_subsys_state(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + rcu_read_unlock(); if (retval < 0) - goto out_unlock; + goto out_put_task; seq_puts(m, buf); seq_putc(m, '\n'); -out_unlock: - mutex_unlock(&cpuset_mutex); +out_put_task: put_task_struct(tsk); out_free: kfree(buf); -- cgit v1.2.3 From d127027baf98dce3ca31bec18c2c0e048ceda7c4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 15 Jan 2013 14:11:32 +0800 Subject: cpuset: drop spurious retval assignment in proc_cpuset_show() proc_cpuset_show() has a spurious -EINVAL assignment which does nothing. Remove it. This patch doesn't make any functional difference. tj: Rewrote patch description. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1a675e446ef2..16be7c9edfd7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2677,7 +2677,6 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; - retval = -EINVAL; rcu_read_lock(); css = task_subsys_state(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); -- cgit v1.2.3 From 1e817fb62cd185a2232ad4302579491805609489 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 19 Nov 2012 10:26:16 -0800 Subject: time: create __getnstimeofday for WARNless calls The pstore RAM backend can get called during resume, and must be defensive against a suspended time source. Expose getnstimeofday logic that returns an error instead of a WARN. This can be detected and the timestamp can be zeroed out. Reported-by: Doug Anderson Cc: John Stultz Cc: Anton Vorontsov Signed-off-by: Kees Cook Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4c7de02eacdc..dfc7f87eb332 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -214,19 +214,18 @@ static void timekeeping_forward_now(struct timekeeper *tk) } /** - * getnstimeofday - Returns the time of day in a timespec + * __getnstimeofday - Returns the time of day in a timespec. * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. + * Updates the time of day in the timespec. + * Returns 0 on success, or -ve when suspended (timespec will be undefined). */ -void getnstimeofday(struct timespec *ts) +int __getnstimeofday(struct timespec *ts) { struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs = 0; - WARN_ON(timekeeping_suspended); - do { seq = read_seqbegin(&tk->lock); @@ -237,6 +236,26 @@ void getnstimeofday(struct timespec *ts) ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); + + /* + * Do not bail out early, in case there were callers still using + * the value, even in the face of the WARN_ON. + */ + if (unlikely(timekeeping_suspended)) + return -EAGAIN; + return 0; +} +EXPORT_SYMBOL(__getnstimeofday); + +/** + * getnstimeofday - Returns the time of day in a timespec. + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec (WARN if suspended). + */ +void getnstimeofday(struct timespec *ts) +{ + WARN_ON(__getnstimeofday(ts)); } EXPORT_SYMBOL(getnstimeofday); -- cgit v1.2.3 From 023f333a99cee9b5cd3268ff87298eb01a31f78e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Dec 2012 14:30:53 -0700 Subject: NTP: Add a CONFIG_RTC_SYSTOHC configuration The purpose of this option is to allow ARM/etc systems that rely on the class RTC subsystem to have the same kind of automatic NTP based synchronization that we have on PC platforms. Today ARM does not implement update_persistent_clock and makes extensive use of the class RTC system. When enabled CONFIG_RTC_SYSTOHC will provide a generic rtc_update_persistent_clock that stores the current time in the RTC and is intended complement the existing CONFIG_RTC_HCTOSYS option that loads the RTC at boot. Like with RTC_HCTOSYS the platform's update_persistent_clock is used first, if it works. Platforms with mixed class RTC and non-RTC drivers need to return ENODEV when class RTC should be used. Such an update for PPC is included in this patch. Long term, implementations of update_persistent_clock should migrate to proper class RTC drivers and use CONFIG_RTC_SYSTOHC instead. Tested on ARM kirkwood and PPC405 Signed-off-by: Jason Gunthorpe Signed-off-by: John Stultz --- kernel/time/ntp.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 24174b4d669b..313b161504b7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -483,8 +484,7 @@ out: return leap; } -#ifdef CONFIG_GENERIC_CMOS_UPDATE - +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_cmos_clock(struct work_struct *work); static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); @@ -510,14 +510,22 @@ static void sync_cmos_clock(struct work_struct *work) } getnstimeofday(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) + if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { + fail = -ENODEV; +#ifdef CONFIG_GENERIC_CMOS_UPDATE fail = update_persistent_clock(now); +#endif +#ifdef CONFIG_RTC_SYSTOHC + if (fail == -ENODEV) + fail = rtc_set_ntp_time(now); +#endif + } next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; - if (!fail) + if (!fail || fail == -ENODEV) next.tv_sec = 659; else next.tv_sec = 0; -- cgit v1.2.3 From f0dbe81f0e7c39783ad25d9084bbcda131508993 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 11 Jan 2013 11:58:58 +0100 Subject: posix-timers: Fix clock_adjtime to always return timex data on success The clock_adj call returns the clock state on success, which may be a non-zero value (e.g. TIME_INS), but the modified timex data is copied back to the user only when zero value (TIME_OK) was returned. Fix the condition to copy the data also with positive return values. Signed-off-by: Miroslav Lichvar Signed-off-by: John Stultz --- kernel/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 69185ae6b701..10349d5f2ec3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -997,7 +997,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, err = kc->clock_adj(which_clock, &ktx); - if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) + if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) return -EFAULT; return err; -- cgit v1.2.3 From 31ade30692dc9680bfc95700d794818fa3f754ac Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 16 Jan 2013 00:09:47 +0800 Subject: timekeeping: Add persistent_clock_exist flag In current kernel, there are several places which need to check whether there is a persistent clock for the platform. Current check is done by calling the read_persistent_clock() and validating its return value. So one optimization is to do the check only once in timekeeping_init(), and use a flag persistent_clock_exist to record it. v2: Add a has_persistent_clock() helper function, as suggested by John. Cc: Thomas Gleixner Cc: John Stultz Signed-off-by: Feng Tang Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dfc7f87eb332..b7a584177618 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -28,6 +28,9 @@ static struct timekeeper timekeeper; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; +/* Flag for if there is a persistent clock on this platform */ +bool __read_mostly persistent_clock_exist = false; + static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { @@ -609,12 +612,14 @@ void __init timekeeping_init(void) struct timespec now, boot, tmp; read_persistent_clock(&now); + if (!timespec_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; now.tv_nsec = 0; - } + } else if (now.tv_sec || now.tv_nsec) + persistent_clock_exist = true; read_boot_clock(&boot); if (!timespec_valid_strict(&boot)) { @@ -687,11 +692,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta) { struct timekeeper *tk = &timekeeper; unsigned long flags; - struct timespec ts; - /* Make sure we don't set the clock twice */ - read_persistent_clock(&ts); - if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) + /* + * Make sure we don't set the clock twice, as timekeeping_resume() + * already did it + */ + if (has_persistent_clock()) return; write_seqlock_irqsave(&tk->lock, flags); -- cgit v1.2.3 From 05ad717c77b1b8e98a1dd768c3700036d634629e Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 16 Jan 2013 00:09:49 +0800 Subject: timekeeping: Add CONFIG_HAS_PERSISTENT_CLOCK option Make the persistent clock check a kernel config option, so that some platform can explicitely select it, also make CONFIG_RTC_HCTOSYS and RTC_SYSTOHC depend on its non-existence, which could prevent the persistent clock and RTC code from doing similar thing twice during system's init/suspend/resume phases. If the CONFIG_HAS_PERSISTENT_CLOCK=n, then no change happens for kernel which still does the persistent clock check in timekeeping_init(). Cc: Thomas Gleixner Suggested-by: John Stultz Signed-off-by: Feng Tang [jstultz: Added dependency for RTC_SYSTOHC as well] Signed-off-by: John Stultz --- kernel/time/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8601f0db1261..f7e45b9b142b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool +# Platforms has a persistent clock +config HAS_PERSISTENT_CLOCK + bool + default n + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool -- cgit v1.2.3 From 4dbd27711cd92bcc364426937a0d5e80f10b1cfb Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 4 Jan 2013 11:12:43 +0000 Subject: tick: export nohz tick idle symbols for module use Allow drivers such as intel_powerclamp to use these apis for turning on/off ticks during idle. Signed-off-by: Jacob Pan Signed-off-by: Zhang Rui --- kernel/time/tick-sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d58e552d9fd1..a7677579bcc3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -553,6 +553,7 @@ void tick_nohz_idle_enter(void) local_irq_enable(); } +EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); /** * tick_nohz_irq_exit - update next tick event from interrupt exit @@ -681,6 +682,7 @@ void tick_nohz_idle_exit(void) local_irq_enable(); } +EXPORT_SYMBOL_GPL(tick_nohz_idle_exit); static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) { -- cgit v1.2.3 From 111c225a5f8d872bc9327ada18d13b75edaa34be Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 17 Jan 2013 17:16:24 -0800 Subject: workqueue: set PF_WQ_WORKER on rescuers PF_WQ_WORKER is used to tell scheduler that the task is a workqueue worker and needs wq_worker_sleeping/waking_up() invoked on it for concurrency management. As rescuers never participate in concurrency management, PF_WQ_WORKER wasn't set on them. There's a need for an interface which can query whether %current is executing a work item and if so which. Such interface requires a way to identify all tasks which may execute work items and PF_WQ_WORKER will be used for that. As all normal workers always have PF_WQ_WORKER set, we only need to add it to rescuers. As rescuers start with WORKER_PREP but never clear it, it's always NOT_RUNNING and there's no need to worry about it interfering with concurrency management even if PF_WQ_WORKER is set; however, unlike normal workers, rescuers currently don't have its worker struct as kthread_data(). It uses the associated workqueue_struct instead. This is problematic as wq_worker_sleeping/waking_up() expect struct worker at kthread_data(). This patch adds worker->rescue_wq and start rescuer kthreads with worker struct as kthread_data and sets PF_WQ_WORKER on rescuers. Signed-off-by: Tejun Heo Cc: Linus Torvalds --- kernel/workqueue.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7967f3476393..6b99ac7b19f6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -149,6 +149,9 @@ struct worker { /* for rebinding worker to CPU */ struct work_struct rebind_work; /* L: for busy worker */ + + /* used only by rescuers to point to the target workqueue */ + struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ }; struct worker_pool { @@ -763,12 +766,20 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct worker_pool *pool = worker->pool; - atomic_t *nr_running = get_pool_nr_running(pool); + struct worker_pool *pool; + atomic_t *nr_running; + /* + * Rescuers, which may not have all the fields set up like normal + * workers, also reach here, let's not access anything before + * checking NOT_RUNNING. + */ if (worker->flags & WORKER_NOT_RUNNING) return NULL; + pool = worker->pool; + nr_running = get_pool_nr_running(pool); + /* this can only happen on the local cpu */ BUG_ON(cpu != raw_smp_processor_id()); @@ -2357,7 +2368,7 @@ sleep: /** * rescuer_thread - the rescuer thread function - * @__wq: the associated workqueue + * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each * workqueue which has WQ_RESCUER set. @@ -2374,20 +2385,27 @@ sleep: * * This should happen rarely. */ -static int rescuer_thread(void *__wq) +static int rescuer_thread(void *__rescuer) { - struct workqueue_struct *wq = __wq; - struct worker *rescuer = wq->rescuer; + struct worker *rescuer = __rescuer; + struct workqueue_struct *wq = rescuer->rescue_wq; struct list_head *scheduled = &rescuer->scheduled; bool is_unbound = wq->flags & WQ_UNBOUND; unsigned int cpu; set_user_nice(current, RESCUER_NICE_LEVEL); + + /* + * Mark rescuer as worker too. As WORKER_PREP is never cleared, it + * doesn't participate in concurrency management. + */ + rescuer->task->flags |= PF_WQ_WORKER; repeat: set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); + rescuer->task->flags &= ~PF_WQ_WORKER; return 0; } @@ -2431,6 +2449,8 @@ repeat: spin_unlock_irq(&gcwq->lock); } + /* rescuers should never participate in concurrency management */ + WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); goto repeat; } @@ -3266,7 +3286,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (!rescuer) goto err; - rescuer->task = kthread_create(rescuer_thread, wq, "%s", + rescuer->rescue_wq = wq; + rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); if (IS_ERR(rescuer->task)) goto err; -- cgit v1.2.3 From ea138446e51f7bfe55cdeffa3f1dd9cafc786bd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Jan 2013 14:05:55 -0800 Subject: workqueue: rename kernel/workqueue_sched.h to kernel/workqueue_internal.h Workqueue wants to expose more interface internal to kernel/. Instead of adding a new header file, repurpose kernel/workqueue_sched.h. Rename it to workqueue_internal.h and add include protector. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Cc: Linus Torvalds Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched/core.c | 2 +- kernel/workqueue.c | 2 +- kernel/workqueue_internal.h | 18 ++++++++++++++++++ kernel/workqueue_sched.h | 9 --------- 4 files changed, 20 insertions(+), 11 deletions(-) create mode 100644 kernel/workqueue_internal.h delete mode 100644 kernel/workqueue_sched.h (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb0..c6737f4fb63b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -83,7 +83,7 @@ #endif #include "sched.h" -#include "../workqueue_sched.h" +#include "../workqueue_internal.h" #include "../smpboot.h" #define CREATE_TRACE_POINTS diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6b99ac7b19f6..b4e92061a934 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -43,7 +43,7 @@ #include #include -#include "workqueue_sched.h" +#include "workqueue_internal.h" enum { /* diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h new file mode 100644 index 000000000000..b3ea6ad5566b --- /dev/null +++ b/kernel/workqueue_internal.h @@ -0,0 +1,18 @@ +/* + * kernel/workqueue_internal.h + * + * Workqueue internal header file. Only to be included by workqueue and + * core kernel subsystems. + */ +#ifndef _KERNEL_WORKQUEUE_INTERNAL_H +#define _KERNEL_WORKQUEUE_INTERNAL_H + +/* + * Scheduler hooks for concurrency managed workqueue. Only to be used from + * sched.c and workqueue.c. + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu); + +#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h deleted file mode 100644 index 2d10fc98dc79..000000000000 --- a/kernel/workqueue_sched.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * kernel/workqueue_sched.h - * - * Scheduler hooks for concurrency managed workqueue. Only to be - * included from sched.c and workqueue.c. - */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); -- cgit v1.2.3 From 2eaebdb33e1911c0cf3d44fd3596c42c6f502fab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Jan 2013 14:05:55 -0800 Subject: workqueue: move struct worker definition to workqueue_internal.h This will be used to implement an inline function to query whether %current is a workqueue worker and, if so, allow determining which work item it's executing. Signed-off-by: Tejun Heo Cc: Linus Torvalds --- kernel/workqueue.c | 32 +------------------------------- kernel/workqueue_internal.h | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b4e92061a934..2ffa240052fa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -122,37 +122,7 @@ enum { * W: workqueue_lock protected. */ -struct global_cwq; -struct worker_pool; - -/* - * The poor guys doing the actual heavy lifting. All on-duty workers - * are either serving the manager role, on idle list or on busy hash. - */ -struct worker { - /* on idle list while idle, on busy hash table while busy */ - union { - struct list_head entry; /* L: while idle */ - struct hlist_node hentry; /* L: while busy */ - }; - - struct work_struct *current_work; /* L: work being processed */ - work_func_t current_func; /* L: current_work's fn */ - struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ - struct list_head scheduled; /* L: scheduled works */ - struct task_struct *task; /* I: worker task */ - struct worker_pool *pool; /* I: the associated pool */ - /* 64 bytes boundary on 64bit, 32 on 32bit */ - unsigned long last_active; /* L: last active timestamp */ - unsigned int flags; /* X: flags */ - int id; /* I: worker id */ - - /* for rebinding worker to CPU */ - struct work_struct rebind_work; /* L: for busy worker */ - - /* used only by rescuers to point to the target workqueue */ - struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ -}; +/* struct worker is defined in workqueue_internal.h */ struct worker_pool { struct global_cwq *gcwq; /* I: the owning gcwq */ diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index b3ea6ad5566b..02549fa04587 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -7,6 +7,43 @@ #ifndef _KERNEL_WORKQUEUE_INTERNAL_H #define _KERNEL_WORKQUEUE_INTERNAL_H +#include + +struct global_cwq; +struct worker_pool; + +/* + * The poor guys doing the actual heavy lifting. All on-duty workers are + * either serving the manager role, on idle list or on busy hash. For + * details on the locking annotation (L, I, X...), refer to workqueue.c. + * + * Only to be used in workqueue and async. + */ +struct worker { + /* on idle list while idle, on busy hash table while busy */ + union { + struct list_head entry; /* L: while idle */ + struct hlist_node hentry; /* L: while busy */ + }; + + struct work_struct *current_work; /* L: work being processed */ + work_func_t current_func; /* L: current_work's fn */ + struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ + struct list_head scheduled; /* L: scheduled works */ + struct task_struct *task; /* I: worker task */ + struct worker_pool *pool; /* I: the associated pool */ + /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ + unsigned int flags; /* X: flags */ + int id; /* I: worker id */ + + /* for rebinding worker to CPU */ + struct work_struct rebind_work; /* L: for busy worker */ + + /* used only by rescuers to point to the target workqueue */ + struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ +}; + /* * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched.c and workqueue.c. -- cgit v1.2.3 From 84b233adcca3cacd5cfa8013a5feda7a3db4a9af Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Jan 2013 14:05:56 -0800 Subject: workqueue: implement current_is_async() This function queries whether %current is an async worker executing an async item. This will be used to implement warning on synchronous request_module() from async workers. Signed-off-by: Tejun Heo --- kernel/async.c | 14 ++++++++++++++ kernel/workqueue_internal.h | 11 +++++++++++ 2 files changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 9d3118384858..d9bf2a9b5cee 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -57,6 +57,8 @@ asynchronous and synchronous parts of the kernel. #include #include +#include "workqueue_internal.h" + static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 @@ -337,3 +339,15 @@ void async_synchronize_cookie(async_cookie_t cookie) async_synchronize_cookie_domain(cookie, &async_running); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); + +/** + * current_is_async - is %current an async worker task? + * + * Returns %true if %current is an async worker task. + */ +bool current_is_async(void) +{ + struct worker *worker = current_wq_worker(); + + return worker && worker->current_func == async_run_entry_fn; +} diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 02549fa04587..cc35e7e62091 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -8,6 +8,7 @@ #define _KERNEL_WORKQUEUE_INTERNAL_H #include +#include struct global_cwq; struct worker_pool; @@ -44,6 +45,16 @@ struct worker { struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ }; +/** + * current_wq_worker - return struct worker if %current is a workqueue worker + */ +static inline struct worker *current_wq_worker(void) +{ + if (current->flags & PF_WQ_WORKER) + return kthread_data(current); + return NULL; +} + /* * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched.c and workqueue.c. -- cgit v1.2.3 From 64748a2c9062da0c32b59c1b368a86fc4613b1e1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 21 Jan 2013 17:03:02 +1030 Subject: module: printk message when module signature fail taints kernel. Reported-by: Chris Samuel Signed-off-by: Rusty Russell --- kernel/module.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index eab08274ec9b..e69a5a68766f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3192,8 +3192,13 @@ again: #ifdef CONFIG_MODULE_SIG mod->sig_ok = info->sig_ok; - if (!mod->sig_ok) + if (!mod->sig_ok) { + printk_once(KERN_NOTICE + "%s: module verification failed: signature and/or" + " required key missing - tainting kernel\n", + mod->name); add_taint_module(mod, TAINT_FORCED_MODULE); + } #endif /* Now module is in final location, initialize linked lists, etc. */ -- cgit v1.2.3 From 373d4d099761cb1f637bed488ab3871945882273 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 21 Jan 2013 17:17:39 +1030 Subject: taint: add explicit flag to show whether lock dep is still OK. Fix up all callers as they were before, with make one change: an unsigned module taints the kernel, but doesn't turn off lockdep. Signed-off-by: Rusty Russell --- kernel/module.c | 26 +++++++++++++++----------- kernel/panic.c | 34 ++++++++++++++-------------------- kernel/sched/core.c | 2 +- kernel/sysctl.c | 2 +- 4 files changed, 31 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e69a5a68766f..cc000dd6e4a8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -197,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod) return -ENOENT; } -static inline void add_taint_module(struct module *mod, unsigned flag) +static inline void add_taint_module(struct module *mod, unsigned flag, + enum lockdep_ok lockdep_ok) { - add_taint(flag); + add_taint(flag, lockdep_ok); mod->taints |= (1U << flag); } @@ -727,7 +728,7 @@ static inline int try_force_unload(unsigned int flags) { int ret = (flags & O_TRUNC); if (ret) - add_taint(TAINT_FORCED_RMMOD); + add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE); return ret; } #else @@ -1138,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason) if (!test_taint(TAINT_FORCED_MODULE)) printk(KERN_WARNING "%s: %s: kernel tainted.\n", mod->name, reason); - add_taint_module(mod, TAINT_FORCED_MODULE); + add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); return 0; #else return -ENOEXEC; @@ -2147,7 +2148,8 @@ static void set_license(struct module *mod, const char *license) if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); } } @@ -2700,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) } if (!get_modinfo(info, "intree")) - add_taint_module(mod, TAINT_OOT_MODULE); + add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); if (get_modinfo(info, "staging")) { - add_taint_module(mod, TAINT_CRAP); + add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); printk(KERN_WARNING "%s: module is from the staging directory," " the quality is unknown, you have been warned.\n", mod->name); @@ -2869,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod) * using GPL-only symbols it needs. */ if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint(TAINT_PROPRIETARY_MODULE); + add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); /* driverloader was caught wrongly pretending to be under GPL */ if (strcmp(mod->name, "driverloader") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); /* lve claims to be GPL but upstream won't provide source */ if (strcmp(mod->name, "lve") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) @@ -3197,7 +3201,7 @@ again: "%s: module verification failed: signature and/or" " required key missing - tainting kernel\n", mod->name); - add_taint_module(mod, TAINT_FORCED_MODULE); + add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); } #endif diff --git a/kernel/panic.c b/kernel/panic.c index e1b2822fff97..7c57cc9eee2c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -259,26 +259,19 @@ unsigned long get_taint(void) return tainted_mask; } -void add_taint(unsigned flag) +/** + * add_taint: add a taint flag if not already set. + * @flag: one of the TAINT_* constants. + * @lockdep_ok: whether lock debugging is still OK. + * + * If something bad has gone wrong, you'll want @lockdebug_ok = false, but for + * some notewortht-but-not-corrupting cases, it can be set to true. + */ +void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { - /* - * Can't trust the integrity of the kernel anymore. - * We don't call directly debug_locks_off() because the issue - * is not necessarily serious enough to set oops_in_progress to 1 - * Also we want to keep up lockdep for staging/out-of-tree - * development and post-warning case. - */ - switch (flag) { - case TAINT_CRAP: - case TAINT_OOT_MODULE: - case TAINT_WARN: - case TAINT_FIRMWARE_WORKAROUND: - break; - - default: - if (__debug_locks_off()) - printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); - } + if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) + printk(KERN_WARNING + "Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } @@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller, print_modules(); dump_stack(); print_oops_end_marker(); - add_taint(taint); + /* Just a warning, don't kill lockdep. */ + add_taint(taint, LOCKDEP_STILL_OK); } void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb0..662f3d512183 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2785,7 +2785,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (irqs_disabled()) print_irqtrace_events(prev); dump_stack(); - add_taint(TAINT_WARN); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491e..f97f9d75cde8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2006,7 +2006,7 @@ static int proc_taint(struct ctl_table *table, int write, int i; for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { if ((tmptaint >> i) & 1) - add_taint(i); + add_taint(i, LOCKDEP_STILL_OK); } } -- cgit v1.2.3 From a3535c7e4f4495fe947f7901d25447d80e04fe52 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 21 Jan 2013 17:18:59 +1030 Subject: module: clean up load_module a little more. 1fb9341ac34825aa40354e74d9a2c69df7d2c304 made our locking in load_module more complicated: we grab the mutex once to insert the module in the list, then again to upgrade it once it's formed. Since the locking is self-contained, it's neater to do this in separate functions. Signed-off-by: Rusty Russell --- kernel/module.c | 107 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index cc000dd6e4a8..921bed4794e9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3145,12 +3145,72 @@ static int may_init_module(void) return 0; } +/* + * We try to place it in the list now to make sure it's unique before + * we dedicate too many resources. In particular, temporary percpu + * memory exhaustion. + */ +static int add_unformed_module(struct module *mod) +{ + int err; + struct module *old; + + mod->state = MODULE_STATE_UNFORMED; + +again: + mutex_lock(&module_mutex); + if ((old = find_module_all(mod->name, true)) != NULL) { + if (old->state == MODULE_STATE_COMING + || old->state == MODULE_STATE_UNFORMED) { + /* Wait in case it fails to load. */ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto out_unlocked; + goto again; + } + err = -EEXIST; + goto out; + } + list_add_rcu(&mod->list, &modules); + err = 0; + +out: + mutex_unlock(&module_mutex); +out_unlocked: + return err; +} + +static int complete_formation(struct module *mod, struct load_info *info) +{ + int err; + + mutex_lock(&module_mutex); + + /* Find duplicate symbols (must be called under lock). */ + err = verify_export_symbols(mod); + if (err < 0) + goto out; + + /* This relies on module_mutex for list integrity. */ + module_bug_finalize(info->hdr, info->sechdrs, mod); + + /* Mark state as coming so strong_try_module_get() ignores us, + * but kallsyms etc. can see us. */ + mod->state = MODULE_STATE_COMING; + +out: + mutex_unlock(&module_mutex); + return err; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static int load_module(struct load_info *info, const char __user *uargs, int flags) { - struct module *mod, *old; + struct module *mod; long err; err = module_sig_check(info); @@ -3168,31 +3228,10 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } - /* - * We try to place it in the list now to make sure it's unique - * before we dedicate too many resources. In particular, - * temporary percpu memory exhaustion. - */ - mod->state = MODULE_STATE_UNFORMED; -again: - mutex_lock(&module_mutex); - if ((old = find_module_all(mod->name, true)) != NULL) { - if (old->state == MODULE_STATE_COMING - || old->state == MODULE_STATE_UNFORMED) { - /* Wait in case it fails to load. */ - mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); - if (err) - goto free_module; - goto again; - } - err = -EEXIST; - mutex_unlock(&module_mutex); + /* Reserve our place in the list. */ + err = add_unformed_module(mod); + if (err) goto free_module; - } - list_add_rcu(&mod->list, &modules); - mutex_unlock(&module_mutex); #ifdef CONFIG_MODULE_SIG mod->sig_ok = info->sig_ok; @@ -3245,21 +3284,11 @@ again: dynamic_debug_setup(info->debug, info->num_debug); - mutex_lock(&module_mutex); - /* Find duplicate symbols (must be called under lock). */ - err = verify_export_symbols(mod); - if (err < 0) + /* Finally it's fully formed, ready to start executing. */ + err = complete_formation(mod, info); + if (err) goto ddebug_cleanup; - /* This relies on module_mutex for list integrity. */ - module_bug_finalize(info->hdr, info->sechdrs, mod); - - /* Mark state as coming so strong_try_module_get() ignores us, - * but kallsyms etc. can see us. */ - mod->state = MODULE_STATE_COMING; - - mutex_unlock(&module_mutex); - /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, &ddebug_dyndbg_module_param_cb); @@ -3283,8 +3312,8 @@ again: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); - ddebug_cleanup: mutex_unlock(&module_mutex); + ddebug_cleanup: dynamic_debug_remove(info->debug); synchronize_sched(); kfree(mod->args); -- cgit v1.2.3 From 771e03842a9e98a1c2013ca1ed8bb2793488f3e5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 30 Nov 2012 10:41:57 -0500 Subject: ring-buffer: Remove unnecessary recusive call in rb_advance_iter() The original ring-buffer code had special checks at the start of rb_advance_iter() and instead of repeating them again at the end of the function if a certain condition existed, I just did a recursive call to rb_advance_iter() because the special condition would cause rb_advance_iter() to return early (after the checks). But as things have changed, the special checks no longer exist and the only thing done for the special_condition is to call rb_inc_iter() and return. Instead of doing a confusing recursive call, just call rb_inc_iter instead. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ce8514feedcd..6ff9cc4658ed 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3425,7 +3425,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) /* check for end of page padding */ if ((iter->head >= rb_page_size(iter->head_page)) && (iter->head_page != cpu_buffer->commit_page)) - rb_advance_iter(iter); + rb_inc_iter(iter); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) -- cgit v1.2.3 From d8a0349c0cea477322c66ea9362f10c62fad5f62 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Tue, 13 Nov 2012 09:53:04 +0800 Subject: tracing: Use this_cpu_ptr per-cpu helper typeof(&buffer) is a pointer to array of 1024 char, or char (*)[1024]. But, typeof(&buffer[0]) is a pointer to char which match the return type of get_trace_buf(). As well-known, the value of &buffer is equal to &buffer[0]. so return this_cpu_ptr(&percpu_buffer->buffer[0]) can avoid type cast. Link: http://lkml.kernel.org/r/50A1A800.3020102@gmail.com Reviewed-by: Christoph Lameter Signed-off-by: Shan Wei Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 2 +- kernel/trace/trace.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c0bd0308741c..71259e2b6b61 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) return; local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); + buf = this_cpu_ptr(bt->msg_data); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c13e46d7d24..f8b7c626f3fd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1517,7 +1517,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer; static char *get_trace_buf(void) { struct trace_buffer_struct *percpu_buffer; - struct trace_buffer_struct *buffer; /* * If we have allocated per cpu buffers, then we do not @@ -1535,9 +1534,7 @@ static char *get_trace_buf(void) if (!percpu_buffer) return NULL; - buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); - - return buffer->buffer; + return this_cpu_ptr(&percpu_buffer->buffer[0]); } static int alloc_percpu_trace_buffer(void) -- cgit v1.2.3 From d24d7dbf3cc49b00a152e55e24f0eeb173c7a971 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Wed, 18 Jul 2012 18:16:44 +0800 Subject: tracing: Verify target file before registering a uprobe event Without this patch, we can register a uprobe event for a directory. Enabling such a uprobe event would anyway fail. Example: $ echo 'p /bin:0x4245c0' > /sys/kernel/debug/tracing/uprobe_events However dirctories cannot be valid targets for uprobe. Hence verify if the target is a regular file during the probe registration. Link: http://lkml.kernel.org/r/20130103004212.690763002@goodmis.org Cc: Namhyung Kim Signed-off-by: Jovi Zhang Acked-by: Srikar Dronamraju [ cleaned up whitespace and removed redundant IS_DIR() check ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c86e6d4f67fb..87b6db4ccbc5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -258,6 +258,10 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; inode = igrab(path.dentry->d_inode); + if (!S_ISREG(inode->i_mode)) { + ret = -EINVAL; + goto fail_address_parse; + } argc -= 2; argv += 2; @@ -356,7 +360,7 @@ fail_address_parse: if (inode) iput(inode); - pr_info("Failed to parse address.\n"); + pr_info("Failed to parse address or file.\n"); return ret; } -- cgit v1.2.3 From 6aea49cb5f3001a8275bf9c9f586ec3eb39af194 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Wed, 21 Nov 2012 15:13:47 +0800 Subject: tracing/syscalls: Make local functions static Some functions in the syscall tracing is used only locally to the file, but they are labeled global. Convert them to static functions. Signed-off-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7609dd6714c2..5329e13e74a1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -77,7 +77,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } -enum print_line_t +static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -130,7 +130,7 @@ end: return TRACE_TYPE_HANDLED; } -enum print_line_t +static enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -270,7 +270,7 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; @@ -305,7 +305,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; @@ -337,7 +337,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_call *call) { int ret = 0; int num; @@ -356,7 +356,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) return ret; } -void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_call *call) { int num; @@ -371,7 +371,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_call *call) { int ret = 0; int num; @@ -390,7 +390,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) return ret; } -void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_call *call) { int num; @@ -459,7 +459,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) return (unsigned long)sys_call_table[nr]; } -int __init init_ftrace_syscalls(void) +static int __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; -- cgit v1.2.3 From a54164114b96b4693b42cdb553260eec41ea4393 Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 19 Dec 2012 16:02:34 +0900 Subject: tracing: Add checks if tr->buffer is NULL in tracing_reset{_online_cpus} max_tr->buffer could be NULL in the tracing_reset{_online_cpus}. In this case, a NULL pointer dereference happens, so we should return immediately from these functions. Note, the current code does not call tracing_reset*() with max_tr when its buffer is NULL, but future code will. This patch is needed to prevent the future code from crashing. Link: http://lkml.kernel.org/r/20121219070234.31200.93863.stgit@liselsia Signed-off-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8b7c626f3fd..72b171b90e55 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -922,6 +922,9 @@ void tracing_reset(struct trace_array *tr, int cpu) { struct ring_buffer *buffer = tr->buffer; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ @@ -936,6 +939,9 @@ void tracing_reset_online_cpus(struct trace_array *tr) struct ring_buffer *buffer = tr->buffer; int cpu; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ -- cgit v1.2.3 From 84c6cf0db6a00601eb43cfc08244a398ffb0894c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Dec 2012 21:43:52 -0500 Subject: tracing: Remove unneeded check of max_tr->buffer before tracing_reset There's now a check in tracing_reset_online_cpus() if the buffer is allocated or NULL. No need to do a check before calling it with max_tr. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 72b171b90e55..d62248dfda76 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4040,8 +4040,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, * Reset the buffer so that it doesn't have incomparable timestamps. */ tracing_reset_online_cpus(&global_trace); - if (max_tr.buffer) - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&max_tr); mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From 8741db532e86da2e54f05be751bfe1922ca63d57 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Jan 2013 10:49:37 -0500 Subject: tracing/fgraph: Add max_graph_depth to limit function_graph depth Add the file max_graph_depth to the debug tracing directory that lets the user define the depth of the function graph. A very useful operation is to set the depth to 1. Then it traces only the first function that is called when entering the kernel. This can be used to determine what system operations interrupt a process. For example, to work on NOHZ processes (single tasks running without a timer tick), if any interrupt goes off and preempts that task, this code will show it happening. # cd /sys/kernel/debug/tracing # echo 1 > max_graph_depth # echo function_graph > current_tracer # cat per_cpu/cpu//trace Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 60 ++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4edb4b74eb7e..7008d2e13cf2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -47,6 +47,8 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_IRQS 0x40 +static unsigned int max_depth; + static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, @@ -250,8 +252,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. */ - if (!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) + if ((!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) || + (max_depth && trace->depth >= max_depth)) return 0; local_irq_save(flags); @@ -1457,6 +1460,59 @@ static struct tracer graph_trace __read_mostly = { #endif }; + +static ssize_t +graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + max_depth = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ + int n; + + n = sprintf(buf, "%d\n", max_depth); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static const struct file_operations graph_depth_fops = { + .open = tracing_open_generic, + .write = graph_depth_write, + .read = graph_depth_read, + .llseek = generic_file_llseek, +}; + +static __init int init_graph_debugfs(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + trace_create_file("max_graph_depth", 0644, d_tracer, + NULL, &graph_depth_fops); + + return 0; +} +fs_initcall(init_graph_debugfs); + static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); -- cgit v1.2.3 From 06aeaaeabf69da4a3e86df532425640f51b01cef Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Sep 2012 17:15:17 +0900 Subject: ftrace: Move ARCH_SUPPORTS_FTRACE_SAVE_REGS in Kconfig Move SAVE_REGS support flag into Kconfig and rename it to CONFIG_DYNAMIC_FTRACE_WITH_REGS. This also introduces CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS which indicates the architecture depending part of ftrace has a code that saves full registers. On the other hand, CONFIG_DYNAMIC_FTRACE_WITH_REGS indicates the code is enabled. Link: http://lkml.kernel.org/r/20120928081516.3560.72534.stgit@ltc138.sdl.hitachi.co.jp Cc: Ingo Molnar Cc: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ftrace.c | 6 +++--- kernel/trace/trace_selftest.c | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d89335a485f..cdc9d284d24e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE help See Documentation/trace/ftrace-design.txt +config HAVE_DYNAMIC_FTRACE_WITH_REGS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -434,6 +437,11 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config DYNAMIC_FTRACE_WITH_REGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_REGS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 41473b4ad7a4..6e34dc162fe1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -337,7 +337,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) return -EINVAL; -#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* * If the ftrace_ops specifies SAVE_REGS, then it only can be used * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. @@ -4143,8 +4143,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * Archs are to support both the regs and ftrace_ops at the same time. * If they support ftrace_ops, it is assumed they support regs. * If call backs want to use regs, they must either check for regs - * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. - * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. + * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. + * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still * set the ARCH_SUPPORT_FTARCE_OPS. */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 47623169a815..6c62d58d8e87 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -568,7 +568,7 @@ trace_selftest_function_regs(void) int ret; int supported = 0; -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS supported = 1; #endif -- cgit v1.2.3 From e7dbfe349d12eabb7783b117e0c115f6f3d9ef9e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Sep 2012 17:15:20 +0900 Subject: kprobes/x86: Move ftrace-based kprobe code into kprobes-ftrace.c Split ftrace-based kprobes code from kprobes, and introduce CONFIG_(HAVE_)KPROBES_ON_FTRACE Kconfig flags. For the cleanup reason, this also moves kprobe_ftrace check into skip_singlestep. Link: http://lkml.kernel.org/r/20120928081520.3560.25624.stgit@ltc138.sdl.hitachi.co.jp Cc: Ingo Molnar Cc: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 098f396aa409..f423c3ef4a82 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -919,7 +919,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) } #endif /* CONFIG_OPTPROBES */ -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS, @@ -964,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) (unsigned long)p->addr, 1, 0); WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); } -#else /* !KPROBES_CAN_USE_FTRACE */ +#else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) #define arm_kprobe_ftrace(p) do {} while (0) #define disarm_kprobe_ftrace(p) do {} while (0) @@ -1414,12 +1414,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, */ ftrace_addr = ftrace_location((unsigned long)p->addr); if (ftrace_addr) { -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE /* Given address is not on the instruction boundary */ if ((unsigned long)p->addr != ftrace_addr) return -EILSEQ; p->flags |= KPROBE_FLAG_FTRACE; -#else /* !KPROBES_CAN_USE_FTRACE */ +#else /* !CONFIG_KPROBES_ON_FTRACE */ return -EINVAL; #endif } -- cgit v1.2.3 From b000c8065a92b0fe0e1694f41b2c8d8ba7b7b1ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Jan 2013 10:31:20 -0500 Subject: tracing: Remove the extra 4 bytes of padding in events Due to a userspace issue with PowerTop v2beta, which hardcoded the offset of event fields that it was using, it broke when we removed the Big Kernel Lock counter from the event header. (commit e6e1e2593 "tracing: Remove lock_depth from event entry") Because this broke userspace, it was determined that we must keep those 4 bytes around. (commit a3a4a5acd "Regression: partial revert "tracing: Remove lock_depth from event entry"") This unfortunately wastes space in the ring buffer. 4 bytes per event, where a lot of events are just 24 bytes. That's 16% of the buffer wasted. A million events will add 4 megs of white space into the buffer. It was later noticed that PowerTop v2beta could not work on systems where the kernel was 64 bit but the userspace was 32 bits. The reason was because the offsets are different between the two and the hard coded offset of one would not work with the other. With PowerTop v2 final, it implemented the same interface that both perf and trace-cmd use. That is, it reads the format file of the event to find the offsets of the fields it needs. This fixes the problem with running powertop on a 32 bit userspace running on a 64 bit kernel. It also no longer requires the 4 byte padding. As PowerTop v2 has been out for a while, and is included in all major distributions, it is time that we can safely remove the 4 bytes of padding. Users of PowerTop v2beta should upgrade to PowerTop v2 final. Cc: Linus Torvalds Acked-by: Arjan van de Ven Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 - kernel/trace/trace_events.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d62248dfda76..a387bd271e71 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1173,7 +1173,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->padding = 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 880073d0b946..57e9b284250c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, padding); return ret; } -- cgit v1.2.3 From 0a71e4c6d749d06f52e75a406fc9046924fcfcc1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 12:06:56 -0500 Subject: tracing: Remove trace.h header from trace_clock.c As trace_clock is used by other things besides tracing, and it does not require anything from trace.h, it is best not to include the header file in trace_clock.c. Signed-off-by: Steven Rostedt --- kernel/trace/trace_clock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 394783531cbb..22b638b28e48 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -21,8 +21,6 @@ #include #include -#include "trace.h" - /* * trace_clock_local(): the simplest and least coherent tracing clock. * -- cgit v1.2.3 From 2739d3cce9816805fe26774fea2527d5b16e924d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 21 Jan 2013 18:18:33 +0800 Subject: cgroup: fix bogus kernel warnings when cgroup_create() failed If cgroup_create() failed and cgroup_destroy_locked() is called to do cleanup, we'll see a bunch of warnings: cgroup_addrm_files: failed to remove 2MB.limit_in_bytes, err=-2 cgroup_addrm_files: failed to remove 2MB.usage_in_bytes, err=-2 cgroup_addrm_files: failed to remove 2MB.max_usage_in_bytes, err=-2 cgroup_addrm_files: failed to remove 2MB.failcnt, err=-2 cgroup_addrm_files: failed to remove prioidx, err=-2 cgroup_addrm_files: failed to remove ifpriomap, err=-2 ... We failed to remove those files, because cgroup_create() has failed before creating those cgroup files. To fix this, we simply don't warn if cgroup_rm_file() can't find the cft entry. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a893985601e9..ad3359f903f0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -921,13 +921,17 @@ static void remove_dir(struct dentry *d) dput(parent); } -static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { struct cfent *cfe; lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); + /* + * If we're doing cleanup due to failure of cgroup_create(), + * the corresponding @cfe may not exist. + */ list_for_each_entry(cfe, &cgrp->files, node) { struct dentry *d = cfe->dentry; @@ -940,9 +944,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) list_del_init(&cfe->node); dput(d); - return 0; + break; } - return -ENOENT; } /** @@ -2758,14 +2761,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) continue; - if (is_add) + if (is_add) { err = cgroup_add_file(cgrp, subsys, cft); - else - err = cgroup_rm_file(cgrp, cft); - if (err) { - pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", - is_add ? "add" : "remove", cft->name, err); + if (err) + pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", + cft->name, err); ret = err; + } else { + cgroup_rm_file(cgrp, cft); } } return ret; -- cgit v1.2.3 From 0fdff3ec6d87856cdcc99e69cf42143fdd6c56b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 22 Jan 2013 16:48:03 -0800 Subject: async, kmod: warn on synchronous request_module() from async workers Synchronous requet_module() from an async worker can lead to deadlock because module init path may invoke async_synchronize_full(). The async worker waits for request_module() to complete and the module loading waits for the async task to finish. This bug happened in the block layer because of default elevator auto-loading. Block layer has been updated not to do default elevator auto-loading and it has been decided to disallow synchronous request_module() from async workers. Trigger WARN_ON_ONCE() on synchronous request_module() from async workers. For more details, please refer to the following thread. http://thread.gmane.org/gmane.linux.kernel/1420814 Signed-off-by: Tejun Heo Reported-by: Alex Riesen Cc: Linus Torvalds Cc: Arjan van de Ven --- kernel/kmod.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 1c317e386831..ecd42b484db8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + /* + * We don't allow synchronous module loading from async. Module + * init may invoke async_synchronize_full() which will end up + * waiting for this task which already is waiting for the module + * loading to complete, leading to a deadlock. + */ + WARN_ON_ONCE(wait && current_is_async()); + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); -- cgit v1.2.3 From 34600f0e9c33c9cd48ae87448205f51332b7d5a0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 13:35:11 -0500 Subject: tracing: Fix race with max_tr and changing tracers There's a race condition between the setting of a new tracer and the update of the max trace buffers (the swap). When a new tracer is added, it sets current_trace to nop_trace before disabling the old tracer. At this moment, if the old tracer uses update_max_tr(), the update may trigger the warning against !current_trace->use_max-tr, as nop_trace doesn't have that set. As update_max_tr() requires that interrupts be disabled, we can add a check to see if current_trace == nop_trace and bail if it does. Then when disabling the current_trace, set it to nop_trace and run synchronize_sched(). This will make sure all calls to update_max_tr() have completed (it was called with interrupts disabled). As a clean up, this commit also removes shrinking and recreating the max_tr buffer if the old and new tracers both have use_max_tr set. The old way use to always shrink the buffer, and then expand it for the next tracer. This is a waste of time. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a387bd271e71..d2a658349ca1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -709,10 +709,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + + /* If we disabled the tracer, stop now */ + if (current_trace == &nop_trace) return; - } + + if (WARN_ON_ONCE(!current_trace->use_max_tr)) + return; + arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; @@ -3185,6 +3189,7 @@ static int tracing_set_tracer(const char *buf) static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; + bool had_max_tr; int ret = 0; mutex_lock(&trace_types_lock); @@ -3211,7 +3216,19 @@ static int tracing_set_tracer(const char *buf) trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); - if (current_trace && current_trace->use_max_tr) { + + had_max_tr = current_trace && current_trace->use_max_tr; + current_trace = &nop_trace; + + if (had_max_tr && !t->use_max_tr) { + /* + * We need to make sure that the update_max_tr sees that + * current_trace changed to nop_trace to keep it from + * swapping the buffers after we resize it. + * The update_max_tr is called from interrupts disabled + * so a synchronized_sched() is sufficient. + */ + synchronize_sched(); /* * We don't free the ring buffer. instead, resize it because * The max_tr ring buffer has some state (e.g. ring->clock) and @@ -3222,10 +3239,8 @@ static int tracing_set_tracer(const char *buf) } destroy_trace_option_files(topts); - current_trace = &nop_trace; - topts = create_trace_option_files(t); - if (t->use_max_tr) { + if (t->use_max_tr && !had_max_tr) { /* we need to make per cpu buffer sizes equivalent */ ret = resize_buffer_duplicate_size(&max_tr, &global_trace, RING_BUFFER_ALL_CPUS); -- cgit v1.2.3 From 05cbbf643b8eea1be21082c53cdb856d1dc6d765 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 23:35:11 -0500 Subject: tracing: Fix selftest function recursion accounting The test that checks function recursion does things differently if the arch does not support all ftrace features. But that really doesn't make a difference with how the test runs, and either way the count variable should be 2 at the end. Currently the test wrongly fails for archs that don't support all the ftrace features. Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 6c62d58d8e87..adb008a0136f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -452,7 +452,6 @@ trace_selftest_function_recursion(void) char *func_name; int len; int ret; - int cnt; /* The previous test PASSED */ pr_cont("PASSED\n"); @@ -510,19 +509,10 @@ trace_selftest_function_recursion(void) unregister_ftrace_function(&test_recsafe_probe); - /* - * If arch supports all ftrace features, and no other task - * was on the list, we should be fine. - */ - if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) - cnt = 2; /* Should have recursed */ - else - cnt = 1; - ret = -1; - if (trace_selftest_recursion_cnt != cnt) { - pr_cont("*callback not called expected %d times (%d)* ", - cnt, trace_selftest_recursion_cnt); + if (trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called expected 2 times (%d)* ", + trace_selftest_recursion_cnt); goto out; } -- cgit v1.2.3 From 6350379452ccaeaa71734adf57dec2ebc9207849 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 16:58:56 -0400 Subject: ftrace: Fix global function tracers that are not recursion safe If one of the function tracers set by the global ops is not recursion safe, it can still be called directly without the added recursion supplied by the ftrace infrastructure. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6e34dc162fe1..789cbec24e81 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -221,10 +221,24 @@ static void update_global_ops(void) * registered callers. */ if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) + ftrace_global_list->next == &ftrace_list_end) { func = ftrace_global_list->func; - else + /* + * As we are calling the function directly. + * If it does not have recursion protection, + * the function_trace_op needs to be updated + * accordingly. + */ + if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; + else + global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; + } else { func = ftrace_global_list_func; + /* The list has its own recursion protection. */ + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; + } + /* If we filter on pids, update to use the pid function */ if (!list_empty(&ftrace_pids)) { -- cgit v1.2.3 From 9640388b63556b4cfecbb5aaf91a5c99d272f429 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:01:20 -0400 Subject: ftrace: Fix function tracing recursion self test The function tracing recursion self test should not crash the machine if the resursion test fails. If it detects that the function tracing is recursing when it should not be, then bail, don't go into an infinite recursive loop. Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index adb008a0136f..51c819c12c29 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip, * The ftrace infrastructure should provide the recursion * protection. If not, this will crash the kernel! */ - trace_selftest_recursion_cnt++; + if (trace_selftest_recursion_cnt++ > 10) + return; DYN_FTRACE_TEST_NAME(); } -- cgit v1.2.3 From 0a016409e42f273415f8225ddf2c58eb2df88034 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:03:03 -0400 Subject: ftrace: Optimize the function tracer list loop There is lots of places that perform: op = rcu_dereference_raw(ftrace_control_list); while (op != &ftrace_list_end) { Add a helper macro to do this, and also optimize for a single entity. That is, gcc will optimize a loop for either no iterations or more than one iteration. But usually only a single callback is registered to the function tracer, thus the optimized case should be a single pass. to do this we now do: op = rcu_dereference_raw(list); do { [...] } while (likely(op = rcu_dereference_raw((op)->next)) && unlikely((op) != &ftrace_list_end)); An op is always registered (ftrace_list_end when no callbacks is registered), thus when a single callback is registered, the link list looks like: top => callback => ftrace_list_end => NULL. The likely(op = op->next) still must be performed due to the race of removing the callback, where the first op assignment could equal ftrace_list_end. In that case, the op->next would be NULL. But this is unlikely (only happens in a race condition when removing the callback). But it is very likely that the next op would be ftrace_list_end, unless more than one callback has been registered. This tells gcc what the most common case is and makes the fast path with the least amount of branches. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 789cbec24e81..1330969d8447 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) #endif +/* + * Traverse the ftrace_global_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + /** * ftrace_nr_registered_ops - return number of ops registered * @@ -132,15 +152,6 @@ int ftrace_nr_registered_ops(void) return cnt; } -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) @@ -149,11 +160,9 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, return; trace_recursion_set(TRACE_GLOBAL_BIT); - op = rcu_dereference_raw(ftrace_global_list); /*see above*/ - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); - op = rcu_dereference_raw(op->next); /*see above*/ - }; + } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_GLOBAL_BIT); } @@ -4104,14 +4113,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); - op = rcu_dereference_raw(ftrace_control_list); - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_control_list) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); - - op = rcu_dereference_raw(op->next); - }; + } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_CONTROL_BIT); preempt_enable_notrace(); } @@ -4139,12 +4145,10 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * they must be freed after a synchronize_sched(). */ preempt_disable_notrace(); - op = rcu_dereference_raw(ftrace_ops_list); - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_ops_list) { if (ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); - op = rcu_dereference_raw(op->next); - }; + } while_for_each_ftrace_op(op); preempt_enable_notrace(); trace_recursion_clear(TRACE_INTERNAL_BIT); } -- cgit v1.2.3 From c29f122cd7fc178b72b1335b1fce0dff2e5c0f5d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:17:59 -0400 Subject: ftrace: Add context level recursion bit checking Currently for recursion checking in the function tracer, ftrace tests a task_struct bit to determine if the function tracer had recursed or not. If it has, then it will will return without going further. But this leads to races. If an interrupt came in after the bit was set, the functions being traced would see that bit set and think that the function tracer recursed on itself, and would return. Instead add a bit for each context (normal, softirq, irq and nmi). A check of which context the task is in is made before testing the associated bit. Now if an interrupt preempts the function tracer after the previous context has been set, the interrupt functions can still be traced. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 40 +++++++++++++++++++++++++++++++++------- kernel/trace/trace.h | 12 +++++++++--- 2 files changed, 42 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1330969d8447..639b6ab1f04c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -156,14 +156,27 @@ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = TRACE_GLOBAL_NMI_BIT; + + else if (in_irq()) + bit = TRACE_GLOBAL_IRQ_BIT; + else + bit = TRACE_GLOBAL_SIRQ_BIT; + } else + bit = TRACE_GLOBAL_BIT; + + if (unlikely(trace_recursion_test(bit))) return; - trace_recursion_set(TRACE_GLOBAL_BIT); + trace_recursion_set(bit); do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); - trace_recursion_clear(TRACE_GLOBAL_BIT); + trace_recursion_clear(bit); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, @@ -4132,14 +4145,27 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; + unsigned int bit; if (function_trace_stop) return; - if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) - return; + if (in_interrupt()) { + if (in_nmi()) + bit = TRACE_INTERNAL_NMI_BIT; + + else if (in_irq()) + bit = TRACE_INTERNAL_IRQ_BIT; + else + bit = TRACE_INTERNAL_SIRQ_BIT; + } else + bit = TRACE_INTERNAL_BIT; + + if (unlikely(trace_recursion_test(bit))) + return; + + trace_recursion_set(bit); - trace_recursion_set(TRACE_INTERNAL_BIT); /* * Some of the ops may be dynamically allocated, * they must be freed after a synchronize_sched(). @@ -4150,7 +4176,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); - trace_recursion_clear(TRACE_INTERNAL_BIT); + trace_recursion_clear(bit); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c75d7988902c..fe6ccff9cc70 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -299,8 +299,14 @@ struct tracer { /* for function tracing recursion */ #define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_GLOBAL_BIT (1<<12) -#define TRACE_CONTROL_BIT (1<<13) +#define TRACE_INTERNAL_NMI_BIT (1<<12) +#define TRACE_INTERNAL_IRQ_BIT (1<<13) +#define TRACE_INTERNAL_SIRQ_BIT (1<<14) +#define TRACE_GLOBAL_BIT (1<<15) +#define TRACE_GLOBAL_NMI_BIT (1<<16) +#define TRACE_GLOBAL_IRQ_BIT (1<<17) +#define TRACE_GLOBAL_SIRQ_BIT (1<<18) +#define TRACE_CONTROL_BIT (1<<19) /* * Abuse of the trace_recursion. @@ -309,7 +315,7 @@ struct tracer { * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. */ -#define TRACE_IRQ_BIT (1<<13) +#define TRACE_IRQ_BIT (1<<20) #define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -- cgit v1.2.3 From e46cbf75c621725964fe1f6e7013e8bcd86a0e3d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:32:25 -0400 Subject: tracing: Make the trace recursion bits into enums Convert the bits into enums which makes the code a little easier to maintain. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fe6ccff9cc70..5a095d6f088d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -298,15 +298,18 @@ struct tracer { #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) /* for function tracing recursion */ -#define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_INTERNAL_NMI_BIT (1<<12) -#define TRACE_INTERNAL_IRQ_BIT (1<<13) -#define TRACE_INTERNAL_SIRQ_BIT (1<<14) -#define TRACE_GLOBAL_BIT (1<<15) -#define TRACE_GLOBAL_NMI_BIT (1<<16) -#define TRACE_GLOBAL_IRQ_BIT (1<<17) -#define TRACE_GLOBAL_SIRQ_BIT (1<<18) -#define TRACE_CONTROL_BIT (1<<19) +enum { + TRACE_INTERNAL_BIT = 11, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_GLOBAL_BIT, + TRACE_GLOBAL_NMI_BIT, + TRACE_GLOBAL_IRQ_BIT, + TRACE_GLOBAL_SIRQ_BIT, + + TRACE_CONTROL_BIT, /* * Abuse of the trace_recursion. @@ -315,11 +318,12 @@ struct tracer { * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. */ -#define TRACE_IRQ_BIT (1<<20) + TRACE_IRQ_BIT, +}; -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) #define TRACE_PIPE_ALL_CPU -1 -- cgit v1.2.3 From edc15cafcbfa3d73f819cae99885a2e35e4cbce5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:47:21 -0400 Subject: tracing: Avoid unnecessary multiple recursion checks When function tracing occurs, the following steps are made: If arch does not support a ftrace feature: call internal function (uses INTERNAL bits) which calls... If callback is registered to the "global" list, the list function is called and recursion checks the GLOBAL bits. then this function calls... The function callback, which can use the FTRACE bits to check for recursion. Now if the arch does not suppport a feature, and it calls the global list function which calls the ftrace callback all three of these steps will do a recursion protection. There's no reason to do one if the previous caller already did. The recursion that we are protecting against will go through the same steps again. To prevent the multiple recursion checks, if a recursion bit is set that is higher than the MAX bit of the current check, then we know that the check was made by the previous caller, and we can skip the current check. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 40 +++++-------------- kernel/trace/trace.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 110 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 639b6ab1f04c..ce8c3d68292f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -158,25 +158,15 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, { int bit; - if (in_interrupt()) { - if (in_nmi()) - bit = TRACE_GLOBAL_NMI_BIT; - - else if (in_irq()) - bit = TRACE_GLOBAL_IRQ_BIT; - else - bit = TRACE_GLOBAL_SIRQ_BIT; - } else - bit = TRACE_GLOBAL_BIT; - - if (unlikely(trace_recursion_test(bit))) + bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); + if (bit < 0) return; - trace_recursion_set(bit); do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); - trace_recursion_clear(bit); + + trace_clear_recursion(bit); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, @@ -4145,26 +4135,14 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; - unsigned int bit; + int bit; if (function_trace_stop) return; - if (in_interrupt()) { - if (in_nmi()) - bit = TRACE_INTERNAL_NMI_BIT; - - else if (in_irq()) - bit = TRACE_INTERNAL_IRQ_BIT; - else - bit = TRACE_INTERNAL_SIRQ_BIT; - } else - bit = TRACE_INTERNAL_BIT; - - if (unlikely(trace_recursion_test(bit))) - return; - - trace_recursion_set(bit); + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) + return; /* * Some of the ops may be dynamically allocated, @@ -4176,7 +4154,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); - trace_recursion_clear(bit); + trace_clear_recursion(bit); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5a095d6f088d..c203a51dd412 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -297,18 +297,49 @@ struct tracer { /* Ring buffer has the 10 LSB bits to count */ #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) -/* for function tracing recursion */ +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. + * + * Now if the arch does not suppport a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ enum { - TRACE_INTERNAL_BIT = 11, - TRACE_INTERNAL_NMI_BIT, - TRACE_INTERNAL_IRQ_BIT, - TRACE_INTERNAL_SIRQ_BIT, + TRACE_FTRACE_BIT = 11, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + /* GLOBAL_BITs must be greater than FTRACE_BITs */ TRACE_GLOBAL_BIT, TRACE_GLOBAL_NMI_BIT, TRACE_GLOBAL_IRQ_BIT, TRACE_GLOBAL_SIRQ_BIT, + /* INTERNAL_BITs must be greater than GLOBAL_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + TRACE_CONTROL_BIT, /* @@ -325,6 +356,71 @@ enum { #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT +#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) + return -1; + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; + + if (!bit) + return; + + bit = 1 << bit; + val &= ~bit; + + barrier(); + current->trace_recursion = val; +} + #define TRACE_PIPE_ALL_CPU -1 static inline struct ring_buffer_iter * -- cgit v1.2.3 From 897f68a48b1f8fb6cb7493e1ee37e3ed7f879937 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:52:35 -0400 Subject: ftrace: Use only the preempt version of function tracing The function tracer had two different versions of function tracing. The disabling of irqs version and the preempt disable version. As function tracing in very intrusive and can cause nasty recursion issues, it has its own recursion protection. But the old method to do this was a flat layer. If it detected that a recursion was happening then it would just return without recording. This made the preempt version (much faster than the irq disabling one) not very useful, because if an interrupt were to occur after the recursion flag was set, the interrupt would not be traced at all, because every function that was traced would think it recursed on itself (due to the context it preempted setting the recursive flag). Now that we have a recursion flag for every context level, we no longer need to worry about that. We can disable preemption, set the current context recursion check bit, and go on. If an interrupt were to come along, it would check its own context bit and happily continue to trace. As the preempt version is faster than the irq disable version, there's no more reason to keep the preempt version around. And the irq disable version still had an issue with missing out on tracing NMI code. Remove the irq disable function tracer version and have the preempt disable version be the default (and only version). Before this patch we had from running: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 12.028 Time: 11.945 Time: 11.925 Time: 11.964 Time: 12.002 Time: 11.910 Time: 11.944 Time: 11.929 Time: 11.941 Time: 11.924 (average: 11.9512) Now we have: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 10.285 Time: 10.407 Time: 10.243 Time: 10.372 Time: 10.380 Time: 10.198 Time: 10.272 Time: 10.354 Time: 10.248 Time: 10.253 (average: 10.3012) a 13.8% savings! Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 61 ++++++++++-------------------------------- 1 file changed, 14 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8e3ad8082ab7..1c327ef13a9a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); } -static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) -{ - struct trace_array *tr = func_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - pc = preempt_count(); - preempt_disable_notrace(); - local_save_flags(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, pc); - - atomic_dec(&data->disabled); - preempt_enable_notrace(); -} - /* Our option */ enum { TRACE_FUNC_OPT_STACK = 0x1, @@ -85,34 +57,34 @@ static struct tracer_flags func_flags; static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) - { struct trace_array *tr = func_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; + unsigned int bit; int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. - */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + pc = preempt_count(); + preempt_disable_notrace(); - if (likely(disabled == 1)) { - pc = preempt_count(); + bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + if (bit < 0) + goto out; + + cpu = smp_processor_id(); + data = tr->data[cpu]; + if (!atomic_read(&data->disabled)) { + local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); } + trace_clear_recursion(bit); - atomic_dec(&data->disabled); - local_irq_restore(flags); + out: + preempt_enable_notrace(); } static void @@ -185,11 +157,6 @@ static void tracing_start_function_trace(void) { ftrace_function_enabled = 0; - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - if (func_flags.val & TRACE_FUNC_OPT_STACK) register_ftrace_function(&trace_stack_ops); else -- cgit v1.2.3 From 567cd4da54ff45513d2ca1f0e3cb9ba45b66d6cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 18:33:05 -0400 Subject: ring-buffer: User context bit recursion checking Using context bit recursion checking, we can help increase the performance of the ring buffer. Before this patch: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 10.285 Time: 10.407 Time: 10.243 Time: 10.372 Time: 10.380 Time: 10.198 Time: 10.272 Time: 10.354 Time: 10.248 Time: 10.253 (average: 10.3012) Now we have: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 9.712 Time: 9.824 Time: 9.861 Time: 9.827 Time: 9.962 Time: 9.905 Time: 9.886 Time: 10.088 Time: 9.861 Time: 9.834 (average: 9.876) a 4% savings! Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 85 ++++++++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 13 +++---- 2 files changed, 67 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6ff9cc4658ed..481e26269281 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2432,41 +2432,76 @@ rb_reserve_next_event(struct ring_buffer *buffer, #ifdef CONFIG_TRACING -#define TRACE_RECURSIVE_DEPTH 16 +/* + * The lock and unlock are done within a preempt disable section. + * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ +static DEFINE_PER_CPU(unsigned int, current_context); -/* Keep this code out of the fast path cache */ -static noinline void trace_recursive_fail(void) +static __always_inline int trace_recursive_lock(void) { - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); - - printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - trace_recursion_buffer(), - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); + unsigned int val = this_cpu_read(current_context); + int bit; - WARN_ON_ONCE(1); -} - -static inline int trace_recursive_lock(void) -{ - trace_recursion_inc(); + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; - if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) - return 0; + if (unlikely(val & (1 << bit))) + return 1; - trace_recursive_fail(); + val |= (1 << bit); + this_cpu_write(current_context, val); - return -1; + return 0; } -static inline void trace_recursive_unlock(void) +static __always_inline void trace_recursive_unlock(void) { - WARN_ON_ONCE(!trace_recursion_buffer()); + unsigned int val = this_cpu_read(current_context); - trace_recursion_dec(); + val--; + val &= this_cpu_read(current_context); + this_cpu_write(current_context, val); } #else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c203a51dd412..04a2c7ab1735 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -291,11 +291,6 @@ struct tracer { /* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) - -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) /* * For function tracing recursion: @@ -323,7 +318,13 @@ struct tracer { * caller, and we can skip the current check. */ enum { - TRACE_FTRACE_BIT = 11, + TRACE_BUFFER_BIT, + TRACE_BUFFER_NMI_BIT, + TRACE_BUFFER_IRQ_BIT, + TRACE_BUFFER_SIRQ_BIT, + + /* Start of function recursion bits */ + TRACE_FTRACE_BIT, TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, -- cgit v1.2.3 From 0b07436d95b5404134da4d661fd183eac863513e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 16:58:30 -0500 Subject: ring-buffer: Remove trace.h from ring_buffer.c ring_buffer.c use to require declarations from trace.h, but these have moved to the generic header files. There's nothing in trace.h that ring_buffer.c requires. There's some headers that trace.h included that ring_buffer.c needs, but it's best that it includes them directly, and not include trace.h. Also, some things may use ring_buffer.c without having tracing configured. This removes the dependency that may come in the future. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 481e26269281..13950d9027cb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3,8 +3,10 @@ * * Copyright (C) 2008 Steven Rostedt */ +#include #include #include +#include #include #include #include @@ -21,7 +23,6 @@ #include #include -#include "trace.h" static void update_pages_handler(struct work_struct *work); -- cgit v1.2.3 From 8723d5037cafea09c7242303c6c8e5d7058cec61 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Jan 2013 09:32:30 -0800 Subject: async: bring sanity to the use of words domain and running In the beginning, running lists were literal struct list_heads. Later on, struct async_domain was added. For some reason, while the conversion substituted list_heads with async_domains, the variable names weren't fully converted. In more places, "running" was used for struct async_domain while other places adopted new "domain" name. The situation is made much worse by having async_domain's running list named "domain" and async_entry's field pointing to async_domain named "running". So, we end up with mix of "running" and "domain" for variable names for async_domain, with the field names of async_domain and async_entry swapped between "running" and "domain". It feels almost intentionally made to be as confusing as possible. Bring some sanity by * Renaming all async_domain variables "domain". * s/async_running/async_dfl_domain/ * s/async_domain->domain/async_domain->running/ * s/async_entry->running/async_entry->domain/ Signed-off-by: Tejun Heo Cc: Arjan van de Ven Cc: Dan Williams Cc: Linus Torvalds --- kernel/async.c | 68 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 6c68fc3fae7b..29d51d483bee 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -64,7 +64,7 @@ static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 static LIST_HEAD(async_pending); -static ASYNC_DOMAIN(async_running); +static ASYNC_DOMAIN(async_dfl_domain); static LIST_HEAD(async_domains); static DEFINE_SPINLOCK(async_lock); static DEFINE_MUTEX(async_register_mutex); @@ -75,7 +75,7 @@ struct async_entry { async_cookie_t cookie; async_func_ptr *func; void *data; - struct async_domain *running; + struct async_domain *domain; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); @@ -86,7 +86,7 @@ static atomic_t entry_count; /* * MUST be called with the lock held! */ -static async_cookie_t __lowest_in_progress(struct async_domain *running) +static async_cookie_t __lowest_in_progress(struct async_domain *domain) { async_cookie_t first_running = next_cookie; /* infinity value */ async_cookie_t first_pending = next_cookie; /* ditto */ @@ -96,13 +96,13 @@ static async_cookie_t __lowest_in_progress(struct async_domain *running) * Both running and pending lists are sorted but not disjoint. * Take the first cookies from both and return the min. */ - if (!list_empty(&running->domain)) { - entry = list_first_entry(&running->domain, typeof(*entry), list); + if (!list_empty(&domain->running)) { + entry = list_first_entry(&domain->running, typeof(*entry), list); first_running = entry->cookie; } list_for_each_entry(entry, &async_pending, list) { - if (entry->running == running) { + if (entry->domain == domain) { first_pending = entry->cookie; break; } @@ -111,13 +111,13 @@ static async_cookie_t __lowest_in_progress(struct async_domain *running) return min(first_running, first_pending); } -static async_cookie_t lowest_in_progress(struct async_domain *running) +static async_cookie_t lowest_in_progress(struct async_domain *domain) { unsigned long flags; async_cookie_t ret; spin_lock_irqsave(&async_lock, flags); - ret = __lowest_in_progress(running); + ret = __lowest_in_progress(domain); spin_unlock_irqrestore(&async_lock, flags); return ret; } @@ -132,11 +132,11 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *pos; unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; - struct async_domain *running = entry->running; + struct async_domain *domain = entry->domain; /* 1) move self to the running queue, make sure it stays sorted */ spin_lock_irqsave(&async_lock, flags); - list_for_each_entry_reverse(pos, &running->domain, list) + list_for_each_entry_reverse(pos, &domain->running, list) if (entry->cookie < pos->cookie) break; list_move_tail(&entry->list, &pos->list); @@ -162,8 +162,8 @@ static void async_run_entry_fn(struct work_struct *work) /* 3) remove self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); - if (running->registered && --running->count == 0) - list_del_init(&running->node); + if (domain->registered && --domain->count == 0) + list_del_init(&domain->node); /* 4) free the entry */ kfree(entry); @@ -175,7 +175,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -201,13 +201,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a INIT_WORK(&entry->work, async_run_entry_fn); entry->func = ptr; entry->data = data; - entry->running = running; + entry->domain = domain; spin_lock_irqsave(&async_lock, flags); newcookie = entry->cookie = next_cookie++; list_add_tail(&entry->list, &async_pending); - if (running->registered && running->count++ == 0) - list_add_tail(&running->node, &async_domains); + if (domain->registered && domain->count++ == 0) + list_add_tail(&domain->node, &async_domains); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); @@ -230,7 +230,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a */ async_cookie_t async_schedule(async_func_ptr *ptr, void *data) { - return __async_schedule(ptr, data, &async_running); + return __async_schedule(ptr, data, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_schedule); @@ -238,18 +238,18 @@ EXPORT_SYMBOL_GPL(async_schedule); * async_schedule_domain - schedule a function for asynchronous execution within a certain domain * @ptr: function to execute asynchronously * @data: data pointer to pass to the function - * @running: running list for the domain + * @domain: the domain * * Returns an async_cookie_t that may be used for checkpointing later. - * @running may be used in the async_synchronize_*_domain() functions - * to wait within a certain synchronization domain rather than globally. - * A synchronization domain is specified via the running queue @running to use. - * Note: This function may be called from atomic or non-atomic contexts. + * @domain may be used in the async_synchronize_*_domain() functions to + * wait within a certain synchronization domain rather than globally. A + * synchronization domain is specified via @domain. Note: This function + * may be called from atomic or non-atomic contexts. */ async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct async_domain *running) + struct async_domain *domain) { - return __async_schedule(ptr, data, running); + return __async_schedule(ptr, data, domain); } EXPORT_SYMBOL_GPL(async_schedule_domain); @@ -289,7 +289,7 @@ void async_unregister_domain(struct async_domain *domain) mutex_lock(&async_register_mutex); spin_lock_irq(&async_lock); WARN_ON(!domain->registered || !list_empty(&domain->node) || - !list_empty(&domain->domain)); + !list_empty(&domain->running)); domain->registered = 0; spin_unlock_irq(&async_lock); mutex_unlock(&async_register_mutex); @@ -298,10 +298,10 @@ EXPORT_SYMBOL_GPL(async_unregister_domain); /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @domain: running list to synchronize on + * @domain: the domain to synchronize * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @domain have been done. + * synchronization domain specified by @domain have been done. */ void async_synchronize_full_domain(struct async_domain *domain) { @@ -312,17 +312,17 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); /** * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing * @cookie: async_cookie_t to use as checkpoint - * @running: running list to synchronize on + * @domain: the domain to synchronize * * This function waits until all asynchronous function calls for the - * synchronization domain specified by running list @running submitted - * prior to @cookie have been done. + * synchronization domain specified by @domain submitted prior to @cookie + * have been done. */ -void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { ktime_t uninitialized_var(starttime), delta, endtime; - if (!running) + if (!domain) return; if (initcall_debug && system_state == SYSTEM_BOOTING) { @@ -330,7 +330,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain starttime = ktime_get(); } - wait_event(async_done, lowest_in_progress(running) >= cookie); + wait_event(async_done, lowest_in_progress(domain) >= cookie); if (initcall_debug && system_state == SYSTEM_BOOTING) { endtime = ktime_get(); @@ -352,7 +352,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); */ void async_synchronize_cookie(async_cookie_t cookie) { - async_synchronize_cookie_domain(cookie, &async_running); + async_synchronize_cookie_domain(cookie, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); -- cgit v1.2.3 From c68eee14ec2da345e86f2778c8570759309a4a2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Jan 2013 09:32:30 -0800 Subject: async: use ULLONG_MAX for infinity cookie value Currently, next_cookie is used as the infinity value. In most cases, this should work fine but it theoretically could bring subtle behavior difference between async_synchronize_full() and async_synchronize_full_domain(). async_synchronize_full() keeps waiting until there's no registered async_entry left regardless of what next_cookie was when the function was called. It guarantees that the queue is completely drained at least once before returning. However, async_synchronize_full_domain() doesn't. It synchronizes upto next_cookie and if further async jobs are queued after the next_cookie value to synchronize is decided, they won't be waited for. For unrelated async jobs, the behavior difference doesn't matter; however, if async jobs which are related (nested or otherwise) to the executing ones are queued while sychronization is in progress, the resulting behavior difference could be problematic. This can be easily fixed by using ULLONG_MAX as the infinity value instead. Define ASYNC_COOKIE_MAX as ULLONG_MAX and use it as the infinity value for synchronization. This makes async_synchronize_full_domain() fully drain the domain at least once before returning, making its behavior match async_synchronize_full(). Signed-off-by: Tejun Heo Cc: Arjan van de Ven Cc: Dan Williams Cc: Linus Torvalds --- kernel/async.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 29d51d483bee..a4c1a9e63b2e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -61,7 +61,8 @@ asynchronous and synchronous parts of the kernel. static async_cookie_t next_cookie = 1; -#define MAX_WORK 32768 +#define MAX_WORK 32768 +#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */ static LIST_HEAD(async_pending); static ASYNC_DOMAIN(async_dfl_domain); @@ -88,8 +89,8 @@ static atomic_t entry_count; */ static async_cookie_t __lowest_in_progress(struct async_domain *domain) { - async_cookie_t first_running = next_cookie; /* infinity value */ - async_cookie_t first_pending = next_cookie; /* ditto */ + async_cookie_t first_running = ASYNC_COOKIE_MAX; + async_cookie_t first_pending = ASYNC_COOKIE_MAX; struct async_entry *entry; /* @@ -269,7 +270,7 @@ void async_synchronize_full(void) domain = list_first_entry(&async_domains, typeof(*domain), node); spin_unlock_irq(&async_lock); - async_synchronize_cookie_domain(next_cookie, domain); + async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain); } while (!list_empty(&async_domains)); mutex_unlock(&async_register_mutex); } @@ -305,7 +306,7 @@ EXPORT_SYMBOL_GPL(async_unregister_domain); */ void async_synchronize_full_domain(struct async_domain *domain) { - async_synchronize_cookie_domain(next_cookie, domain); + async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); -- cgit v1.2.3 From 52722794d6a48162fd8906d54618ae60a4abdb21 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Jan 2013 09:32:30 -0800 Subject: async: keep pending tasks on async_domain and remove async_pending Async kept single global pending list and per-domain running lists. When an async item is queued, it's put on the global pending list. The item is moved to the per-domain running list when its execution starts. At this point, this design complicates execution and synchronization without bringing any benefit. The list only matters for synchronization which doesn't care whether a given async item is pending or executing. Also, global synchronization is done by iterating through all active registered async_domains, so the global async_pending list doesn't help anything either. Rename async_domain->running to async_domain->pending and put async items directly there and remove when execution completes. This simplifies lowest_in_progress() a lot - the first item on the pending list is the one with the lowest cookie, and async_run_entry_fn() doesn't have to mess with moving the item from pending to running. After the change, whether a domain is empty or not can be trivially determined by looking at async_domain->pending. Remove async_domain->count and use list_empty() on pending instead. Signed-off-by: Tejun Heo Cc: Arjan van de Ven Cc: Dan Williams Cc: Linus Torvalds --- kernel/async.c | 63 +++++++++++++--------------------------------------------- 1 file changed, 14 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index a4c1a9e63b2e..7c9f50f436d6 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -64,7 +64,6 @@ static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 #define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */ -static LIST_HEAD(async_pending); static ASYNC_DOMAIN(async_dfl_domain); static LIST_HEAD(async_domains); static DEFINE_SPINLOCK(async_lock); @@ -83,42 +82,17 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; - -/* - * MUST be called with the lock held! - */ -static async_cookie_t __lowest_in_progress(struct async_domain *domain) -{ - async_cookie_t first_running = ASYNC_COOKIE_MAX; - async_cookie_t first_pending = ASYNC_COOKIE_MAX; - struct async_entry *entry; - - /* - * Both running and pending lists are sorted but not disjoint. - * Take the first cookies from both and return the min. - */ - if (!list_empty(&domain->running)) { - entry = list_first_entry(&domain->running, typeof(*entry), list); - first_running = entry->cookie; - } - - list_for_each_entry(entry, &async_pending, list) { - if (entry->domain == domain) { - first_pending = entry->cookie; - break; - } - } - - return min(first_running, first_pending); -} - static async_cookie_t lowest_in_progress(struct async_domain *domain) { + async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; - async_cookie_t ret; spin_lock_irqsave(&async_lock, flags); - ret = __lowest_in_progress(domain); + if (!list_empty(&domain->pending)) { + struct async_entry *first = list_first_entry(&domain->pending, + struct async_entry, list); + ret = first->cookie; + } spin_unlock_irqrestore(&async_lock, flags); return ret; } @@ -130,20 +104,11 @@ static void async_run_entry_fn(struct work_struct *work) { struct async_entry *entry = container_of(work, struct async_entry, work); - struct async_entry *pos; unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; struct async_domain *domain = entry->domain; - /* 1) move self to the running queue, make sure it stays sorted */ - spin_lock_irqsave(&async_lock, flags); - list_for_each_entry_reverse(pos, &domain->running, list) - if (entry->cookie < pos->cookie) - break; - list_move_tail(&entry->list, &pos->list); - spin_unlock_irqrestore(&async_lock, flags); - - /* 2) run (and print duration) */ + /* 1) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "calling %lli_%pF @ %i\n", (long long)entry->cookie, @@ -160,19 +125,19 @@ static void async_run_entry_fn(struct work_struct *work) (long long)ktime_to_ns(delta) >> 10); } - /* 3) remove self from the running queue */ + /* 2) remove self from the pending queues */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); - if (domain->registered && --domain->count == 0) + if (domain->registered && list_empty(&domain->pending)) list_del_init(&domain->node); - /* 4) free the entry */ + /* 3) free the entry */ kfree(entry); atomic_dec(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* 5) wake up any waiters */ + /* 4) wake up any waiters */ wake_up(&async_done); } @@ -206,9 +171,9 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a spin_lock_irqsave(&async_lock, flags); newcookie = entry->cookie = next_cookie++; - list_add_tail(&entry->list, &async_pending); - if (domain->registered && domain->count++ == 0) + if (domain->registered && list_empty(&domain->pending)) list_add_tail(&domain->node, &async_domains); + list_add_tail(&entry->list, &domain->pending); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); @@ -290,7 +255,7 @@ void async_unregister_domain(struct async_domain *domain) mutex_lock(&async_register_mutex); spin_lock_irq(&async_lock); WARN_ON(!domain->registered || !list_empty(&domain->node) || - !list_empty(&domain->running)); + !list_empty(&domain->pending)); domain->registered = 0; spin_unlock_irq(&async_lock); mutex_unlock(&async_register_mutex); -- cgit v1.2.3 From 9fdb04cdc5566d6ba68283a0bebe49667ca0b0e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Jan 2013 09:32:30 -0800 Subject: async: replace list of active domains with global list of pending items Global synchronization - async_synchronize_full() - is currently implemented by keeping a list of all active registered domains and syncing them one by one until no domain is active. While this isn't necessarily a complex scheme, it can easily be simplified by keeping global list of the pending items of all registered active domains instead of list of domains and simply using the globl pending list the same way as domain syncing. This patch replaces async_domains with async_global_pending and update lowest_in_progress() to use the global pending list if @domain is %NULL. async_synchronize_full_domain(NULL) is now allowed and equivalent to async_synchronize_full(). As no one is calling with NULL domain, this doesn't affect any existing users. async_register_mutex is no longer necessary and dropped. Signed-off-by: Tejun Heo Cc: Arjan van de Ven Cc: Dan Williams Cc: Linus Torvalds --- kernel/async.c | 63 +++++++++++++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 7c9f50f436d6..6958000eeb44 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -64,13 +64,13 @@ static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 #define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */ +static LIST_HEAD(async_global_pending); /* pending from all registered doms */ static ASYNC_DOMAIN(async_dfl_domain); -static LIST_HEAD(async_domains); static DEFINE_SPINLOCK(async_lock); -static DEFINE_MUTEX(async_register_mutex); struct async_entry { - struct list_head list; + struct list_head domain_list; + struct list_head global_list; struct work_struct work; async_cookie_t cookie; async_func_ptr *func; @@ -84,15 +84,25 @@ static atomic_t entry_count; static async_cookie_t lowest_in_progress(struct async_domain *domain) { + struct async_entry *first = NULL; async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; spin_lock_irqsave(&async_lock, flags); - if (!list_empty(&domain->pending)) { - struct async_entry *first = list_first_entry(&domain->pending, - struct async_entry, list); - ret = first->cookie; + + if (domain) { + if (!list_empty(&domain->pending)) + first = list_first_entry(&domain->pending, + struct async_entry, domain_list); + } else { + if (!list_empty(&async_global_pending)) + first = list_first_entry(&async_global_pending, + struct async_entry, global_list); } + + if (first) + ret = first->cookie; + spin_unlock_irqrestore(&async_lock, flags); return ret; } @@ -106,7 +116,6 @@ static void async_run_entry_fn(struct work_struct *work) container_of(work, struct async_entry, work); unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; - struct async_domain *domain = entry->domain; /* 1) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { @@ -127,9 +136,8 @@ static void async_run_entry_fn(struct work_struct *work) /* 2) remove self from the pending queues */ spin_lock_irqsave(&async_lock, flags); - list_del(&entry->list); - if (domain->registered && list_empty(&domain->pending)) - list_del_init(&domain->node); + list_del_init(&entry->domain_list); + list_del_init(&entry->global_list); /* 3) free the entry */ kfree(entry); @@ -170,10 +178,14 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a entry->domain = domain; spin_lock_irqsave(&async_lock, flags); + + /* allocate cookie and queue */ newcookie = entry->cookie = next_cookie++; - if (domain->registered && list_empty(&domain->pending)) - list_add_tail(&domain->node, &async_domains); - list_add_tail(&entry->list, &domain->pending); + + list_add_tail(&entry->domain_list, &domain->pending); + if (domain->registered) + list_add_tail(&entry->global_list, &async_global_pending); + atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); @@ -226,18 +238,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); */ void async_synchronize_full(void) { - mutex_lock(&async_register_mutex); - do { - struct async_domain *domain = NULL; - - spin_lock_irq(&async_lock); - if (!list_empty(&async_domains)) - domain = list_first_entry(&async_domains, typeof(*domain), node); - spin_unlock_irq(&async_lock); - - async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain); - } while (!list_empty(&async_domains)); - mutex_unlock(&async_register_mutex); + async_synchronize_full_domain(NULL); } EXPORT_SYMBOL_GPL(async_synchronize_full); @@ -252,13 +253,10 @@ EXPORT_SYMBOL_GPL(async_synchronize_full); */ void async_unregister_domain(struct async_domain *domain) { - mutex_lock(&async_register_mutex); spin_lock_irq(&async_lock); - WARN_ON(!domain->registered || !list_empty(&domain->node) || - !list_empty(&domain->pending)); + WARN_ON(!domain->registered || !list_empty(&domain->pending)); domain->registered = 0; spin_unlock_irq(&async_lock); - mutex_unlock(&async_register_mutex); } EXPORT_SYMBOL_GPL(async_unregister_domain); @@ -278,7 +276,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); /** * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing * @cookie: async_cookie_t to use as checkpoint - * @domain: the domain to synchronize + * @domain: the domain to synchronize (%NULL for all registered domains) * * This function waits until all asynchronous function calls for the * synchronization domain specified by @domain submitted prior to @cookie @@ -288,9 +286,6 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain { ktime_t uninitialized_var(starttime), delta, endtime; - if (!domain) - return; - if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); -- cgit v1.2.3 From d41032a83b4683481cadff84bbf8e0eafeaba830 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 24 Jan 2013 07:52:34 -0500 Subject: tracing: Fix unsigned int compare of zero in recursion check Dan's smatch found a compare bug with the result of the trace_test_and_set_recursion() and comparing to less than zero. If the function fails, it returns -1, but was saved in an unsigned int, which will never be less than zero and will ignore the result of the test if a recursion did happen. Luckily this is the last of the recursion tests, as the infrastructure of ftrace would catch recursions before it got here, except for some few exceptions. Reported-by: Dan Carpenter Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 1c327ef13a9a..601152523326 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -61,7 +61,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = func_trace; struct trace_array_cpu *data; unsigned long flags; - unsigned int bit; + int bit; int cpu; int pc; -- cgit v1.2.3 From a59f4e079d19464eebb9b06513a1d4f55fdae5ba Mon Sep 17 00:00:00 2001 From: Zhu Yanhai Date: Tue, 8 Jan 2013 12:56:52 +0800 Subject: sched: Fix the broken sched_rr_get_interval() The caller of sched_sliced() should pass se.cfs_rq and se as the arguments, however in sched_rr_get_interval() we gave it rq.cfs_rq and se, which made the following computation obviously wrong. The change was introduced by commit: 77034937dc45 sched: fix crash in sys_sched_rr_get_interval() ... 5 years ago, while it had been the correct 'cfs_rq_of' before the commit. The change seems to be irrelevant to the commit msg, which was to return a 0 timeslice for tasks that are on an idle runqueue. So I believe that was just a plain typo. Signed-off-by: Zhu Yanhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1357621012-15039-1-git-send-email-gaoyang.zyh@taobao.com [ Since this is an ABI and an old bug, we'll test this via a slow upstream route, to hopefully discover any app breakage. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5eea8707234a..a7a19ffc3b7e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6101,7 +6101,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) - rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); return rr_interval; } -- cgit v1.2.3 From ba6fdda46b377034c782c0b89c8f1090b31eabd8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 22 Dec 2012 16:59:51 +0100 Subject: profiling: Remove unused timer hook The last remaining user was oprofile and its use has been removed a while ago in commit bc078e4eab65f11bba ("oprofile: convert oprofile from timer_hook to hrtimer"). There doesn't seem to be any upstream user of this hook for about two years now. And I'm not even aware of any out of tree user. Let's remove it. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1356191991-2251-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/profile.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 1f391819c42f..dc3384ee874e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -37,9 +37,6 @@ struct profile_hit { #define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) -/* Oprofile timer tick hook */ -static int (*timer_hook)(struct pt_regs *) __read_mostly; - static atomic_t *prof_buffer; static unsigned long prof_len, prof_shift; @@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n) } EXPORT_SYMBOL_GPL(profile_event_unregister); -int register_timer_hook(int (*hook)(struct pt_regs *)) -{ - if (timer_hook) - return -EBUSY; - timer_hook = hook; - return 0; -} -EXPORT_SYMBOL_GPL(register_timer_hook); - -void unregister_timer_hook(int (*hook)(struct pt_regs *)) -{ - WARN_ON(hook != timer_hook); - timer_hook = NULL; - /* make sure all CPUs see the NULL hook */ - synchronize_sched(); /* Allow ongoing interrupts to complete. */ -} -EXPORT_SYMBOL_GPL(unregister_timer_hook); - - #ifdef CONFIG_SMP /* * Each cpu has a pair of open-addressed hashtables for pending @@ -436,8 +414,6 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (type == CPU_PROFILING && timer_hook) - timer_hook(regs); if (!user_mode(regs) && prof_cpu_mask != NULL && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); -- cgit v1.2.3 From 1158ddb55416855fd17abe3214298f736f00426a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 23 Nov 2012 00:02:15 +0400 Subject: sched/rt: Add reschedule check to switched_from_rt() Reschedule rq->curr if the first RT task has just been pulled to the rq. Signed-off-by: Kirill V Tkhai Acked-by: Steven Rostedt Cc: Peter Zijlstra Cc: Tkhai Kirill Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/118761353614535@web28f.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 418feb01344e..29bda5bdf2a5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1889,8 +1889,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (p->on_rq && !rq->rt.rt_nr_running) - pull_rt_task(rq); + if (!p->on_rq || rq->rt.rt_nr_running) + return; + + if (pull_rt_task(rq)) + resched_task(rq->curr); } void init_sched_rt_class(void) -- cgit v1.2.3 From 51906e779f2b13b38f8153774c4c7163d412ffd9 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 19 Nov 2012 16:01:29 +0100 Subject: x86/MSI: Support multiple MSIs in presense of IRQ remapping The MSI specification has several constraints in comparison with MSI-X, most notable of them is the inability to configure MSIs independently. As a result, it is impossible to dispatch interrupts from different queues to different CPUs. This is largely devalues the support of multiple MSIs in SMP systems. Also, a necessity to allocate a contiguous block of vector numbers for devices capable of multiple MSIs might cause a considerable pressure on x86 interrupt vector allocator and could lead to fragmentation of the interrupt vectors space. This patch overcomes both drawbacks in presense of IRQ remapping and lets devices take advantage of multiple queues and per-IRQ affinity assignments. Signed-off-by: Alexander Gordeev Cc: Bjorn Helgaas Cc: Suresh Siddha Cc: Yinghai Lu Cc: Matthew Wilcox Cc: Jeff Garzik Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c8bd86ff56b5fc118257436768aaa04489ac0a4c.1353324359.git.agordeev@redhat.com Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3aca9f29d30e..cbd97ce0b000 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -90,26 +90,40 @@ int irq_set_handler_data(unsigned int irq, void *data) EXPORT_SYMBOL(irq_set_handler_data); /** - * irq_set_msi_desc - set MSI descriptor data for an irq - * @irq: Interrupt number - * @entry: Pointer to MSI descriptor data + * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset + * @irq_base: Interrupt number base + * @irq_offset: Interrupt number offset + * @entry: Pointer to MSI descriptor data * - * Set the MSI descriptor entry for an irq + * Set the MSI descriptor entry for an irq at offset */ -int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, + struct msi_desc *entry) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; desc->irq_data.msi_desc = entry; - if (entry) - entry->irq = irq; + if (entry && !irq_offset) + entry->irq = irq_base; irq_put_desc_unlock(desc, flags); return 0; } +/** + * irq_set_msi_desc - set MSI descriptor data for an irq + * @irq: Interrupt number + * @entry: Pointer to MSI descriptor data + * + * Set the MSI descriptor entry for an irq + */ +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) +{ + return irq_set_msi_desc_off(irq, 0, entry); +} + /** * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number -- cgit v1.2.3 From 16c8f1c72ece3871a6c93003cd888fc2d003a7eb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 8 Nov 2012 13:33:46 +0530 Subject: sched/fair: Set se->vruntime directly in place_entity() We are first storing the new vruntime in a variable and then storing it in se->vruntime. Simply update se->vruntime directly. Signed-off-by: Viresh Kumar Cc: linaro-dev@lists.linaro.org Cc: patches@linaro.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/ae59db1945518d6f6250920d46eb1f1a9cc0024e.1352361704.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a7a19ffc3b7e..8dbee9f4ceb2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1680,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } /* ensure we never gain time by being placed backwards. */ - vruntime = max_vruntime(se->vruntime, vruntime); - - se->vruntime = vruntime; + se->vruntime = max_vruntime(se->vruntime, vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -- cgit v1.2.3 From b5d646f5d5a135064232ff3a140a47a5b84bc911 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 24 Jan 2013 14:43:51 +0800 Subject: cgroup: remove a NULL check in cgroup_exit() init_task.cgroups is initialized at boot phase, and whenver a ask is forked, it's cgroups pointer is inherited from its parent, and it's never set to NULL afterwards. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad3359f903f0..8da9048078c5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4994,8 +4994,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } task_unlock(tsk); - if (cg) - put_css_set_taskexit(cg); + put_css_set_taskexit(cg); } /** -- cgit v1.2.3 From e2905b29122173b72b612c962b138e3fa07476b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:32 -0800 Subject: workqueue: unexport work_cpu() This function no longer has any external users. Unexport it. It will be removed later on. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2ffa240052fa..fe0745f54fcd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -449,6 +449,7 @@ static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { }; static int worker_thread(void *__worker); +static unsigned int work_cpu(struct work_struct *work); static int worker_pool_pri(struct worker_pool *pool) { @@ -3419,13 +3420,12 @@ EXPORT_SYMBOL_GPL(workqueue_congested); * RETURNS: * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. */ -unsigned int work_cpu(struct work_struct *work) +static unsigned int work_cpu(struct work_struct *work) { struct global_cwq *gcwq = get_work_gcwq(work); return gcwq ? gcwq->cpu : WORK_CPU_NONE; } -EXPORT_SYMBOL_GPL(work_cpu); /** * work_busy - test whether a work is currently pending or running -- cgit v1.2.3 From e34cdddb03bdfe98f20c58934fd4c45019f13ae5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: use std_ prefix for the standard per-cpu pools There are currently two worker pools per cpu (including the unbound cpu) and they are the only pools in use. New class of pools are scheduled to be added and some pool related APIs will be added inbetween. Call the existing pools the standard pools and prefix them with std_. Do this early so that new APIs can use std_ prefix from the beginning. This patch doesn't introduce any functional difference. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fe0745f54fcd..634251572fdd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -80,7 +80,7 @@ enum { WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | WORKER_CPU_INTENSIVE, - NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ + NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ @@ -156,7 +156,7 @@ struct global_cwq { DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ - struct worker_pool pools[NR_WORKER_POOLS]; + struct worker_pool pools[NR_STD_WORKER_POOLS]; /* normal and highpri pools */ } ____cacheline_aligned_in_smp; @@ -255,7 +255,7 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); #define for_each_worker_pool(pool, gcwq) \ for ((pool) = &(gcwq)->pools[0]; \ - (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) + (pool) < &(gcwq)->pools[NR_STD_WORKER_POOLS]; (pool)++) #define for_each_busy_worker(worker, i, pos, gcwq) \ hash_for_each(gcwq->busy_hash, i, pos, worker, hentry) @@ -436,7 +436,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ * try_to_wake_up(). Put it in a separate cacheline. */ static DEFINE_PER_CPU(struct global_cwq, global_cwq); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_STD_WORKER_POOLS]); /* * Global cpu workqueue and nr_running counter for unbound gcwq. The @@ -444,14 +444,14 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]) * workers have WORKER_UNBOUND set. */ static struct global_cwq unbound_global_cwq; -static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { - [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ +static atomic_t unbound_pool_nr_running[NR_STD_WORKER_POOLS] = { + [0 ... NR_STD_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ }; static int worker_thread(void *__worker); static unsigned int work_cpu(struct work_struct *work); -static int worker_pool_pri(struct worker_pool *pool) +static int std_worker_pool_pri(struct worker_pool *pool) { return pool - pool->gcwq->pools; } @@ -467,7 +467,7 @@ static struct global_cwq *get_gcwq(unsigned int cpu) static atomic_t *get_pool_nr_running(struct worker_pool *pool) { int cpu = pool->gcwq->cpu; - int idx = worker_pool_pri(pool); + int idx = std_worker_pool_pri(pool); if (cpu != WORK_CPU_UNBOUND) return &per_cpu(pool_nr_running, cpu)[idx]; @@ -1688,7 +1688,7 @@ static void rebind_workers(struct global_cwq *gcwq) * wq doesn't really matter but let's keep @worker->pool * and @cwq->pool consistent for sanity. */ - if (worker_pool_pri(worker->pool)) + if (std_worker_pool_pri(worker->pool)) wq = system_highpri_wq; else wq = system_wq; @@ -1731,7 +1731,7 @@ static struct worker *alloc_worker(void) static struct worker *create_worker(struct worker_pool *pool) { struct global_cwq *gcwq = pool->gcwq; - const char *pri = worker_pool_pri(pool) ? "H" : ""; + const char *pri = std_worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; @@ -1761,7 +1761,7 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; - if (worker_pool_pri(pool)) + if (std_worker_pool_pri(pool)) set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); /* -- cgit v1.2.3 From 2464757086b4de0591738d5e30f069d068d70ec0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: make GCWQ_DISASSOCIATED a pool flag Make GCWQ_DISASSOCIATED a pool flag POOL_DISASSOCIATED. This patch doesn't change locking - DISASSOCIATED on both pools of a CPU are set or clear together while holding gcwq->lock. It shouldn't cause any functional difference. This is part of an effort to remove global_cwq and make worker_pool the top level abstraction, which in turn will help implementing worker pools with user-specified attributes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 66 +++++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 634251572fdd..1b8af92cc2c9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,26 +48,28 @@ enum { /* * global_cwq flags + */ + GCWQ_FREEZING = 1 << 1, /* freeze in progress */ + + /* + * worker_pool flags * - * A bound gcwq is either associated or disassociated with its CPU. + * A bound pool is either associated or disassociated with its CPU. * While associated (!DISASSOCIATED), all workers are bound to the * CPU and none has %WORKER_UNBOUND set and concurrency management * is in effect. * * While DISASSOCIATED, the cpu may be offline and all workers have * %WORKER_UNBOUND set and concurrency management disabled, and may - * be executing on any CPU. The gcwq behaves as an unbound one. + * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED can be flipped only while holding - * assoc_mutex of all pools on the gcwq to avoid changing binding - * state while create_worker() is in progress. + * assoc_mutex to avoid changing binding state while + * create_worker() is in progress. */ - GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ - GCWQ_FREEZING = 1 << 1, /* freeze in progress */ - - /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ + POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -115,7 +117,7 @@ enum { * X: During normal operation, modification requires gcwq->lock and * should be done only from local cpu. Either disabling preemption * on local cpu or grabbing gcwq->lock is enough for read access. - * If GCWQ_DISASSOCIATED is set, it's identical to L. + * If POOL_DISASSOCIATED is set, it's identical to L. * * F: wq->flush_mutex protected. * @@ -138,7 +140,7 @@ struct worker_pool { struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ - struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ + struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ }; @@ -439,9 +441,9 @@ static DEFINE_PER_CPU(struct global_cwq, global_cwq); static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_STD_WORKER_POOLS]); /* - * Global cpu workqueue and nr_running counter for unbound gcwq. The - * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its - * workers have WORKER_UNBOUND set. + * Global cpu workqueue and nr_running counter for unbound gcwq. The pools + * for online CPUs have POOL_DISASSOCIATED set, and all their workers have + * WORKER_UNBOUND set. */ static struct global_cwq unbound_global_cwq; static atomic_t unbound_pool_nr_running[NR_STD_WORKER_POOLS] = { @@ -1474,7 +1476,6 @@ EXPORT_SYMBOL_GPL(mod_delayed_work); static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; BUG_ON(worker->flags & WORKER_IDLE); BUG_ON(!list_empty(&worker->entry) && @@ -1497,7 +1498,7 @@ static void worker_enter_idle(struct worker *worker) * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. */ - WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && + WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && pool->nr_workers == pool->nr_idle && atomic_read(get_pool_nr_running(pool))); } @@ -1538,7 +1539,7 @@ static void worker_leave_idle(struct worker *worker) * [dis]associated in the meantime. * * This function tries set_cpus_allowed() and locks gcwq and verifies the - * binding against %GCWQ_DISASSOCIATED which is set during + * binding against %POOL_DISASSOCIATED which is set during * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker * enters idle state or fetches works without dropping lock, it can * guarantee the scheduling requirement described in the first paragraph. @@ -1554,7 +1555,8 @@ static void worker_leave_idle(struct worker *worker) static bool worker_maybe_bind_and_lock(struct worker *worker) __acquires(&gcwq->lock) { - struct global_cwq *gcwq = worker->pool->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; struct task_struct *task = worker->task; while (true) { @@ -1562,13 +1564,13 @@ __acquires(&gcwq->lock) * The following call may fail, succeed or succeed * without actually migrating the task to the cpu if * it races with cpu hotunplug operation. Verify - * against GCWQ_DISASSOCIATED. + * against POOL_DISASSOCIATED. */ - if (!(gcwq->flags & GCWQ_DISASSOCIATED)) + if (!(pool->flags & POOL_DISASSOCIATED)) set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); spin_lock_irq(&gcwq->lock); - if (gcwq->flags & GCWQ_DISASSOCIATED) + if (pool->flags & POOL_DISASSOCIATED) return false; if (task_cpu(task) == gcwq->cpu && cpumask_equal(¤t->cpus_allowed, @@ -1766,14 +1768,14 @@ static struct worker *create_worker(struct worker_pool *pool) /* * Determine CPU binding of the new worker depending on - * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the + * %POOL_DISASSOCIATED. The caller is responsible for ensuring the * flag remains stable across this function. See the comments * above the flag definition for details. * * As an unbound worker may later become a regular one if CPU comes * online, make sure every worker has %PF_THREAD_BOUND set. */ - if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { + if (!(pool->flags & POOL_DISASSOCIATED)) { kthread_bind(worker->task, gcwq->cpu); } else { worker->task->flags |= PF_THREAD_BOUND; @@ -2134,10 +2136,10 @@ __acquires(&gcwq->lock) /* * Ensure we're on the correct CPU. DISASSOCIATED test is * necessary to avoid spurious warnings from rescuers servicing the - * unbound or a disassociated gcwq. + * unbound or a disassociated pool. */ WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && - !(gcwq->flags & GCWQ_DISASSOCIATED) && + !(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != gcwq->cpu); /* @@ -3472,7 +3474,7 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be disassociated from the CPU + * This is solved by allowing the pools to be disassociated from the CPU * running as an unbound one and allowing it to be reattached later if the * cpu comes back online. */ @@ -3522,7 +3524,8 @@ static void gcwq_unbind_fn(struct work_struct *work) for_each_busy_worker(worker, i, pos, gcwq) worker->flags |= WORKER_UNBOUND; - gcwq->flags |= GCWQ_DISASSOCIATED; + for_each_worker_pool(pool, gcwq) + pool->flags |= POOL_DISASSOCIATED; gcwq_release_assoc_and_unlock(gcwq); @@ -3581,7 +3584,8 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: gcwq_claim_assoc_and_lock(gcwq); - gcwq->flags &= ~GCWQ_DISASSOCIATED; + for_each_worker_pool(pool, gcwq) + pool->flags &= ~POOL_DISASSOCIATED; rebind_workers(gcwq); gcwq_release_assoc_and_unlock(gcwq); break; @@ -3806,12 +3810,12 @@ static int __init init_workqueues(void) spin_lock_init(&gcwq->lock); gcwq->cpu = cpu; - gcwq->flags |= GCWQ_DISASSOCIATED; hash_init(gcwq->busy_hash); for_each_worker_pool(pool, gcwq) { pool->gcwq = gcwq; + pool->flags |= POOL_DISASSOCIATED; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); @@ -3832,12 +3836,12 @@ static int __init init_workqueues(void) struct global_cwq *gcwq = get_gcwq(cpu); struct worker_pool *pool; - if (cpu != WORK_CPU_UNBOUND) - gcwq->flags &= ~GCWQ_DISASSOCIATED; - for_each_worker_pool(pool, gcwq) { struct worker *worker; + if (cpu != WORK_CPU_UNBOUND) + pool->flags &= ~POOL_DISASSOCIATED; + worker = create_worker(pool); BUG_ON(!worker); spin_lock_irq(&gcwq->lock); -- cgit v1.2.3 From 35b6bb63b8a288f90e07948867941a553b3d97bc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: make GCWQ_FREEZING a pool flag Make GCWQ_FREEZING a pool flag POOL_FREEZING. This patch doesn't change locking - FREEZING on both pools of a CPU are set or clear together while holding gcwq->lock. It shouldn't cause any functional difference. This leaves gcwq->flags w/o any flags. Removed. While at it, convert BUG_ON()s in freeze_workqueue_begin() and thaw_workqueues() to WARN_ON_ONCE(). This is part of an effort to remove global_cwq and make worker_pool the top level abstraction, which in turn will help implementing worker pools with user-specified attributes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1b8af92cc2c9..1a686e481132 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -46,11 +46,6 @@ #include "workqueue_internal.h" enum { - /* - * global_cwq flags - */ - GCWQ_FREEZING = 1 << 1, /* freeze in progress */ - /* * worker_pool flags * @@ -70,6 +65,7 @@ enum { POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + POOL_FREEZING = 1 << 3, /* freeze in progress */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -152,7 +148,6 @@ struct worker_pool { struct global_cwq { spinlock_t lock; /* the gcwq lock */ unsigned int cpu; /* I: the associated cpu */ - unsigned int flags; /* L: GCWQ_* flags */ /* workers are chained either in busy_hash or pool idle_list */ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); @@ -3380,13 +3375,15 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->saved_max_active = max_active; for_each_cwq_cpu(cpu, wq) { - struct global_cwq *gcwq = get_gcwq(cpu); + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct worker_pool *pool = cwq->pool; + struct global_cwq *gcwq = pool->gcwq; spin_lock_irq(&gcwq->lock); if (!(wq->flags & WQ_FREEZABLE) || - !(gcwq->flags & GCWQ_FREEZING)) - cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); + !(pool->flags & POOL_FREEZING)) + cwq_set_max_active(cwq, max_active); spin_unlock_irq(&gcwq->lock); } @@ -3676,12 +3673,15 @@ void freeze_workqueues_begin(void) for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; struct workqueue_struct *wq; spin_lock_irq(&gcwq->lock); - BUG_ON(gcwq->flags & GCWQ_FREEZING); - gcwq->flags |= GCWQ_FREEZING; + for_each_worker_pool(pool, gcwq) { + WARN_ON_ONCE(pool->flags & POOL_FREEZING); + pool->flags |= POOL_FREEZING; + } list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); @@ -3767,8 +3767,10 @@ void thaw_workqueues(void) spin_lock_irq(&gcwq->lock); - BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); - gcwq->flags &= ~GCWQ_FREEZING; + for_each_worker_pool(pool, gcwq) { + WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); + pool->flags &= ~POOL_FREEZING; + } list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); -- cgit v1.2.3 From 715b06b864c99a18cb8368dfb187da4f569788cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: introduce WORK_OFFQ_CPU_NONE Currently, when a work item is off queue, high bits of its data encodes the last CPU it was on. This is scheduled to be changed to pool ID, which will make it impossible to use WORK_CPU_NONE to indicate no association. This patch limits the number of bits which are used for off-queue cpu number to 31 (so that the max fits in an int) and uses the highest possible value - WORK_OFFQ_CPU_NONE - to indicate no association. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1a686e481132..405282d30046 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -573,7 +573,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; cpu = data >> WORK_OFFQ_CPU_SHIFT; - if (cpu == WORK_CPU_NONE) + if (cpu == WORK_OFFQ_CPU_NONE) return NULL; BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); @@ -583,7 +583,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) static void mark_work_canceling(struct work_struct *work) { struct global_cwq *gcwq = get_work_gcwq(work); - unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; + unsigned long cpu = gcwq ? gcwq->cpu : WORK_OFFQ_CPU_NONE; set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING); -- cgit v1.2.3 From 9daf9e678d18585433a4ad90ec51a448e5fd054c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: add worker_pool->id Add worker_pool->id which is allocated from worker_pool_idr. This will be used to record the last associated worker_pool in work->data. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 405282d30046..9c6ad974bb9e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -124,6 +124,7 @@ enum { struct worker_pool { struct global_cwq *gcwq; /* I: the owning gcwq */ + int id; /* I: pool ID */ unsigned int flags; /* X: flags */ struct list_head worklist; /* L: list of pending works */ @@ -445,6 +446,10 @@ static atomic_t unbound_pool_nr_running[NR_STD_WORKER_POOLS] = { [0 ... NR_STD_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ }; +/* idr of all pools */ +static DEFINE_MUTEX(worker_pool_idr_mutex); +static DEFINE_IDR(worker_pool_idr); + static int worker_thread(void *__worker); static unsigned int work_cpu(struct work_struct *work); @@ -461,6 +466,19 @@ static struct global_cwq *get_gcwq(unsigned int cpu) return &unbound_global_cwq; } +/* allocate ID and assign it to @pool */ +static int worker_pool_assign_id(struct worker_pool *pool) +{ + int ret; + + mutex_lock(&worker_pool_idr_mutex); + idr_pre_get(&worker_pool_idr, GFP_KERNEL); + ret = idr_get_new(&worker_pool_idr, pool, &pool->id); + mutex_unlock(&worker_pool_idr_mutex); + + return ret; +} + static atomic_t *get_pool_nr_running(struct worker_pool *pool) { int cpu = pool->gcwq->cpu; @@ -3830,6 +3848,9 @@ static int __init init_workqueues(void) mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); + + /* alloc pool ID */ + BUG_ON(worker_pool_assign_id(pool)); } } -- cgit v1.2.3 From 7c3eed5cd60d0f736516e6ade77d90c6255860bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: record pool ID instead of CPU in work->data when off-queue Currently, when a work item is off-queue, work->data records the CPU it was last on, which is used to locate the last executing instance for non-reentrance, flushing, etc. We're in the process of removing global_cwq and making worker_pool the top level abstraction. This patch makes work->data point to the pool it was last associated with instead of CPU. After the previous WORK_OFFQ_POOL_CPU and worker_poo->id additions, the conversion is fairly straight-forward. WORK_OFFQ constants and functions are modified to record and read back pool ID instead. worker_pool_by_id() is added to allow looking up pool from ID. get_work_pool() replaces get_work_gcwq(), which is reimplemented using get_work_pool(). get_work_pool_id() replaces work_cpu(). This patch shouldn't introduce any observable behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 111 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 67 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9c6ad974bb9e..a4d7e3f0a874 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -451,7 +451,6 @@ static DEFINE_MUTEX(worker_pool_idr_mutex); static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); -static unsigned int work_cpu(struct work_struct *work); static int std_worker_pool_pri(struct worker_pool *pool) { @@ -479,6 +478,15 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } +/* + * Lookup worker_pool by id. The idr currently is built during boot and + * never modified. Don't worry about locking for now. + */ +static struct worker_pool *worker_pool_by_id(int pool_id) +{ + return idr_find(&worker_pool_idr, pool_id); +} + static atomic_t *get_pool_nr_running(struct worker_pool *pool) { int cpu = pool->gcwq->cpu; @@ -520,17 +528,17 @@ static int work_next_color(int color) /* * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data * contain the pointer to the queued cwq. Once execution starts, the flag - * is cleared and the high bits contain OFFQ flags and CPU number. + * is cleared and the high bits contain OFFQ flags and pool ID. * - * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() - * and clear_work_data() can be used to set the cwq, cpu or clear + * set_work_cwq(), set_work_pool_and_clear_pending(), mark_work_canceling() + * and clear_work_data() can be used to set the cwq, pool or clear * work->data. These functions should only be called while the work is * owned - ie. while the PENDING bit is set. * - * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to - * a work. gcwq is available once the work has been queued anywhere after - * initialization until it is sync canceled. cwq is available only while - * the work item is queued. + * get_work_pool() and get_work_cwq() can be used to obtain the pool or cwq + * corresponding to a work. Pool is available once the work has been + * queued anywhere after initialization until it is sync canceled. cwq is + * available only while the work item is queued. * * %WORK_OFFQ_CANCELING is used to mark a work item which is being * canceled. While being canceled, a work item may have its PENDING set @@ -552,8 +560,8 @@ static void set_work_cwq(struct work_struct *work, WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); } -static void set_work_cpu_and_clear_pending(struct work_struct *work, - unsigned int cpu) +static void set_work_pool_and_clear_pending(struct work_struct *work, + int pool_id) { /* * The following wmb is paired with the implied mb in @@ -562,13 +570,13 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work, * owner. */ smp_wmb(); - set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); + set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); } static void clear_work_data(struct work_struct *work) { - smp_wmb(); /* see set_work_cpu_and_clear_pending() */ - set_work_data(work, WORK_STRUCT_NO_CPU, 0); + smp_wmb(); /* see set_work_pool_and_clear_pending() */ + set_work_data(work, WORK_STRUCT_NO_POOL, 0); } static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) @@ -581,30 +589,58 @@ static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) return NULL; } -static struct global_cwq *get_work_gcwq(struct work_struct *work) +/** + * get_work_pool - return the worker_pool a given work was associated with + * @work: the work item of interest + * + * Return the worker_pool @work was last associated with. %NULL if none. + */ +static struct worker_pool *get_work_pool(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - unsigned int cpu; + struct worker_pool *pool; + int pool_id; if (data & WORK_STRUCT_CWQ) return ((struct cpu_workqueue_struct *) - (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; + (data & WORK_STRUCT_WQ_DATA_MASK))->pool; - cpu = data >> WORK_OFFQ_CPU_SHIFT; - if (cpu == WORK_OFFQ_CPU_NONE) + pool_id = data >> WORK_OFFQ_POOL_SHIFT; + if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; - BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); - return get_gcwq(cpu); + pool = worker_pool_by_id(pool_id); + WARN_ON_ONCE(!pool); + return pool; +} + +/** + * get_work_pool_id - return the worker pool ID a given work is associated with + * @work: the work item of interest + * + * Return the worker_pool ID @work was last associated with. + * %WORK_OFFQ_POOL_NONE if none. + */ +static int get_work_pool_id(struct work_struct *work) +{ + struct worker_pool *pool = get_work_pool(work); + + return pool ? pool->id : WORK_OFFQ_POOL_NONE; +} + +static struct global_cwq *get_work_gcwq(struct work_struct *work) +{ + struct worker_pool *pool = get_work_pool(work); + + return pool ? pool->gcwq : NULL; } static void mark_work_canceling(struct work_struct *work) { - struct global_cwq *gcwq = get_work_gcwq(work); - unsigned long cpu = gcwq ? gcwq->cpu : WORK_OFFQ_CPU_NONE; + unsigned long pool_id = get_work_pool_id(work); - set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, - WORK_STRUCT_PENDING); + pool_id <<= WORK_OFFQ_POOL_SHIFT; + set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING); } static bool work_is_canceling(struct work_struct *work) @@ -2192,12 +2228,12 @@ __acquires(&gcwq->lock) wake_up_worker(pool); /* - * Record the last CPU and clear PENDING which should be the last + * Record the last pool and clear PENDING which should be the last * update to @work. Also, do this inside @gcwq->lock so that * PENDING and queued state changes happen together while IRQ is * disabled. */ - set_work_cpu_and_clear_pending(work, gcwq->cpu); + set_work_pool_and_clear_pending(work, pool->id); spin_unlock_irq(&gcwq->lock); @@ -2967,7 +3003,8 @@ bool cancel_delayed_work(struct delayed_work *dwork) if (unlikely(ret < 0)) return false; - set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); + set_work_pool_and_clear_pending(&dwork->work, + get_work_pool_id(&dwork->work)); local_irq_restore(flags); return ret; } @@ -3430,20 +3467,6 @@ bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(workqueue_congested); -/** - * work_cpu - return the last known associated cpu for @work - * @work: the work of interest - * - * RETURNS: - * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. - */ -static unsigned int work_cpu(struct work_struct *work) -{ - struct global_cwq *gcwq = get_work_gcwq(work); - - return gcwq ? gcwq->cpu : WORK_CPU_NONE; -} - /** * work_busy - test whether a work is currently pending or running * @work: the work to be tested @@ -3816,9 +3839,9 @@ static int __init init_workqueues(void) { unsigned int cpu; - /* make sure we have enough bits for OFFQ CPU number */ - BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < - WORK_CPU_LAST); + /* make sure we have enough bits for OFFQ pool ID */ + BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < + WORK_CPU_LAST * NR_STD_WORKER_POOLS); cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); -- cgit v1.2.3 From c9e7cf273fa1876dee8effdb201a6f65eefab3a7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: move busy_hash from global_cwq to worker_pool There's no functional necessity for the two pools on the same CPU to share the busy hash table. It's also likely to be a bottleneck when implementing pools with user-specified attributes. This patch makes busy_hash per-pool. The conversion is mostly straight-forward. Changes worth noting are, * Large block of changes in rebind_workers() is moving the block inside for_each_worker_pool() as now there are separate hash tables for each pool. This changes the order of operations but doesn't break anything. * Thre for_each_worker_pool() loops in gcwq_unbind_fn() are combined into one. This again changes the order of operaitons but doesn't break anything. This is part of an effort to remove global_cwq and make worker_pool the top level abstraction, which in turn will help implementing worker pools with user-specified attributes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 111 ++++++++++++++++++++++++++++------------------------- 1 file changed, 59 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a4d7e3f0a874..99c30116d291 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -137,6 +137,10 @@ struct worker_pool { struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ + /* workers are chained either in busy_hash or idle_list */ + DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); + /* L: hash of busy workers */ + struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ }; @@ -150,10 +154,6 @@ struct global_cwq { spinlock_t lock; /* the gcwq lock */ unsigned int cpu; /* I: the associated cpu */ - /* workers are chained either in busy_hash or pool idle_list */ - DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); - /* L: hash of busy workers */ - struct worker_pool pools[NR_STD_WORKER_POOLS]; /* normal and highpri pools */ } ____cacheline_aligned_in_smp; @@ -255,8 +255,8 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); for ((pool) = &(gcwq)->pools[0]; \ (pool) < &(gcwq)->pools[NR_STD_WORKER_POOLS]; (pool)++) -#define for_each_busy_worker(worker, i, pos, gcwq) \ - hash_for_each(gcwq->busy_hash, i, pos, worker, hentry) +#define for_each_busy_worker(worker, i, pos, pool) \ + hash_for_each(pool->busy_hash, i, pos, worker, hentry) static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, unsigned int sw) @@ -892,11 +892,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) /** * find_worker_executing_work - find worker which is executing a work - * @gcwq: gcwq of interest + * @pool: pool of interest * @work: work to find worker for * - * Find a worker which is executing @work on @gcwq by searching - * @gcwq->busy_hash which is keyed by the address of @work. For a worker + * Find a worker which is executing @work on @pool by searching + * @pool->busy_hash which is keyed by the address of @work. For a worker * to match, its current execution should match the address of @work and * its work function. This is to avoid unwanted dependency between * unrelated work executions through a work item being recycled while still @@ -924,13 +924,13 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * Pointer to worker which is executing @work if found, NULL * otherwise. */ -static struct worker *find_worker_executing_work(struct global_cwq *gcwq, +static struct worker *find_worker_executing_work(struct worker_pool *pool, struct work_struct *work) { struct worker *worker; struct hlist_node *tmp; - hash_for_each_possible(gcwq->busy_hash, worker, tmp, hentry, + hash_for_each_possible(pool->busy_hash, worker, tmp, hentry, (unsigned long)work) if (worker->current_work == work && worker->current_func == work->func) @@ -1191,13 +1191,15 @@ static bool is_chained_work(struct workqueue_struct *wq) unsigned int cpu; for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct worker_pool *pool = cwq->pool; + struct global_cwq *gcwq = pool->gcwq; struct worker *worker; struct hlist_node *pos; int i; spin_lock_irqsave(&gcwq->lock, flags); - for_each_busy_worker(worker, i, pos, gcwq) { + for_each_busy_worker(worker, i, pos, pool) { if (worker->task != current) continue; spin_unlock_irqrestore(&gcwq->lock, flags); @@ -1238,7 +1240,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, /* determine gcwq to use */ if (!(wq->flags & WQ_UNBOUND)) { - struct global_cwq *last_gcwq; + struct worker_pool *last_pool; if (cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); @@ -1250,14 +1252,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, * non-reentrancy. */ gcwq = get_gcwq(cpu); - last_gcwq = get_work_gcwq(work); + last_pool = get_work_pool(work); - if (last_gcwq && last_gcwq != gcwq) { + if (last_pool && last_pool->gcwq != gcwq) { + struct global_cwq *last_gcwq = last_pool->gcwq; struct worker *worker; spin_lock(&last_gcwq->lock); - worker = find_worker_executing_work(last_gcwq, work); + worker = find_worker_executing_work(last_pool, work); if (worker && worker->current_cwq->wq == wq) gcwq = last_gcwq; @@ -1722,31 +1725,32 @@ static void rebind_workers(struct global_cwq *gcwq) */ wake_up_process(worker->task); } - } - /* rebind busy workers */ - for_each_busy_worker(worker, i, pos, gcwq) { - struct work_struct *rebind_work = &worker->rebind_work; - struct workqueue_struct *wq; + /* rebind busy workers */ + for_each_busy_worker(worker, i, pos, pool) { + struct work_struct *rebind_work = &worker->rebind_work; + struct workqueue_struct *wq; - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; - debug_work_activate(rebind_work); + debug_work_activate(rebind_work); - /* - * wq doesn't really matter but let's keep @worker->pool - * and @cwq->pool consistent for sanity. - */ - if (std_worker_pool_pri(worker->pool)) - wq = system_highpri_wq; - else - wq = system_wq; - - insert_work(get_cwq(gcwq->cpu, wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); + /* + * wq doesn't really matter but let's keep + * @worker->pool and @cwq->pool consistent for + * sanity. + */ + if (std_worker_pool_pri(worker->pool)) + wq = system_highpri_wq; + else + wq = system_wq; + + insert_work(get_cwq(gcwq->cpu, wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } } } @@ -2197,7 +2201,7 @@ __acquires(&gcwq->lock) * already processing the work. If so, defer the work to the * currently executing one. */ - collision = find_worker_executing_work(gcwq, work); + collision = find_worker_executing_work(pool, work); if (unlikely(collision)) { move_linked_works(work, &collision->scheduled, NULL); return; @@ -2205,7 +2209,7 @@ __acquires(&gcwq->lock) /* claim and dequeue */ debug_work_deactivate(work); - hash_add(gcwq->busy_hash, &worker->hentry, (unsigned long)work); + hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); worker->current_work = work; worker->current_func = work->func; worker->current_cwq = cwq; @@ -2833,13 +2837,15 @@ EXPORT_SYMBOL_GPL(drain_workqueue); static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) { struct worker *worker = NULL; + struct worker_pool *pool; struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; might_sleep(); - gcwq = get_work_gcwq(work); - if (!gcwq) + pool = get_work_pool(work); + if (!pool) return false; + gcwq = pool->gcwq; spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { @@ -2853,7 +2859,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) goto already_gone; } else { - worker = find_worker_executing_work(gcwq, work); + worker = find_worker_executing_work(pool, work); if (!worker) goto already_gone; cwq = worker->current_cwq; @@ -3482,18 +3488,20 @@ EXPORT_SYMBOL_GPL(workqueue_congested); */ unsigned int work_busy(struct work_struct *work) { - struct global_cwq *gcwq = get_work_gcwq(work); + struct worker_pool *pool = get_work_pool(work); + struct global_cwq *gcwq; unsigned long flags; unsigned int ret = 0; - if (!gcwq) + if (!pool) return 0; + gcwq = pool->gcwq; spin_lock_irqsave(&gcwq->lock, flags); if (work_pending(work)) ret |= WORK_BUSY_PENDING; - if (find_worker_executing_work(gcwq, work)) + if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; spin_unlock_irqrestore(&gcwq->lock, flags); @@ -3555,15 +3563,15 @@ static void gcwq_unbind_fn(struct work_struct *work) * ones which are still executing works from before the last CPU * down must be on the cpu. After this, they may become diasporas. */ - for_each_worker_pool(pool, gcwq) + for_each_worker_pool(pool, gcwq) { list_for_each_entry(worker, &pool->idle_list, entry) worker->flags |= WORKER_UNBOUND; - for_each_busy_worker(worker, i, pos, gcwq) - worker->flags |= WORKER_UNBOUND; + for_each_busy_worker(worker, i, pos, pool) + worker->flags |= WORKER_UNBOUND; - for_each_worker_pool(pool, gcwq) pool->flags |= POOL_DISASSOCIATED; + } gcwq_release_assoc_and_unlock(gcwq); @@ -3854,13 +3862,12 @@ static int __init init_workqueues(void) spin_lock_init(&gcwq->lock); gcwq->cpu = cpu; - hash_init(gcwq->busy_hash); - for_each_worker_pool(pool, gcwq) { pool->gcwq = gcwq; pool->flags |= POOL_DISASSOCIATED; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); + hash_init(pool->busy_hash); init_timer_deferrable(&pool->idle_timer); pool->idle_timer.function = idle_worker_timeout; -- cgit v1.2.3 From ec22ca5eab0bd225588c69ccd06b16504cb05adf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: move global_cwq->cpu to worker_pool Move gcwq->cpu to pool->cpu. This introduces a couple places where gcwq->pools[0].cpu is used. These will soon go away as gcwq is further reduced. This is part of an effort to remove global_cwq and make worker_pool the top level abstraction, which in turn will help implementing worker pools with user-specified attributes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 99c30116d291..366132bd226f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -124,6 +124,7 @@ enum { struct worker_pool { struct global_cwq *gcwq; /* I: the owning gcwq */ + unsigned int cpu; /* I: the associated cpu */ int id; /* I: pool ID */ unsigned int flags; /* X: flags */ @@ -152,7 +153,6 @@ struct worker_pool { */ struct global_cwq { spinlock_t lock; /* the gcwq lock */ - unsigned int cpu; /* I: the associated cpu */ struct worker_pool pools[NR_STD_WORKER_POOLS]; /* normal and highpri pools */ @@ -489,7 +489,7 @@ static struct worker_pool *worker_pool_by_id(int pool_id) static atomic_t *get_pool_nr_running(struct worker_pool *pool) { - int cpu = pool->gcwq->cpu; + int cpu = pool->cpu; int idx = std_worker_pool_pri(pool); if (cpu != WORK_CPU_UNBOUND) @@ -764,7 +764,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) struct worker *worker = kthread_data(task); if (!(worker->flags & WORKER_NOT_RUNNING)) { - WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); + WARN_ON_ONCE(worker->pool->cpu != cpu); atomic_inc(get_pool_nr_running(worker->pool)); } } @@ -1278,7 +1278,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, } /* gcwq determined, get cwq and queue */ - cwq = get_cwq(gcwq->cpu, wq); + cwq = get_cwq(gcwq->pools[0].cpu, wq); trace_workqueue_queue_work(req_cpu, cwq, work); if (WARN_ON(!list_empty(&work->entry))) { @@ -1385,20 +1385,20 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, /* * This stores cwq for the moment, for the timer_fn. Note that the - * work's gcwq is preserved to allow reentrance detection for + * work's pool is preserved to allow reentrance detection for * delayed works. */ if (!(wq->flags & WQ_UNBOUND)) { - struct global_cwq *gcwq = get_work_gcwq(work); + struct worker_pool *pool = get_work_pool(work); /* - * If we cannot get the last gcwq from @work directly, + * If we cannot get the last pool from @work directly, * select the last CPU such that it avoids unnecessarily * triggering non-reentrancy check in __queue_work(). */ lcpu = cpu; - if (gcwq) - lcpu = gcwq->cpu; + if (pool) + lcpu = pool->cpu; if (lcpu == WORK_CPU_UNBOUND) lcpu = raw_smp_processor_id(); } else { @@ -1619,14 +1619,14 @@ __acquires(&gcwq->lock) * against POOL_DISASSOCIATED. */ if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); + set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); spin_lock_irq(&gcwq->lock); if (pool->flags & POOL_DISASSOCIATED) return false; - if (task_cpu(task) == gcwq->cpu && + if (task_cpu(task) == pool->cpu && cpumask_equal(¤t->cpus_allowed, - get_cpu_mask(gcwq->cpu))) + get_cpu_mask(pool->cpu))) return true; spin_unlock_irq(&gcwq->lock); @@ -1747,7 +1747,7 @@ static void rebind_workers(struct global_cwq *gcwq) else wq = system_wq; - insert_work(get_cwq(gcwq->cpu, wq), rebind_work, + insert_work(get_cwq(pool->cpu, wq), rebind_work, worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } @@ -1806,10 +1806,10 @@ static struct worker *create_worker(struct worker_pool *pool) worker->pool = pool; worker->id = id; - if (gcwq->cpu != WORK_CPU_UNBOUND) + if (pool->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, - worker, cpu_to_node(gcwq->cpu), - "kworker/%u:%d%s", gcwq->cpu, id, pri); + worker, cpu_to_node(pool->cpu), + "kworker/%u:%d%s", pool->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d%s", id, pri); @@ -1829,7 +1829,7 @@ static struct worker *create_worker(struct worker_pool *pool) * online, make sure every worker has %PF_THREAD_BOUND set. */ if (!(pool->flags & POOL_DISASSOCIATED)) { - kthread_bind(worker->task, gcwq->cpu); + kthread_bind(worker->task, pool->cpu); } else { worker->task->flags |= PF_THREAD_BOUND; worker->flags |= WORKER_UNBOUND; @@ -1936,7 +1936,7 @@ static bool send_mayday(struct work_struct *work) return false; /* mayday mayday mayday */ - cpu = cwq->pool->gcwq->cpu; + cpu = cwq->pool->cpu; /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ if (cpu == WORK_CPU_UNBOUND) cpu = 0; @@ -2193,7 +2193,7 @@ __acquires(&gcwq->lock) */ WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && !(pool->flags & POOL_DISASSOCIATED) && - raw_smp_processor_id() != gcwq->cpu); + raw_smp_processor_id() != pool->cpu); /* * A single work shouldn't be executed concurrently by @@ -3553,7 +3553,7 @@ static void gcwq_unbind_fn(struct work_struct *work) struct hlist_node *pos; int i; - BUG_ON(gcwq->cpu != smp_processor_id()); + BUG_ON(gcwq->pools[0].cpu != smp_processor_id()); gcwq_claim_assoc_and_lock(gcwq); @@ -3860,10 +3860,10 @@ static int __init init_workqueues(void) struct worker_pool *pool; spin_lock_init(&gcwq->lock); - gcwq->cpu = cpu; for_each_worker_pool(pool, gcwq) { pool->gcwq = gcwq; + pool->cpu = cpu; pool->flags |= POOL_DISASSOCIATED; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); -- cgit v1.2.3 From d565ed6309300304de4a865a04adef07a85edc45 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: move global_cwq->lock to worker_pool Move gcwq->lock to pool->lock. The conversion is mostly straight-forward. Things worth noting are * In many places, this removes the need to use gcwq completely. pool is used directly instead. get_std_worker_pool() is added to help some of these conversions. This also leaves get_work_gcwq() without any user. Removed. * In hotplug and freezer paths, the pools belonging to a CPU are often processed together. This patch makes those paths hold locks of all pools, with highpri lock nested inside, to keep the conversion straight-forward. These nested lockings will be removed by following patches. This is part of an effort to remove global_cwq and make worker_pool the top level abstraction, which in turn will help implementing worker pools with user-specified attributes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 316 ++++++++++++++++++++++++++--------------------------- 1 file changed, 154 insertions(+), 162 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 366132bd226f..c93651208760 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -108,12 +108,12 @@ enum { * P: Preemption protected. Disabling preemption is enough and should * only be modified and accessed from the local cpu. * - * L: gcwq->lock protected. Access with gcwq->lock held. + * L: pool->lock protected. Access with pool->lock held. * - * X: During normal operation, modification requires gcwq->lock and - * should be done only from local cpu. Either disabling preemption - * on local cpu or grabbing gcwq->lock is enough for read access. - * If POOL_DISASSOCIATED is set, it's identical to L. + * X: During normal operation, modification requires pool->lock and should + * be done only from local cpu. Either disabling preemption on local + * cpu or grabbing pool->lock is enough for read access. If + * POOL_DISASSOCIATED is set, it's identical to L. * * F: wq->flush_mutex protected. * @@ -124,6 +124,7 @@ enum { struct worker_pool { struct global_cwq *gcwq; /* I: the owning gcwq */ + spinlock_t lock; /* the pool lock */ unsigned int cpu; /* I: the associated cpu */ int id; /* I: pool ID */ unsigned int flags; /* X: flags */ @@ -152,8 +153,6 @@ struct worker_pool { * target workqueues. */ struct global_cwq { - spinlock_t lock; /* the gcwq lock */ - struct worker_pool pools[NR_STD_WORKER_POOLS]; /* normal and highpri pools */ } ____cacheline_aligned_in_smp; @@ -487,6 +486,13 @@ static struct worker_pool *worker_pool_by_id(int pool_id) return idr_find(&worker_pool_idr, pool_id); } +static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) +{ + struct global_cwq *gcwq = get_gcwq(cpu); + + return &gcwq->pools[highpri]; +} + static atomic_t *get_pool_nr_running(struct worker_pool *pool) { int cpu = pool->cpu; @@ -628,13 +634,6 @@ static int get_work_pool_id(struct work_struct *work) return pool ? pool->id : WORK_OFFQ_POOL_NONE; } -static struct global_cwq *get_work_gcwq(struct work_struct *work) -{ - struct worker_pool *pool = get_work_pool(work); - - return pool ? pool->gcwq : NULL; -} - static void mark_work_canceling(struct work_struct *work) { unsigned long pool_id = get_work_pool_id(work); @@ -653,7 +652,7 @@ static bool work_is_canceling(struct work_struct *work) /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that - * they're being called with gcwq->lock held. + * they're being called with pool->lock held. */ static bool __need_more_worker(struct worker_pool *pool) @@ -738,7 +737,7 @@ static struct worker *first_worker(struct worker_pool *pool) * Wake up the first idle worker of @pool. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void wake_up_worker(struct worker_pool *pool) { @@ -813,7 +812,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * NOT_RUNNING is clear. This means that we're bound to and * running on the local cpu w/ rq lock held and preemption * disabled, which in turn means that none else could be - * manipulating idle_list, so dereferencing idle_list without gcwq + * manipulating idle_list, so dereferencing idle_list without pool * lock is safe. */ if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) @@ -832,7 +831,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * woken up. * * CONTEXT: - * spin_lock_irq(gcwq->lock) + * spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags, bool wakeup) @@ -869,7 +868,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, * Clear @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(gcwq->lock) + * spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { @@ -918,7 +917,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * function. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). * * RETURNS: * Pointer to worker which is executing @work if found, NULL @@ -954,7 +953,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, * nested inside outer list_for_each_entry_safe(). * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) @@ -1007,7 +1006,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) * decrement nr_in_flight of its cwq and handle workqueue flushing. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) { @@ -1071,7 +1070,7 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) static int try_to_grab_pending(struct work_struct *work, bool is_dwork, unsigned long *flags) { - struct global_cwq *gcwq; + struct worker_pool *pool; local_irq_save(*flags); @@ -1096,19 +1095,19 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ - gcwq = get_work_gcwq(work); - if (!gcwq) + pool = get_work_pool(work); + if (!pool) goto fail; - spin_lock(&gcwq->lock); + spin_lock(&pool->lock); if (!list_empty(&work->entry)) { /* - * This work is queued, but perhaps we locked the wrong gcwq. - * In that case we must see the new value after rmb(), see - * insert_work()->wmb(). + * This work is queued, but perhaps we locked the wrong + * pool. In that case we must see the new value after + * rmb(), see insert_work()->wmb(). */ smp_rmb(); - if (gcwq == get_work_gcwq(work)) { + if (pool == get_work_pool(work)) { debug_work_deactivate(work); /* @@ -1126,11 +1125,11 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, cwq_dec_nr_in_flight(get_work_cwq(work), get_work_color(work)); - spin_unlock(&gcwq->lock); + spin_unlock(&pool->lock); return 1; } } - spin_unlock(&gcwq->lock); + spin_unlock(&pool->lock); fail: local_irq_restore(*flags); if (work_is_canceling(work)) @@ -1150,7 +1149,7 @@ fail: * @extra_flags is or'd to work_struct flags. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head, @@ -1193,23 +1192,22 @@ static bool is_chained_work(struct workqueue_struct *wq) for_each_gcwq_cpu(cpu) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct worker_pool *pool = cwq->pool; - struct global_cwq *gcwq = pool->gcwq; struct worker *worker; struct hlist_node *pos; int i; - spin_lock_irqsave(&gcwq->lock, flags); + spin_lock_irqsave(&pool->lock, flags); for_each_busy_worker(worker, i, pos, pool) { if (worker->task != current) continue; - spin_unlock_irqrestore(&gcwq->lock, flags); + spin_unlock_irqrestore(&pool->lock, flags); /* * I'm @worker, no locking necessary. See if @work * is headed to the same workqueue. */ return worker->current_cwq->wq == wq; } - spin_unlock_irqrestore(&gcwq->lock, flags); + spin_unlock_irqrestore(&pool->lock, flags); } return false; } @@ -1217,7 +1215,8 @@ static bool is_chained_work(struct workqueue_struct *wq) static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { - struct global_cwq *gcwq; + bool highpri = wq->flags & WQ_HIGHPRI; + struct worker_pool *pool; struct cpu_workqueue_struct *cwq; struct list_head *worklist; unsigned int work_flags; @@ -1238,7 +1237,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, WARN_ON_ONCE(!is_chained_work(wq))) return; - /* determine gcwq to use */ + /* determine pool to use */ if (!(wq->flags & WQ_UNBOUND)) { struct worker_pool *last_pool; @@ -1251,38 +1250,37 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, * work needs to be queued on that cpu to guarantee * non-reentrancy. */ - gcwq = get_gcwq(cpu); + pool = get_std_worker_pool(cpu, highpri); last_pool = get_work_pool(work); - if (last_pool && last_pool->gcwq != gcwq) { - struct global_cwq *last_gcwq = last_pool->gcwq; + if (last_pool && last_pool != pool) { struct worker *worker; - spin_lock(&last_gcwq->lock); + spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); if (worker && worker->current_cwq->wq == wq) - gcwq = last_gcwq; + pool = last_pool; else { /* meh... not running there, queue here */ - spin_unlock(&last_gcwq->lock); - spin_lock(&gcwq->lock); + spin_unlock(&last_pool->lock); + spin_lock(&pool->lock); } } else { - spin_lock(&gcwq->lock); + spin_lock(&pool->lock); } } else { - gcwq = get_gcwq(WORK_CPU_UNBOUND); - spin_lock(&gcwq->lock); + pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri); + spin_lock(&pool->lock); } - /* gcwq determined, get cwq and queue */ - cwq = get_cwq(gcwq->pools[0].cpu, wq); + /* pool determined, get cwq and queue */ + cwq = get_cwq(pool->cpu, wq); trace_workqueue_queue_work(req_cpu, cwq, work); if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&gcwq->lock); + spin_unlock(&pool->lock); return; } @@ -1300,7 +1298,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, insert_work(cwq, work, worklist, work_flags); - spin_unlock(&gcwq->lock); + spin_unlock(&pool->lock); } /** @@ -1523,7 +1521,7 @@ EXPORT_SYMBOL_GPL(mod_delayed_work); * necessary. * * LOCKING: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { @@ -1546,7 +1544,7 @@ static void worker_enter_idle(struct worker *worker) /* * Sanity check nr_running. Because gcwq_unbind_fn() releases - * gcwq->lock between setting %WORKER_UNBOUND and zapping + * pool->lock between setting %WORKER_UNBOUND and zapping * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. */ @@ -1562,7 +1560,7 @@ static void worker_enter_idle(struct worker *worker) * @worker is leaving idle state. Update stats. * * LOCKING: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void worker_leave_idle(struct worker *worker) { @@ -1597,7 +1595,7 @@ static void worker_leave_idle(struct worker *worker) * guarantee the scheduling requirement described in the first paragraph. * * CONTEXT: - * Might sleep. Called without any lock but returns with gcwq->lock + * Might sleep. Called without any lock but returns with pool->lock * held. * * RETURNS: @@ -1605,10 +1603,9 @@ static void worker_leave_idle(struct worker *worker) * bound), %false if offline. */ static bool worker_maybe_bind_and_lock(struct worker *worker) -__acquires(&gcwq->lock) +__acquires(&pool->lock) { struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; struct task_struct *task = worker->task; while (true) { @@ -1621,14 +1618,14 @@ __acquires(&gcwq->lock) if (!(pool->flags & POOL_DISASSOCIATED)) set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (pool->flags & POOL_DISASSOCIATED) return false; if (task_cpu(task) == pool->cpu && cpumask_equal(¤t->cpus_allowed, get_cpu_mask(pool->cpu))) return true; - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); /* * We've raced with CPU hot[un]plug. Give it a breather @@ -1647,15 +1644,13 @@ __acquires(&gcwq->lock) */ static void idle_worker_rebind(struct worker *worker) { - struct global_cwq *gcwq = worker->pool->gcwq; - /* CPU may go down again inbetween, clear UNBOUND only on success */ if (worker_maybe_bind_and_lock(worker)) worker_clr_flags(worker, WORKER_UNBOUND); /* rebind complete, become available again */ list_add(&worker->entry, &worker->pool->idle_list); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&worker->pool->lock); } /* @@ -1667,12 +1662,11 @@ static void idle_worker_rebind(struct worker *worker) static void busy_worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); - struct global_cwq *gcwq = worker->pool->gcwq; if (worker_maybe_bind_and_lock(worker)) worker_clr_flags(worker, WORKER_UNBOUND); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&worker->pool->lock); } /** @@ -1704,10 +1698,10 @@ static void rebind_workers(struct global_cwq *gcwq) struct hlist_node *pos; int i; - lockdep_assert_held(&gcwq->lock); - - for_each_worker_pool(pool, gcwq) + for_each_worker_pool(pool, gcwq) { lockdep_assert_held(&pool->assoc_mutex); + lockdep_assert_held(&pool->lock); + } /* dequeue and kick idle ones */ for_each_worker_pool(pool, gcwq) { @@ -1785,19 +1779,18 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct worker_pool *pool) { - struct global_cwq *gcwq = pool->gcwq; const char *pri = std_worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); while (ida_get_new(&pool->worker_ida, &id)) { - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) goto fail; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); } - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); worker = alloc_worker(); if (!worker) @@ -1838,9 +1831,9 @@ static struct worker *create_worker(struct worker_pool *pool) return worker; fail: if (id >= 0) { - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); ida_remove(&pool->worker_ida, id); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } kfree(worker); return NULL; @@ -1853,7 +1846,7 @@ fail: * Make the gcwq aware of @worker and start it. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void start_worker(struct worker *worker) { @@ -1870,12 +1863,11 @@ static void start_worker(struct worker *worker) * Destroy @worker and adjust @gcwq stats accordingly. * * CONTEXT: - * spin_lock_irq(gcwq->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock) which is released and regrabbed. */ static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; int id = worker->id; /* sanity check frenzy */ @@ -1890,21 +1882,20 @@ static void destroy_worker(struct worker *worker) list_del_init(&worker->entry); worker->flags |= WORKER_DIE; - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); kthread_stop(worker->task); kfree(worker); - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); ida_remove(&pool->worker_ida, id); } static void idle_worker_timeout(unsigned long __pool) { struct worker_pool *pool = (void *)__pool; - struct global_cwq *gcwq = pool->gcwq; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (too_many_workers(pool)) { struct worker *worker; @@ -1923,7 +1914,7 @@ static void idle_worker_timeout(unsigned long __pool) } } - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } static bool send_mayday(struct work_struct *work) @@ -1948,10 +1939,9 @@ static bool send_mayday(struct work_struct *work) static void gcwq_mayday_timeout(unsigned long __pool) { struct worker_pool *pool = (void *)__pool; - struct global_cwq *gcwq = pool->gcwq; struct work_struct *work; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (need_to_create_worker(pool)) { /* @@ -1964,7 +1954,7 @@ static void gcwq_mayday_timeout(unsigned long __pool) send_mayday(work); } - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -1983,24 +1973,22 @@ static void gcwq_mayday_timeout(unsigned long __pool) * may_start_working() true. * * LOCKING: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. * * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true + * false if no action was taken and pool->lock stayed locked, true * otherwise. */ static bool maybe_create_worker(struct worker_pool *pool) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) +__releases(&pool->lock) +__acquires(&pool->lock) { - struct global_cwq *gcwq = pool->gcwq; - if (!need_to_create_worker(pool)) return false; restart: - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); @@ -2011,7 +1999,7 @@ restart: worker = create_worker(pool); if (worker) { del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); start_worker(worker); BUG_ON(need_to_create_worker(pool)); return true; @@ -2028,7 +2016,7 @@ restart: } del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (need_to_create_worker(pool)) goto restart; return true; @@ -2042,11 +2030,11 @@ restart: * IDLE_WORKER_TIMEOUT. * * LOCKING: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Called only from manager. * * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true + * false if no action was taken and pool->lock stayed locked, true * otherwise. */ static bool maybe_destroy_workers(struct worker_pool *pool) @@ -2085,12 +2073,12 @@ static bool maybe_destroy_workers(struct worker_pool *pool) * and may_start_working() is true. * * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true if - * some action was taken. + * spin_lock_irq(pool->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. */ static bool manage_workers(struct worker *worker) { @@ -2112,10 +2100,10 @@ static bool manage_workers(struct worker *worker) * manager against CPU hotplug. * * assoc_mutex would always be free unless CPU hotplug is in - * progress. trylock first without dropping @gcwq->lock. + * progress. trylock first without dropping @pool->lock. */ if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { - spin_unlock_irq(&pool->gcwq->lock); + spin_unlock_irq(&pool->lock); mutex_lock(&pool->assoc_mutex); /* * CPU hotplug could have happened while we were waiting @@ -2162,15 +2150,14 @@ static bool manage_workers(struct worker *worker) * call this function to process a work. * * CONTEXT: - * spin_lock_irq(gcwq->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) +__releases(&pool->lock) +__acquires(&pool->lock) { struct cpu_workqueue_struct *cwq = get_work_cwq(work); struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; int work_color; struct worker *collision; @@ -2225,7 +2212,7 @@ __acquires(&gcwq->lock) worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); /* - * Unbound gcwq isn't concurrency managed and work items should be + * Unbound pool isn't concurrency managed and work items should be * executed ASAP. Wake up another worker if necessary. */ if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) @@ -2233,13 +2220,13 @@ __acquires(&gcwq->lock) /* * Record the last pool and clear PENDING which should be the last - * update to @work. Also, do this inside @gcwq->lock so that + * update to @work. Also, do this inside @pool->lock so that * PENDING and queued state changes happen together while IRQ is * disabled. */ set_work_pool_and_clear_pending(work, pool->id); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); @@ -2262,7 +2249,7 @@ __acquires(&gcwq->lock) dump_stack(); } - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); /* clear cpu intensive status */ if (unlikely(cpu_intensive)) @@ -2285,7 +2272,7 @@ __acquires(&gcwq->lock) * fetches a work from the top and executes it. * * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. */ static void process_scheduled_works(struct worker *worker) @@ -2311,16 +2298,15 @@ static int worker_thread(void *__worker) { struct worker *worker = __worker; struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; /* tell the scheduler that this is a workqueue worker */ worker->task->flags |= PF_WQ_WORKER; woke_up: - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); /* we are off idle list if destruction or rebind is requested */ if (unlikely(list_empty(&worker->entry))) { - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); /* if DIE is set, destruction is requested */ if (worker->flags & WORKER_DIE) { @@ -2379,15 +2365,15 @@ sleep: goto recheck; /* - * gcwq->lock is held and there's no work to process and no - * need to manage, sleep. Workers are woken up only while - * holding gcwq->lock or from local cpu, so setting the - * current state before releasing gcwq->lock is enough to - * prevent losing any event. + * pool->lock is held and there's no work to process and no need to + * manage, sleep. Workers are woken up only while holding + * pool->lock or from local cpu, so setting the current state + * before releasing pool->lock is enough to prevent losing any + * event. */ worker_enter_idle(worker); __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } @@ -2443,7 +2429,6 @@ repeat: unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); struct worker_pool *pool = cwq->pool; - struct global_cwq *gcwq = pool->gcwq; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); @@ -2465,14 +2450,14 @@ repeat: process_scheduled_works(rescuer); /* - * Leave this gcwq. If keep_working() is %true, notify a + * Leave this pool. If keep_working() is %true, notify a * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ if (keep_working(pool)) wake_up_worker(pool); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } /* rescuers should never participate in concurrency management */ @@ -2514,7 +2499,7 @@ static void wq_barrier_func(struct work_struct *work) * underneath us, so we can't reliably determine cwq from @target. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, struct wq_barrier *barr, @@ -2524,7 +2509,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, unsigned int linked = 0; /* - * debugobject calls are safe here even with gcwq->lock locked + * debugobject calls are safe here even with pool->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. @@ -2597,9 +2582,9 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = cwq->pool->gcwq; + struct worker_pool *pool = cwq->pool; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (flush_color >= 0) { BUG_ON(cwq->flush_color != -1); @@ -2616,7 +2601,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, cwq->work_color = work_color; } - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) @@ -2813,9 +2798,9 @@ reflush: struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); bool drained; - spin_lock_irq(&cwq->pool->gcwq->lock); + spin_lock_irq(&cwq->pool->lock); drained = !cwq->nr_active && list_empty(&cwq->delayed_works); - spin_unlock_irq(&cwq->pool->gcwq->lock); + spin_unlock_irq(&cwq->pool->lock); if (drained) continue; @@ -2838,25 +2823,23 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) { struct worker *worker = NULL; struct worker_pool *pool; - struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; might_sleep(); pool = get_work_pool(work); if (!pool) return false; - gcwq = pool->gcwq; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (!list_empty(&work->entry)) { /* * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued to a different gcwq under us, we + * If it was re-queued to a different pool under us, we * are not going to wait. */ smp_rmb(); cwq = get_work_cwq(work); - if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) + if (unlikely(!cwq || pool != cwq->pool)) goto already_gone; } else { worker = find_worker_executing_work(pool, work); @@ -2866,7 +2849,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) } insert_wq_barrier(cwq, barr, work, worker); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); /* * If @max_active is 1 or rescuer is in use, flushing another work @@ -2882,7 +2865,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) return true; already_gone: - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); return false; } @@ -3404,7 +3387,7 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); * increased. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) { @@ -3438,15 +3421,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct worker_pool *pool = cwq->pool; - struct global_cwq *gcwq = pool->gcwq; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (!(wq->flags & WQ_FREEZABLE) || !(pool->flags & POOL_FREEZING)) cwq_set_max_active(cwq, max_active); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } spin_unlock(&workqueue_lock); @@ -3489,22 +3471,20 @@ EXPORT_SYMBOL_GPL(workqueue_congested); unsigned int work_busy(struct work_struct *work) { struct worker_pool *pool = get_work_pool(work); - struct global_cwq *gcwq; unsigned long flags; unsigned int ret = 0; if (!pool) return 0; - gcwq = pool->gcwq; - spin_lock_irqsave(&gcwq->lock, flags); + spin_lock_irqsave(&pool->lock, flags); if (work_pending(work)) ret |= WORK_BUSY_PENDING; if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&gcwq->lock, flags); + spin_unlock_irqrestore(&pool->lock, flags); return ret; } @@ -3532,7 +3512,10 @@ static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) for_each_worker_pool(pool, gcwq) mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); - spin_lock_irq(&gcwq->lock); + + local_irq_disable(); + for_each_worker_pool(pool, gcwq) + spin_lock_nested(&pool->lock, pool - gcwq->pools); } /* release manager positions */ @@ -3540,7 +3523,10 @@ static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) { struct worker_pool *pool; - spin_unlock_irq(&gcwq->lock); + for_each_worker_pool(pool, gcwq) + spin_unlock(&pool->lock); + local_irq_enable(); + for_each_worker_pool(pool, gcwq) mutex_unlock(&pool->assoc_mutex); } @@ -3621,9 +3607,9 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, if (!worker) return NOTIFY_BAD; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); start_worker(worker); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } break; @@ -3709,7 +3695,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * gcwq->worklist. * * CONTEXT: - * Grabs and releases workqueue_lock and gcwq->lock's. + * Grabs and releases workqueue_lock and pool->lock's. */ void freeze_workqueues_begin(void) { @@ -3725,9 +3711,11 @@ void freeze_workqueues_begin(void) struct worker_pool *pool; struct workqueue_struct *wq; - spin_lock_irq(&gcwq->lock); + local_irq_disable(); for_each_worker_pool(pool, gcwq) { + spin_lock_nested(&pool->lock, pool - gcwq->pools); + WARN_ON_ONCE(pool->flags & POOL_FREEZING); pool->flags |= POOL_FREEZING; } @@ -3739,7 +3727,9 @@ void freeze_workqueues_begin(void) cwq->max_active = 0; } - spin_unlock_irq(&gcwq->lock); + for_each_worker_pool(pool, gcwq) + spin_unlock(&pool->lock); + local_irq_enable(); } spin_unlock(&workqueue_lock); @@ -3798,7 +3788,7 @@ out_unlock: * frozen works are transferred to their respective gcwq worklists. * * CONTEXT: - * Grabs and releases workqueue_lock and gcwq->lock's. + * Grabs and releases workqueue_lock and pool->lock's. */ void thaw_workqueues(void) { @@ -3814,9 +3804,11 @@ void thaw_workqueues(void) struct worker_pool *pool; struct workqueue_struct *wq; - spin_lock_irq(&gcwq->lock); + local_irq_disable(); for_each_worker_pool(pool, gcwq) { + spin_lock_nested(&pool->lock, pool - gcwq->pools); + WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); pool->flags &= ~POOL_FREEZING; } @@ -3831,10 +3823,11 @@ void thaw_workqueues(void) cwq_set_max_active(cwq, wq->saved_max_active); } - for_each_worker_pool(pool, gcwq) + for_each_worker_pool(pool, gcwq) { wake_up_worker(pool); - - spin_unlock_irq(&gcwq->lock); + spin_unlock(&pool->lock); + } + local_irq_enable(); } workqueue_freezing = false; @@ -3859,10 +3852,9 @@ static int __init init_workqueues(void) struct global_cwq *gcwq = get_gcwq(cpu); struct worker_pool *pool; - spin_lock_init(&gcwq->lock); - for_each_worker_pool(pool, gcwq) { pool->gcwq = gcwq; + spin_lock_init(&pool->lock); pool->cpu = cpu; pool->flags |= POOL_DISASSOCIATED; INIT_LIST_HEAD(&pool->worklist); @@ -3897,9 +3889,9 @@ static int __init init_workqueues(void) worker = create_worker(pool); BUG_ON(!worker); - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); start_worker(worker); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } } -- cgit v1.2.3 From 94cf58bb2907bd2702fce2266955e29ab5261f53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: make hotplug processing per-pool Instead of holding locks from both pools and then processing the pools together, make hotplug processing per-pool - grab locks of one pool, process it, release it and then proceed to the next pool. rebind_workers() is updated to take and process @pool instead of @gcwq which results in a lot of de-indentation. gcwq_claim_assoc_and_lock() and its counterpart are replaced with in-line per-pool locking. While this patch changes processing order across pools, order within each pool remains the same. As each pool is independent, this shouldn't break anything. This is part of an effort to remove global_cwq and make worker_pool the top level abstraction, which in turn will help implementing worker pools with user-specified attributes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 149 ++++++++++++++++++++++------------------------------- 1 file changed, 62 insertions(+), 87 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c93651208760..fd400f8c9514 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1670,10 +1670,10 @@ static void busy_worker_rebind_fn(struct work_struct *work) } /** - * rebind_workers - rebind all workers of a gcwq to the associated CPU - * @gcwq: gcwq of interest + * rebind_workers - rebind all workers of a pool to the associated CPU + * @pool: pool of interest * - * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding + * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding * is different for idle and busy ones. * * Idle ones will be removed from the idle_list and woken up. They will @@ -1691,60 +1691,53 @@ static void busy_worker_rebind_fn(struct work_struct *work) * including the manager will not appear on @idle_list until rebind is * complete, making local wake-ups safe. */ -static void rebind_workers(struct global_cwq *gcwq) +static void rebind_workers(struct worker_pool *pool) { - struct worker_pool *pool; struct worker *worker, *n; struct hlist_node *pos; int i; - for_each_worker_pool(pool, gcwq) { - lockdep_assert_held(&pool->assoc_mutex); - lockdep_assert_held(&pool->lock); - } + lockdep_assert_held(&pool->assoc_mutex); + lockdep_assert_held(&pool->lock); /* dequeue and kick idle ones */ - for_each_worker_pool(pool, gcwq) { - list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { - /* - * idle workers should be off @pool->idle_list - * until rebind is complete to avoid receiving - * premature local wake-ups. - */ - list_del_init(&worker->entry); + list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { + /* + * idle workers should be off @pool->idle_list until rebind + * is complete to avoid receiving premature local wake-ups. + */ + list_del_init(&worker->entry); - /* - * worker_thread() will see the above dequeuing - * and call idle_worker_rebind(). - */ - wake_up_process(worker->task); - } + /* + * worker_thread() will see the above dequeuing and call + * idle_worker_rebind(). + */ + wake_up_process(worker->task); + } - /* rebind busy workers */ - for_each_busy_worker(worker, i, pos, pool) { - struct work_struct *rebind_work = &worker->rebind_work; - struct workqueue_struct *wq; + /* rebind busy workers */ + for_each_busy_worker(worker, i, pos, pool) { + struct work_struct *rebind_work = &worker->rebind_work; + struct workqueue_struct *wq; - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; - debug_work_activate(rebind_work); + debug_work_activate(rebind_work); - /* - * wq doesn't really matter but let's keep - * @worker->pool and @cwq->pool consistent for - * sanity. - */ - if (std_worker_pool_pri(worker->pool)) - wq = system_highpri_wq; - else - wq = system_wq; - - insert_work(get_cwq(pool->cpu, wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } + /* + * wq doesn't really matter but let's keep @worker->pool + * and @cwq->pool consistent for sanity. + */ + if (std_worker_pool_pri(worker->pool)) + wq = system_highpri_wq; + else + wq = system_wq; + + insert_work(get_cwq(pool->cpu, wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); } } @@ -3497,7 +3490,7 @@ EXPORT_SYMBOL_GPL(work_busy); * are a lot of assumptions on strong associations among work, cwq and * gcwq which make migrating pending and scheduled works very * difficult to implement without impacting hot paths. Secondly, - * gcwqs serve mix of short, long and very long running works making + * worker pools serve mix of short, long and very long running works making * blocked draining impractical. * * This is solved by allowing the pools to be disassociated from the CPU @@ -3505,32 +3498,6 @@ EXPORT_SYMBOL_GPL(work_busy); * cpu comes back online. */ -/* claim manager positions of all pools */ -static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) -{ - struct worker_pool *pool; - - for_each_worker_pool(pool, gcwq) - mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); - - local_irq_disable(); - for_each_worker_pool(pool, gcwq) - spin_lock_nested(&pool->lock, pool - gcwq->pools); -} - -/* release manager positions */ -static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) -{ - struct worker_pool *pool; - - for_each_worker_pool(pool, gcwq) - spin_unlock(&pool->lock); - local_irq_enable(); - - for_each_worker_pool(pool, gcwq) - mutex_unlock(&pool->assoc_mutex); -} - static void gcwq_unbind_fn(struct work_struct *work) { struct global_cwq *gcwq = get_gcwq(smp_processor_id()); @@ -3539,17 +3506,19 @@ static void gcwq_unbind_fn(struct work_struct *work) struct hlist_node *pos; int i; - BUG_ON(gcwq->pools[0].cpu != smp_processor_id()); + for_each_worker_pool(pool, gcwq) { + BUG_ON(pool->cpu != smp_processor_id()); - gcwq_claim_assoc_and_lock(gcwq); + mutex_lock(&pool->assoc_mutex); + spin_lock_irq(&pool->lock); - /* - * We've claimed all manager positions. Make all workers unbound - * and set DISASSOCIATED. Before this, all workers except for the - * ones which are still executing works from before the last CPU - * down must be on the cpu. After this, they may become diasporas. - */ - for_each_worker_pool(pool, gcwq) { + /* + * We've claimed all manager positions. Make all workers + * unbound and set DISASSOCIATED. Before this, all workers + * except for the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they may become diasporas. + */ list_for_each_entry(worker, &pool->idle_list, entry) worker->flags |= WORKER_UNBOUND; @@ -3557,9 +3526,10 @@ static void gcwq_unbind_fn(struct work_struct *work) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; - } - gcwq_release_assoc_and_unlock(gcwq); + spin_unlock_irq(&pool->lock); + mutex_unlock(&pool->assoc_mutex); + } /* * Call schedule() so that we cross rq->lock and thus can guarantee @@ -3615,11 +3585,16 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - gcwq_claim_assoc_and_lock(gcwq); - for_each_worker_pool(pool, gcwq) + for_each_worker_pool(pool, gcwq) { + mutex_lock(&pool->assoc_mutex); + spin_lock_irq(&pool->lock); + pool->flags &= ~POOL_DISASSOCIATED; - rebind_workers(gcwq); - gcwq_release_assoc_and_unlock(gcwq); + rebind_workers(pool); + + spin_unlock_irq(&pool->lock); + mutex_unlock(&pool->assoc_mutex); + } break; } return NOTIFY_OK; -- cgit v1.2.3 From a1056305fa98c7e13b38718658a8b07a5d926460 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:33 -0800 Subject: workqueue: make freezing/thawing per-pool Instead of holding locks from both pools and then processing the pools together, make freezing/thwaing per-pool - grab locks of one pool, process it, release it and then proceed to the next pool. While this patch changes processing order across pools, order within each pool remains the same. As each pool is independent, this shouldn't break anything. This is part of an effort to remove global_cwq and make worker_pool the top level abstraction, which in turn will help implementing worker pools with user-specified attributes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fd400f8c9514..b609bfba134b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3686,25 +3686,22 @@ void freeze_workqueues_begin(void) struct worker_pool *pool; struct workqueue_struct *wq; - local_irq_disable(); - for_each_worker_pool(pool, gcwq) { - spin_lock_nested(&pool->lock, pool - gcwq->pools); + spin_lock_irq(&pool->lock); WARN_ON_ONCE(pool->flags & POOL_FREEZING); pool->flags |= POOL_FREEZING; - } - list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (cwq && wq->flags & WQ_FREEZABLE) - cwq->max_active = 0; - } + if (cwq && cwq->pool == pool && + (wq->flags & WQ_FREEZABLE)) + cwq->max_active = 0; + } - for_each_worker_pool(pool, gcwq) - spin_unlock(&pool->lock); - local_irq_enable(); + spin_unlock_irq(&pool->lock); + } } spin_unlock(&workqueue_lock); @@ -3779,30 +3776,27 @@ void thaw_workqueues(void) struct worker_pool *pool; struct workqueue_struct *wq; - local_irq_disable(); - for_each_worker_pool(pool, gcwq) { - spin_lock_nested(&pool->lock, pool - gcwq->pools); + spin_lock_irq(&pool->lock); WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); pool->flags &= ~POOL_FREEZING; - } - list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (!cwq || !(wq->flags & WQ_FREEZABLE)) - continue; + if (!cwq || cwq->pool != pool || + !(wq->flags & WQ_FREEZABLE)) + continue; - /* restore max_active and repopulate worklist */ - cwq_set_max_active(cwq, wq->saved_max_active); - } + /* restore max_active and repopulate worklist */ + cwq_set_max_active(cwq, wq->saved_max_active); + } - for_each_worker_pool(pool, gcwq) { wake_up_worker(pool); - spin_unlock(&pool->lock); + + spin_unlock_irq(&pool->lock); } - local_irq_enable(); } workqueue_freezing = false; -- cgit v1.2.3 From 38db41d984f17938631420ff78160dda7f182d24 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:34 -0800 Subject: workqueue: replace for_each_worker_pool() with for_each_std_worker_pool() for_each_std_worker_pool() takes @cpu instead of @gcwq. This is part of an effort to remove global_cwq and make worker_pool the top level abstraction, which in turn will help implementing worker pools with user-specified attributes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b609bfba134b..bd639c185da1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -250,9 +250,9 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); #define CREATE_TRACE_POINTS #include -#define for_each_worker_pool(pool, gcwq) \ - for ((pool) = &(gcwq)->pools[0]; \ - (pool) < &(gcwq)->pools[NR_STD_WORKER_POOLS]; (pool)++) +#define for_each_std_worker_pool(pool, cpu) \ + for ((pool) = &get_gcwq((cpu))->pools[0]; \ + (pool) < &get_gcwq((cpu))->pools[NR_STD_WORKER_POOLS]; (pool)++) #define for_each_busy_worker(worker, i, pos, pool) \ hash_for_each(pool->busy_hash, i, pos, worker, hentry) @@ -3500,14 +3500,14 @@ EXPORT_SYMBOL_GPL(work_busy); static void gcwq_unbind_fn(struct work_struct *work) { - struct global_cwq *gcwq = get_gcwq(smp_processor_id()); + int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; struct hlist_node *pos; int i; - for_each_worker_pool(pool, gcwq) { - BUG_ON(pool->cpu != smp_processor_id()); + for_each_std_worker_pool(pool, cpu) { + BUG_ON(cpu != smp_processor_id()); mutex_lock(&pool->assoc_mutex); spin_lock_irq(&pool->lock); @@ -3541,15 +3541,15 @@ static void gcwq_unbind_fn(struct work_struct *work) /* * Sched callbacks are disabled now. Zap nr_running. After this, * nr_running stays zero and need_more_worker() and keep_working() - * are always true as long as the worklist is not empty. @gcwq now - * behaves as unbound (in terms of concurrency management) gcwq - * which is served by workers tied to the CPU. + * are always true as long as the worklist is not empty. Pools on + * @cpu now behave as unbound (in terms of concurrency management) + * pools which are served by workers tied to the CPU. * * On return from this function, the current worker would trigger * unbound chain execution of pending work items if other workers * didn't already. */ - for_each_worker_pool(pool, gcwq) + for_each_std_worker_pool(pool, cpu) atomic_set(get_pool_nr_running(pool), 0); } @@ -3562,12 +3562,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct global_cwq *gcwq = get_gcwq(cpu); struct worker_pool *pool; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - for_each_worker_pool(pool, gcwq) { + for_each_std_worker_pool(pool, cpu) { struct worker *worker; if (pool->nr_workers) @@ -3585,7 +3584,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - for_each_worker_pool(pool, gcwq) { + for_each_std_worker_pool(pool, cpu) { mutex_lock(&pool->assoc_mutex); spin_lock_irq(&pool->lock); @@ -3682,11 +3681,10 @@ void freeze_workqueues_begin(void) workqueue_freezing = true; for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); struct worker_pool *pool; struct workqueue_struct *wq; - for_each_worker_pool(pool, gcwq) { + for_each_std_worker_pool(pool, cpu) { spin_lock_irq(&pool->lock); WARN_ON_ONCE(pool->flags & POOL_FREEZING); @@ -3772,11 +3770,10 @@ void thaw_workqueues(void) goto out_unlock; for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); struct worker_pool *pool; struct workqueue_struct *wq; - for_each_worker_pool(pool, gcwq) { + for_each_std_worker_pool(pool, cpu) { spin_lock_irq(&pool->lock); WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); @@ -3818,11 +3815,10 @@ static int __init init_workqueues(void) /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); struct worker_pool *pool; - for_each_worker_pool(pool, gcwq) { - pool->gcwq = gcwq; + for_each_std_worker_pool(pool, cpu) { + pool->gcwq = get_gcwq(cpu); spin_lock_init(&pool->lock); pool->cpu = cpu; pool->flags |= POOL_DISASSOCIATED; @@ -3847,10 +3843,9 @@ static int __init init_workqueues(void) /* create the initial worker */ for_each_online_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); struct worker_pool *pool; - for_each_worker_pool(pool, gcwq) { + for_each_std_worker_pool(pool, cpu) { struct worker *worker; if (cpu != WORK_CPU_UNBOUND) -- cgit v1.2.3 From 4e8f0a609677a25f504527e50981df146c5b3d08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:34 -0800 Subject: workqueue: remove worker_pool->gcwq The only remaining user of pool->gcwq is std_worker_pool_pri(). Reimplement it using get_gcwq() and remove worker_pool->gcwq. This is part of an effort to remove global_cwq and make worker_pool the top level abstraction, which in turn will help implementing worker pools with user-specified attributes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bd639c185da1..475a447aa6d8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -123,7 +123,6 @@ enum { /* struct worker is defined in workqueue_internal.h */ struct worker_pool { - struct global_cwq *gcwq; /* I: the owning gcwq */ spinlock_t lock; /* the pool lock */ unsigned int cpu; /* I: the associated cpu */ int id; /* I: pool ID */ @@ -451,11 +450,6 @@ static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); -static int std_worker_pool_pri(struct worker_pool *pool) -{ - return pool - pool->gcwq->pools; -} - static struct global_cwq *get_gcwq(unsigned int cpu) { if (cpu != WORK_CPU_UNBOUND) @@ -464,6 +458,11 @@ static struct global_cwq *get_gcwq(unsigned int cpu) return &unbound_global_cwq; } +static int std_worker_pool_pri(struct worker_pool *pool) +{ + return pool - get_gcwq(pool->cpu)->pools; +} + /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) { @@ -3818,7 +3817,6 @@ static int __init init_workqueues(void) struct worker_pool *pool; for_each_std_worker_pool(pool, cpu) { - pool->gcwq = get_gcwq(cpu); spin_lock_init(&pool->lock); pool->cpu = cpu; pool->flags |= POOL_DISASSOCIATED; -- cgit v1.2.3 From a60dc39c016a65bfdbd05c43b3707962d5ed04c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:34 -0800 Subject: workqueue: remove global_cwq global_cwq is now nothing but a container for per-cpu standard worker_pools. Declare the worker pools directly as cpu/unbound_std_worker_pools[] and remove global_cwq. * ____cacheline_aligned_in_smp moved from global_cwq to worker_pool. This probably would have made sense even before this change as we want each pool to be aligned. * get_gcwq() is replaced with std_worker_pools() which returns the pointer to the standard pool array for a given CPU. * __alloc_workqueue_key() updated to use get_std_worker_pool() instead of open-coding pool determination. This is part of an effort to remove global_cwq and make worker_pool the top level abstraction, which in turn will help implementing worker pools with user-specified attributes. v2: Joonsoo pointed out that it'd better to align struct worker_pool rather than the array so that every pool is aligned. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan Cc: Joonsoo Kim --- kernel/workqueue.c | 46 +++++++++++++++++---------------------------- kernel/workqueue_internal.h | 1 - 2 files changed, 17 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 475a447aa6d8..224580f7459c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -144,16 +144,6 @@ struct worker_pool { struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ -}; - -/* - * Global per-cpu workqueue. There's one and only one for each cpu - * and all works are queued and processed here regardless of their - * target workqueues. - */ -struct global_cwq { - struct worker_pool pools[NR_STD_WORKER_POOLS]; - /* normal and highpri pools */ } ____cacheline_aligned_in_smp; /* @@ -250,8 +240,8 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); #include #define for_each_std_worker_pool(pool, cpu) \ - for ((pool) = &get_gcwq((cpu))->pools[0]; \ - (pool) < &get_gcwq((cpu))->pools[NR_STD_WORKER_POOLS]; (pool)++) + for ((pool) = &std_worker_pools(cpu)[0]; \ + (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) #define for_each_busy_worker(worker, i, pos, pool) \ hash_for_each(pool->busy_hash, i, pos, worker, hentry) @@ -427,19 +417,19 @@ static LIST_HEAD(workqueues); static bool workqueue_freezing; /* W: have wqs started freezing? */ /* - * The almighty global cpu workqueues. nr_running is the only field - * which is expected to be used frequently by other cpus via - * try_to_wake_up(). Put it in a separate cacheline. + * The CPU standard worker pools. nr_running is the only field which is + * expected to be used frequently by other cpus via try_to_wake_up(). Put + * it in a separate cacheline. */ -static DEFINE_PER_CPU(struct global_cwq, global_cwq); +static DEFINE_PER_CPU(struct worker_pool [NR_STD_WORKER_POOLS], + cpu_std_worker_pools); static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_STD_WORKER_POOLS]); /* - * Global cpu workqueue and nr_running counter for unbound gcwq. The pools - * for online CPUs have POOL_DISASSOCIATED set, and all their workers have - * WORKER_UNBOUND set. + * Standard worker pools and nr_running counter for unbound CPU. The pools + * have POOL_DISASSOCIATED set, and all workers have WORKER_UNBOUND set. */ -static struct global_cwq unbound_global_cwq; +static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; static atomic_t unbound_pool_nr_running[NR_STD_WORKER_POOLS] = { [0 ... NR_STD_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ }; @@ -450,17 +440,17 @@ static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); -static struct global_cwq *get_gcwq(unsigned int cpu) +static struct worker_pool *std_worker_pools(int cpu) { if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(global_cwq, cpu); + return per_cpu(cpu_std_worker_pools, cpu); else - return &unbound_global_cwq; + return unbound_std_worker_pools; } static int std_worker_pool_pri(struct worker_pool *pool) { - return pool - get_gcwq(pool->cpu)->pools; + return pool - std_worker_pools(pool->cpu); } /* allocate ID and assign it to @pool */ @@ -487,9 +477,9 @@ static struct worker_pool *worker_pool_by_id(int pool_id) static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) { - struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pools = std_worker_pools(cpu); - return &gcwq->pools[highpri]; + return &pools[highpri]; } static atomic_t *get_pool_nr_running(struct worker_pool *pool) @@ -3269,11 +3259,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = get_gcwq(cpu); - int pool_idx = (bool)(flags & WQ_HIGHPRI); BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->pool = &gcwq->pools[pool_idx]; + cwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); cwq->wq = wq; cwq->flush_color = -1; cwq->max_active = max_active; diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index cc35e7e62091..328be4a269aa 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -10,7 +10,6 @@ #include #include -struct global_cwq; struct worker_pool; /* -- cgit v1.2.3 From e6e380ed92555533740d5f670640f6f1868132de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:34 -0800 Subject: workqueue: rename nr_running variables Rename per-cpu and unbound nr_running variables such that they match the pool variables. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 224580f7459c..db8d4b7471ac 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -423,14 +423,15 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ */ static DEFINE_PER_CPU(struct worker_pool [NR_STD_WORKER_POOLS], cpu_std_worker_pools); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_STD_WORKER_POOLS]); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t [NR_STD_WORKER_POOLS], + cpu_std_pool_nr_running); /* * Standard worker pools and nr_running counter for unbound CPU. The pools * have POOL_DISASSOCIATED set, and all workers have WORKER_UNBOUND set. */ static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; -static atomic_t unbound_pool_nr_running[NR_STD_WORKER_POOLS] = { +static atomic_t unbound_std_pool_nr_running[NR_STD_WORKER_POOLS] = { [0 ... NR_STD_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ }; @@ -488,9 +489,9 @@ static atomic_t *get_pool_nr_running(struct worker_pool *pool) int idx = std_worker_pool_pri(pool); if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(pool_nr_running, cpu)[idx]; + return &per_cpu(cpu_std_pool_nr_running, cpu)[idx]; else - return &unbound_pool_nr_running[idx]; + return &unbound_std_pool_nr_running[idx]; } static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, -- cgit v1.2.3 From 706026c2141113886f61e1ad2738c9a7723ec69c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:01:34 -0800 Subject: workqueue: post global_cwq removal cleanups Remove remaining references to gcwq. * __next_gcwq_cpu() steals __next_wq_cpu() name. The original __next_wq_cpu() became __next_cwq_cpu(). * s/for_each_gcwq_cpu/for_each_wq_cpu/ s/for_each_online_gcwq_cpu/for_each_online_wq_cpu/ * s/gcwq_mayday_timeout/pool_mayday_timeout/ * s/gcwq_unbind_fn/wq_unbind_fn/ * Drop references to gcwq in comments. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 104 ++++++++++++++++++++++++++--------------------------- 1 file changed, 52 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index db8d4b7471ac..577de1073f24 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -246,8 +246,8 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); #define for_each_busy_worker(worker, i, pos, pool) \ hash_for_each(pool->busy_hash, i, pos, worker, hentry) -static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, - unsigned int sw) +static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, + unsigned int sw) { if (cpu < nr_cpu_ids) { if (sw & 1) { @@ -261,39 +261,39 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, return WORK_CPU_NONE; } -static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, - struct workqueue_struct *wq) +static inline int __next_cwq_cpu(int cpu, const struct cpumask *mask, + struct workqueue_struct *wq) { - return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); + return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); } /* * CPU iterators * - * An extra gcwq is defined for an invalid cpu number + * An extra cpu number is defined using an invalid cpu number * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any - * specific CPU. The following iterators are similar to - * for_each_*_cpu() iterators but also considers the unbound gcwq. + * specific CPU. The following iterators are similar to for_each_*_cpu() + * iterators but also considers the unbound CPU. * - * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND - * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND + * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND + * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND * for_each_cwq_cpu() : possible CPUs for bound workqueues, * WORK_CPU_UNBOUND for unbound workqueues */ -#define for_each_gcwq_cpu(cpu) \ - for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ +#define for_each_wq_cpu(cpu) \ + for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ (cpu) < WORK_CPU_NONE; \ - (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) + (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) -#define for_each_online_gcwq_cpu(cpu) \ - for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ +#define for_each_online_wq_cpu(cpu) \ + for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ (cpu) < WORK_CPU_NONE; \ - (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) + (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) #define for_each_cwq_cpu(cpu, wq) \ - for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ + for ((cpu) = __next_cwq_cpu(-1, cpu_possible_mask, (wq)); \ (cpu) < WORK_CPU_NONE; \ - (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) + (cpu) = __next_cwq_cpu((cpu), cpu_possible_mask, (wq))) #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -655,7 +655,7 @@ static bool __need_more_worker(struct worker_pool *pool) * running workers. * * Note that, because unbound workers never contribute to nr_running, this - * function will always return %true for unbound gcwq as long as the + * function will always return %true for unbound pools as long as the * worklist isn't empty. */ static bool need_more_worker(struct worker_pool *pool) @@ -1129,14 +1129,14 @@ fail: } /** - * insert_work - insert a work into gcwq + * insert_work - insert a work into a pool * @cwq: cwq @work belongs to * @work: work to insert * @head: insertion point * @extra_flags: extra WORK_STRUCT_* flags to set * - * Insert @work which belongs to @cwq into @gcwq after @head. - * @extra_flags is or'd to work_struct flags. + * Insert @work which belongs to @cwq after @head. @extra_flags is or'd to + * work_struct flags. * * CONTEXT: * spin_lock_irq(pool->lock). @@ -1179,7 +1179,7 @@ static bool is_chained_work(struct workqueue_struct *wq) unsigned long flags; unsigned int cpu; - for_each_gcwq_cpu(cpu) { + for_each_wq_cpu(cpu) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct worker_pool *pool = cwq->pool; struct worker *worker; @@ -1533,7 +1533,7 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because gcwq_unbind_fn() releases + * Sanity check nr_running. Because wq_unbind_fn() releases * pool->lock between setting %WORKER_UNBOUND and zapping * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. @@ -1563,7 +1563,7 @@ static void worker_leave_idle(struct worker *worker) } /** - * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq + * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool * @worker: self * * Works which are scheduled while the cpu is online must at least be @@ -1575,10 +1575,10 @@ static void worker_leave_idle(struct worker *worker) * themselves to the target cpu and may race with cpu going down or * coming online. kthread_bind() can't be used because it may put the * worker to already dead cpu and set_cpus_allowed_ptr() can't be used - * verbatim as it's best effort and blocking and gcwq may be + * verbatim as it's best effort and blocking and pool may be * [dis]associated in the meantime. * - * This function tries set_cpus_allowed() and locks gcwq and verifies the + * This function tries set_cpus_allowed() and locks pool and verifies the * binding against %POOL_DISASSOCIATED which is set during * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker * enters idle state or fetches works without dropping lock, it can @@ -1589,7 +1589,7 @@ static void worker_leave_idle(struct worker *worker) * held. * * RETURNS: - * %true if the associated gcwq is online (@worker is successfully + * %true if the associated pool is online (@worker is successfully * bound), %false if offline. */ static bool worker_maybe_bind_and_lock(struct worker *worker) @@ -1826,7 +1826,7 @@ fail: * start_worker - start a newly created worker * @worker: worker to start * - * Make the gcwq aware of @worker and start it. + * Make the pool aware of @worker and start it. * * CONTEXT: * spin_lock_irq(pool->lock). @@ -1843,7 +1843,7 @@ static void start_worker(struct worker *worker) * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * - * Destroy @worker and adjust @gcwq stats accordingly. + * Destroy @worker and adjust @pool stats accordingly. * * CONTEXT: * spin_lock_irq(pool->lock) which is released and regrabbed. @@ -1919,7 +1919,7 @@ static bool send_mayday(struct work_struct *work) return true; } -static void gcwq_mayday_timeout(unsigned long __pool) +static void pool_mayday_timeout(unsigned long __pool) { struct worker_pool *pool = (void *)__pool; struct work_struct *work; @@ -2047,9 +2047,9 @@ static bool maybe_destroy_workers(struct worker_pool *pool) * manage_workers - manage worker pool * @worker: self * - * Assume the manager role and manage gcwq worker pool @worker belongs + * Assume the manager role and manage the worker pool @worker belongs * to. At any given time, there can be only zero or one manager per - * gcwq. The exclusion is handled automatically by this function. + * pool. The exclusion is handled automatically by this function. * * The caller can safely start processing works on false return. On * true return, it's guaranteed that need_to_create_worker() is false @@ -2092,11 +2092,11 @@ static bool manage_workers(struct worker *worker) * CPU hotplug could have happened while we were waiting * for assoc_mutex. Hotplug itself can't handle us * because manager isn't either on idle or busy list, and - * @gcwq's state and ours could have deviated. + * @pool's state and ours could have deviated. * * As hotplug is now excluded via assoc_mutex, we can * simply try to bind. It will succeed or fail depending - * on @gcwq's current state. Try it and adjust + * on @pool's current state. Try it and adjust * %WORKER_UNBOUND accordingly. */ if (worker_maybe_bind_and_lock(worker)) @@ -2271,8 +2271,8 @@ static void process_scheduled_works(struct worker *worker) * worker_thread - the worker thread function * @__worker: self * - * The gcwq worker thread function. There's a single dynamic pool of - * these per each cpu. These workers process all works regardless of + * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools + * of these per each cpu. These workers process all works regardless of * their specific target workqueue. The only exception is works which * belong to workqueues with a rescuer which will be explained in * rescuer_thread(). @@ -2368,14 +2368,14 @@ sleep: * Workqueue rescuer thread function. There's one rescuer for each * workqueue which has WQ_RESCUER set. * - * Regular work processing on a gcwq may block trying to create a new + * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of * developing into deadlock if some works currently on the same queue * need to be processed to satisfy the GFP_KERNEL allocation. This is * the problem rescuer solves. * - * When such condition is possible, the gcwq summons rescuers of all - * workqueues which have works queued on the gcwq and let them process + * When such condition is possible, the pool summons rescuers of all + * workqueues which have works queued on the pool and let them process * those works so that forward progress can be guaranteed. * * This should happen rarely. @@ -3476,7 +3476,7 @@ EXPORT_SYMBOL_GPL(work_busy); * * There are two challenges in supporting CPU hotplug. Firstly, there * are a lot of assumptions on strong associations among work, cwq and - * gcwq which make migrating pending and scheduled works very + * pool which make migrating pending and scheduled works very * difficult to implement without impacting hot paths. Secondly, * worker pools serve mix of short, long and very long running works making * blocked draining impractical. @@ -3486,7 +3486,7 @@ EXPORT_SYMBOL_GPL(work_busy); * cpu comes back online. */ -static void gcwq_unbind_fn(struct work_struct *work) +static void wq_unbind_fn(struct work_struct *work) { int cpu = smp_processor_id(); struct worker_pool *pool; @@ -3601,7 +3601,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: /* unbinding should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); + INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); queue_work_on(cpu, system_highpri_wq, &unbind_work); flush_work(&unbind_work); break; @@ -3654,7 +3654,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * * Start freezing workqueues. After this function returns, all freezable * workqueues will queue new works to their frozen_works list instead of - * gcwq->worklist. + * pool->worklist. * * CONTEXT: * Grabs and releases workqueue_lock and pool->lock's. @@ -3668,7 +3668,7 @@ void freeze_workqueues_begin(void) BUG_ON(workqueue_freezing); workqueue_freezing = true; - for_each_gcwq_cpu(cpu) { + for_each_wq_cpu(cpu) { struct worker_pool *pool; struct workqueue_struct *wq; @@ -3715,7 +3715,7 @@ bool freeze_workqueues_busy(void) BUG_ON(!workqueue_freezing); - for_each_gcwq_cpu(cpu) { + for_each_wq_cpu(cpu) { struct workqueue_struct *wq; /* * nr_active is monotonically decreasing. It's safe @@ -3743,7 +3743,7 @@ out_unlock: * thaw_workqueues - thaw workqueues * * Thaw workqueues. Normal queueing is restored and all collected - * frozen works are transferred to their respective gcwq worklists. + * frozen works are transferred to their respective pool worklists. * * CONTEXT: * Grabs and releases workqueue_lock and pool->lock's. @@ -3757,7 +3757,7 @@ void thaw_workqueues(void) if (!workqueue_freezing) goto out_unlock; - for_each_gcwq_cpu(cpu) { + for_each_wq_cpu(cpu) { struct worker_pool *pool; struct workqueue_struct *wq; @@ -3801,8 +3801,8 @@ static int __init init_workqueues(void) cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); - /* initialize gcwqs */ - for_each_gcwq_cpu(cpu) { + /* initialize CPU pools */ + for_each_wq_cpu(cpu) { struct worker_pool *pool; for_each_std_worker_pool(pool, cpu) { @@ -3817,7 +3817,7 @@ static int __init init_workqueues(void) pool->idle_timer.function = idle_worker_timeout; pool->idle_timer.data = (unsigned long)pool; - setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, + setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); mutex_init(&pool->assoc_mutex); @@ -3829,7 +3829,7 @@ static int __init init_workqueues(void) } /* create the initial worker */ - for_each_online_gcwq_cpu(cpu) { + for_each_online_wq_cpu(cpu) { struct worker_pool *pool; for_each_std_worker_pool(pool, cpu) { -- cgit v1.2.3 From c91368c4889a0ee5dd06552adbb50ae54f5096fd Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 20 Dec 2012 14:11:30 -0500 Subject: uprobes: remove redundant check We checked for uprobe==NULL earlier, no need to redo that. Signed-off-by: Sasha Levin Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1356030701-16284-22-git-send-email-sasha.levin@oracle.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/uprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index dea7acfbb071..30ea9a4f4ab4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -901,8 +901,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume } mutex_unlock(uprobes_hash(inode)); - if (uprobe) - put_uprobe(uprobe); + put_uprobe(uprobe); } static struct rb_node * -- cgit v1.2.3 From fe1c06ca7523baa668c1eaf1e1016fa64753c32e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 24 Jan 2013 14:30:22 +0800 Subject: cgroup: initialize cgrp->dentry before css_alloc() With this change, we're guaranteed that cgroup_path() won't see NULL cgrp->dentry, and thus we can remove the NULL check in it. (Well, it's not strictly true, because dummptop.dentry is always NULL but we already handle that separately.) Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8da9048078c5..a04932281bc9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1767,7 +1767,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), "cgroup_path() called without proper locking"); - if (!dentry || cgrp == dummytop) { + if (cgrp == dummytop) { /* * Inactive subsystems have no dentry for their root * cgroup @@ -4153,6 +4153,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, init_cgroup_housekeeping(cgrp); + dentry->d_fsdata = cgrp; + cgrp->dentry = dentry; + cgrp->parent = parent; cgrp->root = parent->root; cgrp->top_cgroup = parent->top_cgroup; @@ -4190,8 +4193,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, lockdep_assert_held(&dentry->d_inode->i_mutex); /* allocation complete, commit to creation */ - dentry->d_fsdata = cgrp; - cgrp->dentry = dentry; list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; -- cgit v1.2.3 From ace783b9bbfa2182b4a561498db3f09a0c56bc79 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 24 Jan 2013 14:30:48 +0800 Subject: sched: split out css_online/css_offline from tg creation/destruction This is a preparaton for later patches. - What do we gain from cpu_cgroup_css_online(): After ss->css_alloc() and before ss->css_online(), there's a small window that tg->css.cgroup is NULL. With this change, tg won't be seen before ss->css_online(), where it's added to the global list, so we're guaranteed we'll never see NULL tg->css.cgroup. - What do we gain from cpu_cgroup_css_offline(): tg is freed via RCU, so is cgroup. Without this change, This is how synchronization works: cgroup_rmdir() no ss->css_offline() diput() syncornize_rcu() ss->css_free() <-- unregister tg, and free it via call_rcu() kfree_rcu(cgroup) <-- wait possible refs to cgroup, and free cgroup We can't just kfree(cgroup), because tg might access tg->css.cgroup. With this change: cgroup_rmdir() ss->css_offline() <-- unregister tg diput() synchronize_rcu() <-- wait possible refs to tg and cgroup ss->css_free() <-- free tg kfree_rcu(cgroup) <-- free cgroup As you see, kfree_rcu() is redundant now. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Acked-by: Ingo Molnar --- kernel/sched/auto_group.c | 3 +++ kernel/sched/core.c | 49 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..64de5f8b0c9e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref) ag->tg->rt_se = NULL; ag->tg->rt_rq = NULL; #endif + sched_offline_group(ag->tg); sched_destroy_group(ag->tg); } @@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void) if (IS_ERR(tg)) goto out_free; + sched_online_group(tg, &root_task_group); + kref_init(&ag->kref); init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb0..106167243d68 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7159,7 +7159,6 @@ static void free_sched_group(struct task_group *tg) struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - unsigned long flags; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -7171,6 +7170,17 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + return tg; + +err: + free_sched_group(tg); + return ERR_PTR(-ENOMEM); +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ + unsigned long flags; + spin_lock_irqsave(&task_group_lock, flags); list_add_rcu(&tg->list, &task_groups); @@ -7180,12 +7190,6 @@ struct task_group *sched_create_group(struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); - - return tg; - -err: - free_sched_group(tg); - return ERR_PTR(-ENOMEM); } /* rcu callback to free various structures associated with a task group */ @@ -7197,6 +7201,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp) /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) +{ + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group_rcu); +} + +void sched_offline_group(struct task_group *tg) { unsigned long flags; int i; @@ -7209,9 +7219,6 @@ void sched_destroy_group(struct task_group *tg) list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); } /* change task's runqueue when it moves between groups. @@ -7563,6 +7570,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) return &tg->css; } +static int cpu_cgroup_css_online(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct task_group *parent; + + if (!cgrp->parent) + return 0; + + parent = cgroup_tg(cgrp->parent); + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_free(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); @@ -7570,6 +7590,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp) sched_destroy_group(tg); } +static void cpu_cgroup_css_offline(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_offline_group(tg); +} + static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { @@ -7925,6 +7952,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, + .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, -- cgit v1.2.3 From 2a73991b76cbd38c4a0c6704449ccc08c89c3ff3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 24 Jan 2013 14:31:11 +0800 Subject: sched: remove redundant NULL cgroup check in task_group_path() A task_group won't be online (thus no one can see it) until cpu_cgroup_css_online(), and at that time tg->css.cgroup has been initialized, so this NULL check is redundant. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/sched/debug.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2cd3c1b4e582..38df0db96608 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - /* - * May be NULL if the underlying cgroup isn't fully-created yet - */ - if (!tg->css.cgroup) { - group_path[0] = '\0'; - return group_path; - } cgroup_path(tg->css.cgroup, group_path, PATH_MAX); return group_path; } -- cgit v1.2.3 From 86a3db5643c7d29bb36ca85c7a4bb67ad4d88d77 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 24 Jan 2013 14:31:27 +0800 Subject: cgroup: remove duplicate RCU free on struct cgroup When destroying a cgroup, though in cgroup_diput() we've called synchronize_rcu(), we then still have to free it via call_rcu(). The story is, long ago to fix a race between reading /proc/sched_debug and freeing cgroup, the code was changed to utilize call_rcu(). See commit a47295e6bc42ad35f9c15ac66f598aa24debd4e2 ("cgroups: make cgroup_path() RCU-safe") As we've fixed cpu cgroup that cpu_cgroup_offline_css() is used to unregister a task_group so there won't be concurrent access to this task_group after synchronize_rcu() in diput(). Now we can just kfree(cgrp). Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a04932281bc9..af993919aa04 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -892,7 +892,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) simple_xattrs_free(&cgrp->xattrs); ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); - kfree_rcu(cgrp, rcu_head); + kfree(cgrp); } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; -- cgit v1.2.3 From be44562613851235d801d41d5b3976dc4333f622 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 24 Jan 2013 14:31:42 +0800 Subject: cgroup: remove synchronize_rcu() from cgroup_diput() Free cgroup via call_rcu(). The actual work is done through workqueue. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 72 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index af993919aa04..02e4f201472e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -852,12 +852,52 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } +static void cgroup_free_fn(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, free_work); + struct cgroup_subsys *ss; + + mutex_lock(&cgroup_mutex); + /* + * Release the subsystem state objects. + */ + for_each_subsys(cgrp->root, ss) + ss->css_free(cgrp); + + cgrp->root->number_of_cgroups--; + mutex_unlock(&cgroup_mutex); + + /* + * Drop the active superblock reference that we took when we + * created the cgroup + */ + deactivate_super(cgrp->root->sb); + + /* + * if we're getting rid of the cgroup, refcount should ensure + * that there are no pidlists left. + */ + BUG_ON(!list_empty(&cgrp->pidlists)); + + simple_xattrs_free(&cgrp->xattrs); + + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + kfree(cgrp); +} + +static void cgroup_free_rcu(struct rcu_head *head) +{ + struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); + + schedule_work(&cgrp->free_work); +} + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ if (S_ISDIR(inode->i_mode)) { struct cgroup *cgrp = dentry->d_fsdata; - struct cgroup_subsys *ss; + BUG_ON(!(cgroup_is_removed(cgrp))); /* It's possible for external users to be holding css * reference counts on a cgroup; css_put() needs to @@ -865,34 +905,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * the reference count in order to know if it needs to * queue the cgroup to be handled by the release * agent */ - synchronize_rcu(); - - mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_subsys(cgrp->root, ss) - ss->css_free(cgrp); - - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); - - /* - * Drop the active superblock reference that we took when we - * created the cgroup - */ - deactivate_super(cgrp->root->sb); - - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. - */ - BUG_ON(!list_empty(&cgrp->pidlists)); - - simple_xattrs_free(&cgrp->xattrs); - - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); - kfree(cgrp); + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; @@ -1391,6 +1404,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); + INIT_WORK(&cgrp->free_work, cgroup_free_fn); mutex_init(&cgrp->pidlist_mutex); INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); -- cgit v1.2.3 From 9ed8a659703876a9fe96ab86d1b296c2f0084242 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 24 Jan 2013 14:32:02 +0800 Subject: cgroup: remove bogus comments in cgroup_diput() Since commit 48ddbe194623ae089cc0576e60363f2d2e85662a ("cgroup: make css->refcnt clearing on cgroup removal optional"), each css holds a ref on cgroup's dentry, so cgroup_diput() won't be called until all css' refs go down to 0, which invalids the comments. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 02e4f201472e..800852282c21 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -899,12 +899,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) struct cgroup *cgrp = dentry->d_fsdata; BUG_ON(!(cgroup_is_removed(cgrp))); - /* It's possible for external users to be holding css - * reference counts on a cgroup; css_put() needs to - * be able to access the cgroup after decrementing - * the reference count in order to know if it needs to - * queue the cgroup to be handled by the release - * agent */ call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } else { struct cfent *cfe = __d_cfe(dentry); -- cgit v1.2.3 From b736f48bda54ec75b7dc9306884c3843f1a78a0a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 18 Nov 2012 21:27:45 -0800 Subject: tracing: Mark tracing_dentry_percpu() static Nothing outside of kernel/trace/trace.c references tracing_dentry_percpu(). Link: http://lkml.kernel.org/r/1353302917-13995-7-git-send-email-josh@joshtriplett.org Signed-off-by: Josh Triplett Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d2a658349ca1..ca9b7dfed8ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4506,7 +4506,7 @@ struct dentry *tracing_init_dentry(void) static struct dentry *d_percpu; -struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(void) { static int once; struct dentry *d_tracer; -- cgit v1.2.3 From 227536740e5cb157fb9fa9b381178c7d34b95d3b Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Fri, 25 Jan 2013 13:41:00 +1030 Subject: MODSIGN: Simplify Makefile with a Kconfig helper Signed-off-by: Michal Marek Acked-by: David Howells Signed-off-by: Rusty Russell --- kernel/Makefile | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 6c072b6da239..eceac38f3c65 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -153,23 +153,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates # fail and that the kernel may be used afterwards. # ############################################################################### -sign_key_with_hash := -ifeq ($(CONFIG_MODULE_SIG_SHA1),y) -sign_key_with_hash := -sha1 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA224),y) -sign_key_with_hash := -sha224 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA256),y) -sign_key_with_hash := -sha256 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA384),y) -sign_key_with_hash := -sha384 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA512),y) -sign_key_with_hash := -sha512 -endif -ifeq ($(sign_key_with_hash),) +ifndef CONFIG_MODULE_SIG_HASH $(error Could not determine digest type to use from kernel config) endif @@ -182,8 +166,8 @@ signing_key.priv signing_key.x509: x509.genkey @echo "### needs to be run as root, and uses a hardware random" @echo "### number generator if one is available." @echo "###" - openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ - -x509 -config x509.genkey \ + openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ + -batch -x509 -config x509.genkey \ -outform DER -out signing_key.x509 \ -keyout signing_key.priv @echo "###" -- cgit v1.2.3 From 57d2aa00dcec67afa52478730f2b524521af14fb Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Tue, 17 Jul 2012 15:03:43 +0800 Subject: sched/rt: Avoid updating RT entry timeout twice within one tick period The issue below was found in 2.6.34-rt rather than mainline rt kernel, but the issue still exists upstream as well. So please let me describe how it was noticed on 2.6.34-rt: On this version, each softirq has its own thread, it means there is at least one RT FIFO task per cpu. The priority of these tasks is set to 49 by default. If user launches an RT FIFO task with priority lower than 49 of softirq RT tasks, it's possible there are two RT FIFO tasks enqueued one cpu runqueue at one moment. By current strategy of balancing RT tasks, when it comes to RT tasks, we really need to put them off to a CPU that they can run on as soon as possible. Even if it means a bit of cache line flushing, we want RT tasks to be run with the least latency. When the user RT FIFO task which just launched before is running, the sched timer tick of the current cpu happens. In this tick period, the timeout value of the user RT task will be updated once. Subsequently, we try to wake up one softirq RT task on its local cpu. As the priority of current user RT task is lower than the softirq RT task, the current task will be preempted by the higher priority softirq RT task. Before preemption, we check to see if current can readily move to a different cpu. If so, we will reschedule to allow the RT push logic to try to move current somewhere else. Whenever the woken softirq RT task runs, it first tries to migrate the user FIFO RT task over to a cpu that is running a task of lesser priority. If migration is done, it will send a reschedule request to the found cpu by IPI interrupt. Once the target cpu responds the IPI interrupt, it will pick the migrated user RT task to preempt its current task. When the user RT task is running on the new cpu, the sched timer tick of the cpu fires. So it will tick the user RT task again. This also means the RT task timeout value will be updated again. As the migration may be done in one tick period, it means the user RT task timeout value will be updated twice within one tick. If we set a limit on the amount of cpu time for the user RT task by setrlimit(RLIMIT_RTTIME), the SIGXCPU signal should be posted upon reaching the soft limit. But exactly when the SIGXCPU signal should be sent depends on the RT task timeout value. In fact the timeout mechanism of sending the SIGXCPU signal assumes the RT task timeout is increased once every tick. However, currently the timeout value may be added twice per tick. So it results in the SIGXCPU signal being sent earlier than expected. To solve this issue, we prevent the timeout value from increasing twice within one tick time by remembering the jiffies value of last updating the timeout. As long as the RT task's jiffies is different with the global jiffies value, we allow its timeout to be updated. Signed-off-by: Ying Xue Signed-off-by: Fan Du Reviewed-by: Yong Zhang Acked-by: Steven Rostedt Cc: Link: http://lkml.kernel.org/r/1342508623-2887-1-git-send-email-ying.xue@windriver.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 29bda5bdf2a5..2f69ca997826 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1988,7 +1988,11 @@ static void watchdog(struct rq *rq, struct task_struct *p) if (soft != RLIM_INFINITY) { unsigned long next; - p->rt.timeout++; + if (p->rt.watchdog_stamp != jiffies) { + p->rt.timeout++; + p->rt.watchdog_stamp = jiffies; + } + next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) p->cputime_expires.sched_exp = p->se.sum_exec_runtime; -- cgit v1.2.3 From a0327ff0eda915be623658babacef706099c11a8 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 25 Jan 2013 10:13:59 +0000 Subject: async: initialise list heads to fix crash 9fdb04cdc55 ("async: replace list of active domains with global list of pending items") added a struct list_head global_list in struct async_entry, which isn't initialised. This means that if !domain->registered at __async_schedule(), then list_del_init() will be called on the list head in async_run_entry_fn with both pointers NULL, causing a crash. This is fixed by initialising both the global_list and domain_list list_heads after kzalloc'ing the entry. This was noticed due to dapm_power_widgets() which uses ASYNC_DOMAIN_EXCLUSIVE, which initialises the domain->registered to 0. Signed-off-by: James Hogan Signed-off-by: Tejun Heo Reported-by: James Hogan Reported-by: Stephen Warren --- kernel/async.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 6958000eeb44..8ddee2c3e5b0 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -172,6 +172,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a ptr(data, newcookie); return newcookie; } + INIT_LIST_HEAD(&entry->domain_list); + INIT_LIST_HEAD(&entry->global_list); INIT_WORK(&entry->work, async_run_entry_fn); entry->func = ptr; entry->data = data; -- cgit v1.2.3 From ed1ac6e91a3ff7c561008ba57747cd6cbc49385e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Jan 2013 13:37:33 +0100 Subject: PM: don't use [delayed_]work_pending() There's no need to test whether a (delayed) work item is pending before queueing, flushing or cancelling it, so remove work_pending() tests used in those cases. Signed-off-by: Tejun Heo Signed-off-by: Rafael J. Wysocki --- kernel/power/autosleep.c | 2 +- kernel/power/qos.c | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index ca304046d9e2..c6422ffeda9a 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend); void queue_up_suspend_work(void) { - if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) + if (autosleep_state > PM_SUSPEND_ON) queue_work(autosleep_wq, &suspend_work); } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9322ff7eaad6..587dddeebf15 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); if (new_value != req->node.prio) pm_qos_update_target( @@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, "%s called for unknown object.", __func__)) return; - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); if (new_value != req->node.prio) pm_qos_update_target( @@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req) return; } - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, &req->node, PM_QOS_REMOVE_REQ, -- cgit v1.2.3 From 43720bd6014327ac454434496cb953edcdb9f8d6 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 11 Jan 2013 13:43:45 +0100 Subject: PM / tracing: remove deprecated power trace API The text in Documentation said it would be removed in 2.6.41; the text in the Kconfig said removal in the 3.1 release. Either way you look at it, we are well past both, so push it off a cliff. Note that the POWER_CSTATE and the POWER_PSTATE are part of the legacy tracing API. Remove all tracepoints which use these flags. As can be seen from context, most already have a trace entry via trace_cpu_idle anyways. Also, the cpufreq/cpufreq.c PSTATE one is actually unpaired, as compared to the CSTATE ones which all have a clear start/stop. As part of this, the trace_power_frequency also becomes orphaned, so it too is deleted. Signed-off-by: Paul Gortmaker Acked-by: Steven Rostedt Signed-off-by: Rafael J. Wysocki --- kernel/trace/Kconfig | 15 --------------- kernel/trace/power-traces.c | 3 --- 2 files changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d89335a485f..ad0a067ad4b3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -78,21 +78,6 @@ config EVENT_TRACING select CONTEXT_SWITCH_TRACER bool -config EVENT_POWER_TRACING_DEPRECATED - depends on EVENT_TRACING - bool "Deprecated power event trace API, to be removed" - default y - help - Provides old power event types: - C-state/idle accounting events: - power:power_start - power:power_end - and old cpufreq accounting event: - power:power_frequency - This is for userspace compatibility - and will vanish after 5 kernel iterations, - namely 3.1. - config CONTEXT_SWITCH_TRACER bool diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index f55fcf61b223..1c71382b283d 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,8 +13,5 @@ #define CREATE_TRACE_POINTS #include -#ifdef EVENT_POWER_TRACING_DEPRECATED -EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); -#endif EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); -- cgit v1.2.3 From 821465295b36136998ef294fe176fba4e09c1cd9 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Mon, 19 Nov 2012 13:21:01 +0800 Subject: tracing: Use __this_cpu_inc/dec operation instead of __get_cpu_var __this_cpu_inc_return() or __this_cpu_dec generates a single instruction, which is faster than __get_cpu_var operation. Link: http://lkml.kernel.org/r/50A9C1BD.1060308@gmail.com Reviewed-by: Christoph Lameter Signed-off-by: Shan Wei Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ca9b7dfed8ef..07888e15c694 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1344,7 +1344,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ preempt_disable_notrace(); - use_stack = ++__get_cpu_var(ftrace_stack_reserve); + use_stack = __this_cpu_inc_return(ftrace_stack_reserve); /* * We don't need any atomic variables, just a barrier. * If an interrupt comes in, we don't care, because it would @@ -1398,7 +1398,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, out: /* Again, don't let gcc optimize things here */ barrier(); - __get_cpu_var(ftrace_stack_reserve)--; + __this_cpu_dec(ftrace_stack_reserve); preempt_enable_notrace(); } -- cgit v1.2.3 From 95a79fd458b85132c25e351d45037ec9643312b2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Jan 2013 18:12:14 +0100 Subject: context_tracking: Export context state for generic vtime Export the context state: whether we run in user / kernel from the context tracking subsystem point of view. This is going to be used by the generic virtual cputime accounting subsystem that is needed to implement the full dynticks. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Ingo Molnar Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/context_tracking.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e0e07fd55508..54f471e536dc 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,24 +1,10 @@ #include #include #include -#include #include -struct context_tracking { - /* - * When active is false, hooks are not set to - * minimize overhead: TIF flags are cleared - * and calls to user_enter/exit are ignored. This - * may be further optimized using static keys. - */ - bool active; - enum { - IN_KERNEL = 0, - IN_USER, - } state; -}; -static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { +DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_FORCE .active = true, #endif -- cgit v1.2.3 From b44f665623d15ff391c7e9fe7600c5e7bc2fc9c2 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 4 Jan 2013 12:59:40 -0500 Subject: rcu: Correct 'optimized' to 'optimize' in header comment Small grammar fix in rcutree comment regarding 'rcu_scheduler_active' var. Signed-off-by: Cody P Schafer Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e0d98157fbea..d78ba606acf1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimized synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_sched() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. -- cgit v1.2.3 From 347f42382184f7448fa96bf9e9fa4eb6f6d1099e Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Mon, 7 Jan 2013 09:36:48 -0800 Subject: rcu: Remove unused code originally used for context tracking As context tracking subsystem evolved, it stopped using ignore_user_qs and in_user defined in the rcu_dynticks structure. This commit therefore removes them. Signed-off-by: Li Zhong Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker --- kernel/rcutree.c | 3 --- kernel/rcutree.h | 4 ---- 2 files changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d78ba606acf1..8a13c8ef9642 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2719,9 +2719,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); -#ifdef CONFIG_RCU_USER_QS - WARN_ON_ONCE(rdp->dynticks->in_user); -#endif rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4b69291b093d..6f21f2eccb16 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -102,10 +102,6 @@ struct rcu_dynticks { /* idle-period nonlazy_posted snapshot. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -#ifdef CONFIG_RCU_USER_QS - bool ignore_user_qs; /* Treat userspace as extended QS or not */ - bool in_user; /* Is the CPU in userland from RCU POV? */ -#endif }; /* RCU's kthread states for tracing. */ -- cgit v1.2.3 From 4eacdf18374e5d7d21a728b46dfec269ac8ef55c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Jan 2013 17:16:37 +0100 Subject: context_tracking: Add comments on interface and internals This subsystem lacks many explanations on its purpose and design. Add these missing comments. v4: Document function parameter to be more kernel-doc friendly, as per Namhyung suggestion. Reported-by: Andrew Morton Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/context_tracking.c | 75 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e0e07fd55508..d566aba7e801 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,3 +1,19 @@ +/* + * Context tracking: Probe on high level context boundaries such as kernel + * and userspace. This includes syscalls and exceptions entry/exit. + * + * This is used by RCU to remove its dependency on the timer tick while a CPU + * runs in userspace. + * + * Started by Frederic Weisbecker: + * + * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker + * + * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton, + * Steven Rostedt, Peter Zijlstra for suggestions and improvements. + * + */ + #include #include #include @@ -6,8 +22,8 @@ struct context_tracking { /* - * When active is false, hooks are not set to - * minimize overhead: TIF flags are cleared + * When active is false, probes are unset in order + * to minimize overhead: TIF flags are cleared * and calls to user_enter/exit are ignored. This * may be further optimized using static keys. */ @@ -24,6 +40,15 @@ static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #endif }; +/** + * user_enter - Inform the context tracking that the CPU is going to + * enter userspace mode. + * + * This function must be called right before we switch from the kernel + * to userspace, when it's guaranteed the remaining kernel instructions + * to execute won't use any RCU read side critical section because this + * function sets RCU in extended quiescent state. + */ void user_enter(void) { unsigned long flags; @@ -39,40 +64,70 @@ void user_enter(void) if (in_interrupt()) return; + /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); local_irq_save(flags); if (__this_cpu_read(context_tracking.active) && __this_cpu_read(context_tracking.state) != IN_USER) { __this_cpu_write(context_tracking.state, IN_USER); + /* + * At this stage, only low level arch entry code remains and + * then we'll run in userspace. We can assume there won't be + * any RCU read-side critical section until the next call to + * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency + * on the tick. + */ rcu_user_enter(); } local_irq_restore(flags); } + +/** + * user_exit - Inform the context tracking that the CPU is + * exiting userspace mode and entering the kernel. + * + * This function must be called after we entered the kernel from userspace + * before any use of RCU read side critical section. This potentially include + * any high level kernel code like syscalls, exceptions, signal handling, etc... + * + * This call supports re-entrancy. This way it can be called from any exception + * handler without needing to know if we came from userspace or not. + */ void user_exit(void) { unsigned long flags; - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ if (in_interrupt()) return; local_irq_save(flags); if (__this_cpu_read(context_tracking.state) == IN_USER) { __this_cpu_write(context_tracking.state, IN_KERNEL); + /* + * We are going to run code that may use RCU. Inform + * RCU core about that (ie: we may need the tick again). + */ rcu_user_exit(); } local_irq_restore(flags); } + +/** + * context_tracking_task_switch - context switch the syscall callbacks + * @prev: the task that is being switched out + * @next: the task that is being switched in + * + * The context tracking uses the syscall slow path to implement its user-kernel + * boundaries probes on syscalls. This way it doesn't impact the syscall fast + * path on CPUs that don't do context tracking. + * + * But we need to clear the flag on the previous task because it may later + * migrate to some CPU that doesn't do the context tracking. As such the TIF + * flag may not be desired there. + */ void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { -- cgit v1.2.3 From c61a2810a2161986353705b44d9503e6bb079f4f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Dec 2012 18:58:39 -0800 Subject: userns: Avoid recursion in put_user_ns When freeing a deeply nested user namespace free_user_ns calls put_user_ns on it's parent which may in turn call free_user_ns again. When -fno-optimize-sibling-calls is passed to gcc one stack frame per user namespace is left on the stack, potentially overflowing the kernel stack. CONFIG_FRAME_POINTER forces -fno-optimize-sibling-calls so we can't count on gcc to optimize this code. Remove struct kref and use a plain atomic_t. Making the code more flexible and easier to comprehend. Make the loop in free_user_ns explict to guarantee that the stack does not overflow with CONFIG_FRAME_POINTER enabled. I have tested this fix with a simple program that uses unshare to create a deeply nested user namespace structure and then calls exit. With 1000 nesteuser namespaces before this change running my test program causes the kernel to die a horrible death. With 10,000,000 nested user namespaces after this change my test program runs to completion and causes no harm. Acked-by: Serge Hallyn Pointed-out-by: Vasily Kulikov Signed-off-by: "Eric W. Biederman" --- kernel/user.c | 4 +--- kernel/user_namespace.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 33acb5e53a5f..57ebfd42023c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -47,9 +47,7 @@ struct user_namespace init_user_ns = { .count = 4294967295U, }, }, - .kref = { - .refcount = ATOMIC_INIT(3), - }, + .count = ATOMIC_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2b042c42fbc4..24f8ec3b64d8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -78,7 +78,7 @@ int create_user_ns(struct cred *new) return ret; } - kref_init(&ns->kref); + atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->owner = owner; @@ -104,15 +104,16 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) return create_user_ns(cred); } -void free_user_ns(struct kref *kref) +void free_user_ns(struct user_namespace *ns) { - struct user_namespace *parent, *ns = - container_of(kref, struct user_namespace, kref); + struct user_namespace *parent; - parent = ns->parent; - proc_free_inum(ns->proc_inum); - kmem_cache_free(user_ns_cachep, ns); - put_user_ns(parent); + do { + parent = ns->parent; + proc_free_inum(ns->proc_inum); + kmem_cache_free(user_ns_cachep, ns); + ns = parent; + } while (atomic_dec_and_test(&parent->count)); } EXPORT_SYMBOL(free_user_ns); -- cgit v1.2.3 From 0bd14b4fd72afd5df41e9fd59f356740f22fceba Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 27 Dec 2012 22:27:29 -0800 Subject: userns: Allow any uid or gid mappings that don't overlap. When I initially wrote the code for /proc//uid_map. I was lazy and avoided duplicate mappings by the simple expedient of ensuring the first number in a new extent was greater than any number in the previous extent. Unfortunately that precludes a number of valid mappings, and someone noticed and complained. So use a simple check to ensure that ranges in the mapping extents don't overlap. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 24f8ec3b64d8..8b650837083e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -520,6 +520,42 @@ struct seq_operations proc_projid_seq_operations = { .show = projid_m_show, }; +static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) +{ + u32 upper_first, lower_first, upper_last, lower_last; + unsigned idx; + + upper_first = extent->first; + lower_first = extent->lower_first; + upper_last = upper_first + extent->count - 1; + lower_last = lower_first + extent->count - 1; + + for (idx = 0; idx < new_map->nr_extents; idx++) { + u32 prev_upper_first, prev_lower_first; + u32 prev_upper_last, prev_lower_last; + struct uid_gid_extent *prev; + + prev = &new_map->extent[idx]; + + prev_upper_first = prev->first; + prev_lower_first = prev->lower_first; + prev_upper_last = prev_upper_first + prev->count - 1; + prev_lower_last = prev_lower_first + prev->count - 1; + + /* Does the upper range intersect a previous extent? */ + if ((prev_upper_first <= upper_last) && + (prev_upper_last >= upper_first)) + return true; + + /* Does the lower range intersect a previous extent? */ + if ((prev_lower_first <= lower_last) && + (prev_lower_last >= lower_first)) + return true; + } + return false; +} + + static DEFINE_MUTEX(id_map_mutex); static ssize_t map_write(struct file *file, const char __user *buf, @@ -532,7 +568,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct user_namespace *ns = seq->private; struct uid_gid_map new_map; unsigned idx; - struct uid_gid_extent *extent, *last = NULL; + struct uid_gid_extent *extent = NULL; unsigned long page = 0; char *kbuf, *pos, *next_line; ssize_t ret = -EINVAL; @@ -635,14 +671,11 @@ static ssize_t map_write(struct file *file, const char __user *buf, if ((extent->lower_first + extent->count) <= extent->lower_first) goto out; - /* For now only accept extents that are strictly in order */ - if (last && - (((last->first + last->count) > extent->first) || - ((last->lower_first + last->count) > extent->lower_first))) + /* Do the ranges in extent overlap any previous extents? */ + if (mappings_overlap(&new_map, extent)) goto out; new_map.nr_extents++; - last = extent; /* Fail if the file contains too many extents */ if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && -- cgit v1.2.3 From 62188451f0d63add7ad0cd2a1ae269d600c1663d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 26 Jan 2013 17:19:42 +0100 Subject: cputime: Avoid multiplication overflow on utime scaling We scale stime, utime values based on rtime (sum_exec_runtime converted to jiffies). During scaling we multiple rtime * utime, which seems to be fine, since both values are converted to u64, but it's not. Let assume HZ is 1000 - 1ms tick. Process consist of 64 threads, run for 1 day, threads utilize 100% cpu on user space. Machine has 64 cpus. Process rtime = utime will be 64 * 24 * 60 * 60 * 1000 jiffies, which is 0x149970000. Multiplication rtime * utime result is 0x1a855771100000000, which can not be covered in 64 bits. Result of overflow is stall of utime values visible in user space (prev_utime in kernel), even if application still consume lot of CPU time. A solution to solve this is to perform the multiplication on stime instead of utime. It's easy to grow the utime value fast with a CPU bound thread in userspace for example. Now we assume that doing so with stime is much harder. In most cases a task shouldn't ever spend much time in kernel space as it tends to sleep waiting for jobs completion when they take long to achieve. IO is the typical example of that. Hence scaling the cputime by performing the multiplication on stime instead of utime should considerably reduce the chances of an overflow on most workloads. This is largely inspired by a patch from Stanislaw Gruszka: http://lkml.kernel.org/r/20130107113144.GA7544@redhat.com Inspired-by: Stanislaw Gruszka Reported-by: Stanislaw Gruszka Acked-by: Stanislaw Gruszka Signed-off-by: Frederic Weisbecker Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Andrew Morton Link: http://lkml.kernel.org/r/1359217182-25184-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 293b202fcf79..825a956ccdb6 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -509,11 +509,11 @@ EXPORT_SYMBOL_GPL(vtime_account); # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) #endif -static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) { u64 temp = (__force u64) rtime; - temp *= (__force u64) utime; + temp *= (__force u64) stime; if (sizeof(cputime_t) == 4) temp = div_u64(temp, (__force u32) total); @@ -531,10 +531,10 @@ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime, total; + cputime_t rtime, stime, total; - utime = curr->utime; - total = utime + curr->stime; + stime = curr->stime; + total = stime + curr->utime; /* * Tick based cputime accounting depend on random scheduling @@ -549,17 +549,17 @@ static void cputime_adjust(struct task_cputime *curr, rtime = nsecs_to_cputime(curr->sum_exec_runtime); if (total) - utime = scale_utime(utime, rtime, total); + stime = scale_stime(stime, rtime, total); else - utime = rtime; + stime = rtime; /* * If the tick based count grows faster than the scheduler one, * the result of the scaling may go backward. * Let's enforce monotonicity. */ - prev->utime = max(prev->utime, utime); - prev->stime = max(prev->stime, rtime - prev->utime); + prev->stime = max(prev->stime, stime); + prev->utime = max(prev->utime, rtime - prev->stime); *ut = prev->utime; *st = prev->stime; -- cgit v1.2.3 From ae8dda5c473bf1a85913942adcaac449e5754bf3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Jan 2013 18:02:04 +0100 Subject: cputime: Move default nsecs_to_cputime() to jiffies based cputime file If the architecture doesn't provide an implementation of nsecs_to_cputime(), the cputime accounting core uses a default one that converts the nanoseconds to jiffies. However this only makes sense if we use the jiffies based cputime. For now it doesn't matter much because this API is only called on code that uses jiffies based cputime accounting. But the code may evolve and this API may be used more broadly in the future. Keeping this default implementation around is very error prone as it may introduce a bug and hide it on architectures that don't override this API. Fix this by moving this definition to the jiffies based cputime headers as it is the only place where it belongs to. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Ingo Molnar Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/sched/cputime.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 293b202fcf79..5849448b981e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -505,10 +505,6 @@ EXPORT_SYMBOL_GPL(vtime_account); #else -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -#endif - static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) { u64 temp = (__force u64) rtime; -- cgit v1.2.3 From abf917cd91cbb73952758f9741e2fa65002a48ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 25 Jul 2012 07:56:04 +0200 Subject: cputime: Generic on-demand virtual cputime accounting If we want to stop the tick further idle, we need to be able to account the cputime without using the tick. Virtual based cputime accounting solves that problem by hooking into kernel/user boundaries. However implementing CONFIG_VIRT_CPU_ACCOUNTING require low level hooks and involves more overhead. But we already have a generic context tracking subsystem that is required for RCU needs by archs which plan to shut down the tick outside idle. This patch implements a generic virtual based cputime accounting that relies on these generic kernel/user hooks. There are some upsides of doing this: - This requires no arch code to implement CONFIG_VIRT_CPU_ACCOUNTING if context tracking is already built (already necessary for RCU in full tickless mode). - We can rely on the generic context tracking subsystem to dynamically (de)activate the hooks, so that we can switch anytime between virtual and tick based accounting. This way we don't have the overhead of the virtual accounting when the tick is running periodically. And one downside: - There is probably more overhead than a native virtual based cputime accounting. But this relies on hooks that are already set anyway. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Ingo Molnar Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/context_tracking.c | 6 +++-- kernel/sched/cputime.c | 61 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 54f471e536dc..9002e92e6372 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -30,8 +30,9 @@ void user_enter(void) local_irq_save(flags); if (__this_cpu_read(context_tracking.active) && __this_cpu_read(context_tracking.state) != IN_USER) { - __this_cpu_write(context_tracking.state, IN_USER); + vtime_user_enter(current); rcu_user_enter(); + __this_cpu_write(context_tracking.state, IN_USER); } local_irq_restore(flags); } @@ -53,8 +54,9 @@ void user_exit(void) local_irq_save(flags); if (__this_cpu_read(context_tracking.state) == IN_USER) { - __this_cpu_write(context_tracking.state, IN_KERNEL); rcu_user_exit(); + vtime_user_exit(current); + __this_cpu_write(context_tracking.state, IN_KERNEL); } local_irq_restore(flags); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5849448b981e..1c964eced92c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "sched.h" @@ -479,7 +480,9 @@ void vtime_task_switch(struct task_struct *prev) else vtime_account_system(prev); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE vtime_account_user(prev); +#endif arch_vtime_task_switch(prev); } #endif @@ -495,10 +498,24 @@ void vtime_task_switch(struct task_struct *prev) #ifndef __ARCH_HAS_VTIME_ACCOUNT void vtime_account(struct task_struct *tsk) { - if (in_interrupt() || !is_idle_task(tsk)) - vtime_account_system(tsk); - else - vtime_account_idle(tsk); + if (!in_interrupt()) { + /* + * If we interrupted user, context_tracking_in_user() + * is 1 because the context tracking don't hook + * on irq entry/exit. This way we know if + * we need to flush user time on kernel entry. + */ + if (context_tracking_in_user()) { + vtime_account_user(tsk); + return; + } + + if (is_idle_task(tsk)) { + vtime_account_idle(tsk); + return; + } + } + vtime_account_system(tsk); } EXPORT_SYMBOL_GPL(vtime_account); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ @@ -583,3 +600,39 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } #endif + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +static DEFINE_PER_CPU(unsigned long long, cputime_snap); + +static cputime_t get_vtime_delta(void) +{ + unsigned long long delta; + + delta = sched_clock() - __this_cpu_read(cputime_snap); + __this_cpu_add(cputime_snap, delta); + + /* CHECKME: always safe to convert nsecs to cputime? */ + return nsecs_to_cputime(delta); +} + +void vtime_account_system(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(); + + account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); +} + +void vtime_account_user(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(); + + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); +} + +void vtime_account_idle(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(); + + account_idle_time(delta_cpu); +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3 From 3f4724ea85b7d9055a9976fa8f30b471bdfbca93 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Jul 2012 18:00:34 +0200 Subject: cputime: Allow dynamic switch between tick/virtual based cputime accounting Allow to dynamically switch between tick and virtual based cputime accounting. This way we can provide a kind of "on-demand" virtual based cputime accounting. In this mode, the kernel relies on the context tracking subsystem to dynamically probe on kernel boundaries. This is in preparation for being able to stop the timer tick in more places than just the idle state. Doing so will depend on CONFIG_VIRT_CPU_ACCOUNTING_GEN which makes it possible to account the cputime without the tick by hooking on kernel/user boundaries. Depending whether the tick is stopped or not, we can switch between tick and vtime based accounting anytime in order to minimize the overhead associated to user hooks. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Ingo Molnar Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/sched/cputime.c | 41 +++++++++++++++++++++++++++++++---------- kernel/time/tick-sched.c | 5 ++++- 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 1c964eced92c..e1939d38bf73 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -317,8 +317,6 @@ out: rcu_read_unlock(); } -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat @@ -383,11 +381,12 @@ static void irqtime_account_idle_ticks(int ticks) irqtime_account_process_tick(current, 0, rq); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static void irqtime_account_idle_ticks(int ticks) {} -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, +static inline void irqtime_account_idle_ticks(int ticks) {} +static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Account a single tick of cpu time. * @p: the process that the cpu time gets accounted to @@ -398,6 +397,9 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); + if (vtime_accounting_enabled()) + return; + if (sched_clock_irqtime) { irqtime_account_process_tick(p, user_tick, rq); return; @@ -439,8 +441,7 @@ void account_idle_ticks(unsigned long ticks) account_idle_time(jiffies_to_cputime(ticks)); } - -#endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ /* * Use precise platform statistics if available: @@ -475,6 +476,9 @@ EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); #ifndef __ARCH_HAS_VTIME_TASK_SWITCH void vtime_task_switch(struct task_struct *prev) { + if (!vtime_accounting_enabled()) + return; + if (is_idle_task(prev)) vtime_account_idle(prev); else @@ -498,6 +502,9 @@ void vtime_task_switch(struct task_struct *prev) #ifndef __ARCH_HAS_VTIME_ACCOUNT void vtime_account(struct task_struct *tsk) { + if (!vtime_accounting_enabled()) + return; + if (!in_interrupt()) { /* * If we interrupted user, context_tracking_in_user() @@ -520,7 +527,7 @@ void vtime_account(struct task_struct *tsk) EXPORT_SYMBOL_GPL(vtime_account); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#else +#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) { @@ -599,7 +606,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime thread_group_cputime(p, &cputime); cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } -#endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static DEFINE_PER_CPU(unsigned long long, cputime_snap); @@ -617,14 +624,23 @@ static cputime_t get_vtime_delta(void) void vtime_account_system(struct task_struct *tsk) { - cputime_t delta_cpu = get_vtime_delta(); + cputime_t delta_cpu; + + if (!vtime_accounting_enabled()) + return; + delta_cpu = get_vtime_delta(); account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); } void vtime_account_user(struct task_struct *tsk) { - cputime_t delta_cpu = get_vtime_delta(); + cputime_t delta_cpu; + + if (!vtime_accounting_enabled()) + return; + + delta_cpu = get_vtime_delta(); account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); } @@ -635,4 +651,9 @@ void vtime_account_idle(struct task_struct *tsk) account_idle_time(delta_cpu); } + +bool vtime_accounting_enabled(void) +{ + return context_tracking_active(); +} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d58e552d9fd1..46dfb6d94b1c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -631,8 +631,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; + + if (vtime_accounting_enabled()) + return; /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick -- cgit v1.2.3 From 6fac4829ce0ef9b7f24369086ce5f0e9f38d37bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 13 Nov 2012 14:20:55 +0100 Subject: cputime: Use accessors to read task cputime stats This is in preparation for the full dynticks feature. While remotely reading the cputime of a task running in a full dynticks CPU, we'll need to do some extra-computation. This way we can account the time it spent tickless in userspace since its last cputime snapshot. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Ingo Molnar Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/acct.c | 6 ++++-- kernel/cpu.c | 4 +++- kernel/delayacct.c | 7 +++++-- kernel/exit.c | 10 ++++++---- kernel/posix-cpu-timers.c | 28 ++++++++++++++++++++++------ kernel/sched/cputime.c | 13 +++++++------ kernel/signal.c | 12 ++++++++---- kernel/tsacct.c | 44 ++++++++++++++++++++++++++++++++++---------- 8 files changed, 89 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 051e071a06e7..e8b1627ab9c7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -566,6 +566,7 @@ out: void acct_collect(long exitcode, int group_dead) { struct pacct_struct *pacct = ¤t->signal->pacct; + cputime_t utime, stime; unsigned long vsize = 0; if (group_dead && current->mm) { @@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - pacct->ac_utime += current->utime; - pacct->ac_stime += current->stime; + task_cputime(current, &utime, &stime); + pacct->ac_utime += utime; + pacct->ac_stime += stime; pacct->ac_minflt += current->min_flt; pacct->ac_majflt += current->maj_flt; spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/cpu.c b/kernel/cpu.c index 3046a503242c..e5d5e8e1e030 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu) static inline void check_for_tasks(int cpu) { struct task_struct *p; + cputime_t utime, stime; write_lock_irq(&tasklist_lock); for_each_process(p) { + task_cputime(p, &utime, &stime); if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (p->utime || p->stime)) + (utime || stime)) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " "(state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 418b3f7053aa..d473988c1d0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) unsigned long long t2, t3; unsigned long flags; struct timespec ts; + cputime_t utime, stime, stimescaled, utimescaled; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) goto done; tmp = (s64)d->cpu_run_real_total; - cputime_to_timespec(tsk->utime + tsk->stime, &ts); + task_cputime(tsk, &utime, &stime); + cputime_to_timespec(utime + stime, &ts); tmp += timespec_to_ns(&ts); d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; tmp = (s64)d->cpu_scaled_run_real_total; - cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); + task_cputime_scaled(tsk, &utimescaled, &stimescaled); + cputime_to_timespec(utimescaled + stimescaled, &ts); tmp += timespec_to_ns(&ts); d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; diff --git a/kernel/exit.c b/kernel/exit.c index b4df21937216..7dd20408707c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk) bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *uninitialized_var(tty); + cputime_t utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); @@ -123,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime += tsk->utime; - sig->stime += tsk->stime; - sig->gtime += tsk->gtime; + task_cputime(tsk, &utime, &stime); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -1092,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) sig = p->signal; psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; - psig->cgtime += p->gtime + sig->gtime + sig->cgtime; + psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index a278cad1d5d6..165d47698477 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -155,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer, static inline cputime_t prof_ticks(struct task_struct *p) { - return p->utime + p->stime; + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + + return utime + stime; } static inline cputime_t virt_ticks(struct task_struct *p) { - return p->utime; + cputime_t utime; + + task_cputime(p, &utime, NULL); + + return utime; } static int @@ -471,18 +479,23 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { + cputime_t utime, stime; + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); + task_cputime(tsk, &utime, &stime); cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); + utime, stime, tsk->se.sum_exec_runtime); } void posix_cpu_timers_exit_group(struct task_struct *tsk) { struct signal_struct *const sig = tsk->signal; + cputime_t utime, stime; + task_cputime(tsk, &utime, &stime); cleanup_timers(tsk->signal->cpu_timers, - tsk->utime + sig->utime, tsk->stime + sig->stime, + utime + sig->utime, stime + sig->stime, tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } @@ -1226,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample, static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; + cputime_t utime, stime; + + task_cputime(tsk, &utime, &stime); if (!task_cputime_zero(&tsk->cputime_expires)) { struct task_cputime task_sample = { - .utime = tsk->utime, - .stime = tsk->stime, + .utime = utime, + .stime = stime, .sum_exec_runtime = tsk->se.sum_exec_runtime }; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e1939d38bf73..c533deaf06d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -164,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, task_group_account_field(p, index, (__force u64) cputime); /* Account for user time used */ - acct_update_integrals(p); + acct_account_cputime(p); } /* @@ -214,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, task_group_account_field(p, index, (__force u64) cputime); /* Account for system time used */ - acct_update_integrals(p); + acct_account_cputime(p); } /* @@ -296,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; + cputime_t utime, stime; struct task_struct *t; times->utime = sig->utime; @@ -309,8 +310,9 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - times->utime += t->utime; - times->stime += t->stime; + task_cputime(tsk, &utime, &stime); + times->utime += utime; + times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: @@ -588,11 +590,10 @@ static void cputime_adjust(struct task_cputime *curr, void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime = { - .utime = p->utime, - .stime = p->stime, .sum_exec_runtime = p->se.sum_exec_runtime, }; + task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } diff --git a/kernel/signal.c b/kernel/signal.c index 372771e948c2..776a45a3661b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1638,6 +1638,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) unsigned long flags; struct sighand_struct *psig; bool autoreap = false; + cputime_t utime, stime; BUG_ON(sig == -1); @@ -1675,8 +1676,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) task_uid(tsk)); rcu_read_unlock(); - info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); - info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); + task_cputime(tsk, &utime, &stime); + info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime); + info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) @@ -1740,6 +1742,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; + cputime_t utime, stime; if (for_ptracer) { parent = tsk->parent; @@ -1758,8 +1761,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); - info.si_utime = cputime_to_clock_t(tsk->utime); - info.si_stime = cputime_to_clock_t(tsk->stime); + task_cputime(tsk, &utime, &stime); + info.si_utime = cputime_to_clock_t(utime); + info.si_stime = cputime_to_clock_t(stime); info.si_code = why; switch (why) { diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 625df0b44690..a1dd9a1b1327 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, { const struct cred *tcred; struct timespec uptime, ts; + cputime_t utime, stime, utimescaled, stimescaled; u64 ac_etime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); @@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_ppid = pid_alive(tsk) ? task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); - stats->ac_utime = cputime_to_usecs(tsk->utime); - stats->ac_stime = cputime_to_usecs(tsk->stime); - stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); - stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); + + task_cputime(tsk, &utime, &stime); + stats->ac_utime = cputime_to_usecs(utime); + stats->ac_stime = cputime_to_usecs(stime); + + task_cputime_scaled(tsk, &utimescaled, &stimescaled); + stats->ac_utimescaled = cputime_to_usecs(utimescaled); + stats->ac_stimescaled = cputime_to_usecs(stimescaled); + stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; @@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) #undef KB #undef MB -/** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting - */ -void acct_update_integrals(struct task_struct *tsk) +static void __acct_update_integrals(struct task_struct *tsk, + cputime_t utime, cputime_t stime) { if (likely(tsk->mm)) { cputime_t time, dtime; @@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk) u64 delta; local_irq_save(flags); - time = tsk->stime + tsk->utime; + time = stime + utime; dtime = time - tsk->acct_timexpd; jiffies_to_timeval(cputime_to_jiffies(dtime), &value); delta = value.tv_sec; @@ -144,6 +147,27 @@ void acct_update_integrals(struct task_struct *tsk) } } +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting + */ +void acct_update_integrals(struct task_struct *tsk) +{ + cputime_t utime, stime; + + task_cputime(tsk, &utime, &stime); + __acct_update_integrals(tsk, utime, stime); +} + +/** + * acct_account_cputime - update mm integral after cputime update + * @tsk: task_struct for accounting + */ +void acct_account_cputime(struct task_struct *tsk) +{ + __acct_update_integrals(tsk, tsk->utime, tsk->stime); +} + /** * acct_clear_integrals - clear the mm integral fields in task_struct * @tsk: task_struct whose accounting fields are cleared -- cgit v1.2.3 From c11f11fcbdb5be790c565aed46411486a7586afc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 21 Jan 2013 00:50:22 +0100 Subject: kvm: Prepare to add generic guest entry/exit callbacks Do some ground preparatory work before adding guest_enter() and guest_exit() context tracking callbacks. Those will be later used to read the guest cputime safely when we run in full dynticks mode. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Gleb Natapov Cc: Ingo Molnar Cc: Li Zhong Cc: Marcelo Tosatti Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/sched/cputime.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index c533deaf06d5..a44ecdf809a1 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -465,16 +465,6 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime *st = cputime.stime; } -void vtime_account_system_irqsafe(struct task_struct *tsk) -{ - unsigned long flags; - - local_irq_save(flags); - vtime_account_system(tsk); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); - #ifndef __ARCH_HAS_VTIME_TASK_SWITCH void vtime_task_switch(struct task_struct *prev) { -- cgit v1.2.3 From 6a61671bb2f3a1bd12cd17b8fca811a624782632 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Dec 2012 20:00:34 +0100 Subject: cputime: Safely read cputime of full dynticks CPUs While remotely reading the cputime of a task running in a full dynticks CPU, the values stored in utime/stime fields of struct task_struct may be stale. Its values may be those of the last kernel <-> user transition time snapshot and we need to add the tickless time spent since this snapshot. To fix this, flush the cputime of the dynticks CPUs on kernel <-> user transition and record the time / context where we did this. Then on top of this snapshot and the current time, perform the fixup on the reader side from task_times() accessors. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Ingo Molnar Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner [fixed kvm module related build errors] Signed-off-by: Sedat Dilek --- kernel/context_tracking.c | 21 ++++- kernel/fork.c | 6 ++ kernel/sched/core.c | 1 + kernel/sched/cputime.c | 193 +++++++++++++++++++++++++++++++++++++++++++--- kernel/softirq.c | 6 +- 5 files changed, 211 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 9002e92e6372..74f68f4dc6c2 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,8 +1,9 @@ #include +#include #include #include #include - +#include DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_FORCE @@ -61,6 +62,24 @@ void user_exit(void) local_irq_restore(flags); } +void guest_enter(void) +{ + if (vtime_accounting_enabled()) + vtime_guest_enter(current); + else + __guest_enter(); +} +EXPORT_SYMBOL_GPL(guest_enter); + +void guest_exit(void) +{ + if (vtime_accounting_enabled()) + vtime_guest_exit(current); + else + __guest_exit(); +} +EXPORT_SYMBOL_GPL(guest_exit); + void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { diff --git a/kernel/fork.c b/kernel/fork.c index 65ca6d27f24e..e68a95b4cf26 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1233,6 +1233,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + seqlock_init(&p->vtime_seqlock); + p->vtime_snap = 0; + p->vtime_snap_whence = VTIME_SLEEPING; +#endif + #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb0..261022d7e79d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4666,6 +4666,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); + vtime_init_idle(idle); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a44ecdf809a1..082e05d915b4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -492,7 +492,7 @@ void vtime_task_switch(struct task_struct *prev) * vtime_account(). */ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk) { if (!vtime_accounting_enabled()) return; @@ -516,7 +516,7 @@ void vtime_account(struct task_struct *tsk) } vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ @@ -600,28 +600,55 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static DEFINE_PER_CPU(unsigned long long, cputime_snap); +static unsigned long long vtime_delta(struct task_struct *tsk) +{ + unsigned long long clock; + + clock = sched_clock(); + if (clock < tsk->vtime_snap) + return 0; -static cputime_t get_vtime_delta(void) + return clock - tsk->vtime_snap; +} + +static cputime_t get_vtime_delta(struct task_struct *tsk) { - unsigned long long delta; + unsigned long long delta = vtime_delta(tsk); - delta = sched_clock() - __this_cpu_read(cputime_snap); - __this_cpu_add(cputime_snap, delta); + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); + tsk->vtime_snap += delta; /* CHECKME: always safe to convert nsecs to cputime? */ return nsecs_to_cputime(delta); } +static void __vtime_account_system(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(tsk); + + account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); +} + void vtime_account_system(struct task_struct *tsk) { - cputime_t delta_cpu; + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} +void vtime_account_irq_exit(struct task_struct *tsk) +{ if (!vtime_accounting_enabled()) return; - delta_cpu = get_vtime_delta(); - account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); + write_seqlock(&tsk->vtime_seqlock); + if (context_tracking_in_user()) + tsk->vtime_snap_whence = VTIME_USER; + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); } void vtime_account_user(struct task_struct *tsk) @@ -631,14 +658,44 @@ void vtime_account_user(struct task_struct *tsk) if (!vtime_accounting_enabled()) return; - delta_cpu = get_vtime_delta(); + delta_cpu = get_vtime_delta(tsk); + write_seqlock(&tsk->vtime_seqlock); + tsk->vtime_snap_whence = VTIME_SYS; account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_user_enter(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + tsk->vtime_snap_whence = VTIME_USER; + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_enter(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags |= PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_exit(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags &= ~PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); } void vtime_account_idle(struct task_struct *tsk) { - cputime_t delta_cpu = get_vtime_delta(); + cputime_t delta_cpu = get_vtime_delta(tsk); account_idle_time(delta_cpu); } @@ -647,4 +704,116 @@ bool vtime_accounting_enabled(void) { return context_tracking_active(); } + +void arch_vtime_task_switch(struct task_struct *prev) +{ + write_seqlock(&prev->vtime_seqlock); + prev->vtime_snap_whence = VTIME_SLEEPING; + write_sequnlock(&prev->vtime_seqlock); + + write_seqlock(¤t->vtime_seqlock); + current->vtime_snap_whence = VTIME_SYS; + current->vtime_snap = sched_clock(); + write_sequnlock(¤t->vtime_seqlock); +} + +void vtime_init_idle(struct task_struct *t) +{ + unsigned long flags; + + write_seqlock_irqsave(&t->vtime_seqlock, flags); + t->vtime_snap_whence = VTIME_SYS; + t->vtime_snap = sched_clock(); + write_sequnlock_irqrestore(&t->vtime_seqlock, flags); +} + +cputime_t task_gtime(struct task_struct *t) +{ + unsigned long flags; + unsigned int seq; + cputime_t gtime; + + do { + seq = read_seqbegin_irqsave(&t->vtime_seqlock, flags); + + gtime = t->gtime; + if (t->flags & PF_VCPU) + gtime += vtime_delta(t); + + } while (read_seqretry_irqrestore(&t->vtime_seqlock, seq, flags)); + + return gtime; +} + +/* + * Fetch cputime raw values from fields of task_struct and + * add up the pending nohz execution time since the last + * cputime snapshot. + */ +static void +fetch_task_cputime(struct task_struct *t, + cputime_t *u_dst, cputime_t *s_dst, + cputime_t *u_src, cputime_t *s_src, + cputime_t *udelta, cputime_t *sdelta) +{ + unsigned long flags; + unsigned int seq; + unsigned long long delta; + + do { + *udelta = 0; + *sdelta = 0; + + seq = read_seqbegin_irqsave(&t->vtime_seqlock, flags); + + if (u_dst) + *u_dst = *u_src; + if (s_dst) + *s_dst = *s_src; + + /* Task is sleeping, nothing to add */ + if (t->vtime_snap_whence == VTIME_SLEEPING || + is_idle_task(t)) + continue; + + delta = vtime_delta(t); + + /* + * Task runs either in user or kernel space, add pending nohz time to + * the right place. + */ + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { + *udelta = delta; + } else { + if (t->vtime_snap_whence == VTIME_SYS) + *sdelta = delta; + } + } while (read_seqretry_irqrestore(&t->vtime_seqlock, seq, flags)); +} + + +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utime, stime, &t->utime, + &t->stime, &udelta, &sdelta); + if (utime) + *utime += udelta; + if (stime) + *stime += sdelta; +} + +void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, cputime_t *stimescaled) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utimescaled, stimescaled, + &t->utimescaled, &t->stimescaled, &udelta, &sdelta); + if (utimescaled) + *utimescaled += cputime_to_scaled(udelta); + if (stimescaled) + *stimescaled += cputime_to_scaled(sdelta); +} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/softirq.c b/kernel/softirq.c index ed567babe789..f5cc25f147a6 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - vtime_account_irq_enter(current); + account_irq_enter_time(current); __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); @@ -272,7 +272,7 @@ restart: lockdep_softirq_exit(); - vtime_account_irq_exit(current); + account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } @@ -341,7 +341,7 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { - vtime_account_irq_exit(current); + account_irq_exit_time(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) -- cgit v1.2.3 From 6bfc09e2327dfbffc312004c16188dbf8dfb0297 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 19 Oct 2012 12:49:17 -0700 Subject: rcu: Provide RCU CPU stall warnings for tiny RCU Tiny RCU has historically omitted RCU CPU stall warnings in order to reduce memory requirements, however, lack of these warnings caused Thomas Gleixner some debugging pain recently. Therefore, this commit adds RCU CPU stall warnings to tiny RCU if RCU_TRACE=y. This keeps the memory footprint small, while still enabling CPU stall warnings in kernels built to enable them. Updated to include Josh Triplett's suggested use of RCU_STALL_COMMON config variable to simplify #if expressions. Reported-by: Thomas Gleixner Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu.h | 7 +++++++ kernel/rcupdate.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutiny.c | 6 ++++-- kernel/rcutiny_plugin.h | 56 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutree.c | 46 +++------------------------------------- kernel/rcutree.h | 5 ----- 6 files changed, 121 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu.h b/kernel/rcu.h index 20dfba576c2b..7f8e7590e3e5 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) extern int rcu_expedited; +#ifdef CONFIG_RCU_STALL_COMMON + +extern int rcu_cpu_stall_suppress; +int rcu_jiffies_till_stall_check(void); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a2cf76177b44..076730d95acc 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -412,3 +412,54 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #else #define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) #endif + +#ifdef CONFIG_RCU_STALL_COMMON + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; + +module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644); + +int rcu_jiffies_till_stall_check(void) +{ + int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. + */ + if (till_stall_check < 3) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; + till_stall_check = 3; + } else if (till_stall_check > 300) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} + +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); + return 0; +} +early_initcall(check_cpu_stall_init); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e7dce58f9c2a..b899df317edc 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); -#include "rcutiny_plugin.h" - static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; +#include "rcutiny_plugin.h" + /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ static void rcu_idle_enter_common(long long newval) { @@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void) */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { + reset_cpu_stall_ticks(rcp); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; @@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu) */ void rcu_check_callbacks(int cpu, int user) { + check_cpu_stalls(); if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f85016a2309b..8a233002faeb 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -33,6 +33,9 @@ struct rcu_ctrlblk { struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ struct rcu_head **curtail; /* ->next pointer of last CB. */ RCU_TRACE(long qlen); /* Number of pending CBs. */ + RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ + RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ + RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ RCU_TRACE(char *name); /* Name of RCU type. */ }; @@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#ifdef CONFIG_RCU_TRACE + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long j; + unsigned long js; + + if (rcu_cpu_stall_suppress) + return; + rcp->ticks_this_gp++; + j = jiffies; + js = rcp->jiffies_stall; + if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", + rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + jiffies - rcp->gp_start, rcp->qlen); + dump_stack(); + } + if (*rcp->curtail && ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + + 3 * rcu_jiffies_till_stall_check() + 3; + else if (ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void check_cpu_stall_preempt(void); + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ +#ifdef CONFIG_RCU_TRACE + rcp->ticks_this_gp = 0; + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +#endif /* #ifdef CONFIG_RCU_TRACE */ +} + +static void check_cpu_stalls(void) +{ + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); + RCU_TRACE(check_cpu_stall_preempt()); +} + #ifdef CONFIG_TINY_PREEMPT_RCU #include @@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void) /* Official start of GP. */ rcu_preempt_ctrlblk.gpnum++; RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); + reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); /* Any blocked RCU readers block new GP. */ if (rcu_preempt_blocked_readers_any()) @@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); MODULE_LICENSE("GPL"); +static void check_cpu_stall_preempt(void) +{ +#ifdef CONFIG_TINY_PREEMPT_RCU + check_cpu_stall(&rcu_preempt_ctrlblk.rcb); +#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ +} + #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e441b77b614e..d069430f0974 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -217,12 +217,6 @@ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; - -module_param(rcu_cpu_stall_suppress, int, 0644); -module_param(rcu_cpu_stall_timeout, int, 0644); - static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; @@ -793,28 +787,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; } -static int jiffies_till_stall_check(void) -{ - int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); - - /* - * Limit check must be consistent with the Kconfig limits - * for CONFIG_RCU_CPU_STALL_TIMEOUT. - */ - if (till_stall_check < 3) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; - till_stall_check = 3; - } else if (till_stall_check > 300) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; - till_stall_check = 300; - } - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} - static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); + rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); } /* @@ -857,7 +833,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; + rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -935,7 +911,7 @@ static void print_cpu_stall(struct rcu_state *rsp) raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) rsp->jiffies_stall = jiffies + - 3 * jiffies_till_stall_check() + 3; + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); set_need_resched(); /* kick ourselves to get things going. */ @@ -966,12 +942,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) } } -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - /** * rcu_cpu_stall_reset - prevent further stall warnings in current grace period * @@ -989,15 +959,6 @@ void rcu_cpu_stall_reset(void) rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static void __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); -} - /* * Update CPU-local rcu_data state to record the newly noticed grace period. * This is used both when we started the grace period and when we notice @@ -3074,7 +3035,6 @@ void __init rcu_init(void) cpu_notifier(rcu_cpu_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - check_cpu_stall_init(); } #include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4b69291b093d..db9bec83fe3f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -343,11 +343,6 @@ struct rcu_data { #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA 0 -#endif #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ -- cgit v1.2.3 From 0e11c8e8a60f8591556d142c2e1e53eaf86ab528 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Jan 2013 16:21:07 -0800 Subject: rcu: Make rcutorture's shuffler task shuffle recently added tasks A number of kthreads have been added to rcutorture, but the shuffler task was not informed of them, and thus did not shuffle them. This commit therefore adds the requisite shuffling, and, while in the area fixes up some whitespace issues. However, the shuffling is intended to keep randomly selected CPUs idle, which means that the RCU priority boosting kthreads need to avoid waking up every jiffy. This commit also makes that fix. Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a583f1ce713d..3ebc8bfb5525 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -846,7 +846,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. */ oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { - schedule_timeout_uninterruptible(1); + schedule_timeout_interruptible(oldstarttime - jiffies); rcu_stutter_wait("rcu_torture_boost"); if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) @@ -1318,19 +1318,35 @@ static void rcu_torture_shuffle_tasks(void) set_cpus_allowed_ptr(reader_tasks[i], shuffle_tmp_mask); } - if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed_ptr(fakewriter_tasks[i], shuffle_tmp_mask); } - if (writer_task) set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); + if (stutter_task) + set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); + if (fqs_task) + set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); + if (shutdown_task) + set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task) + set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + if (stall_task) + set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); + if (barrier_cbs_tasks) + for (i = 0; i < n_barrier_cbs; i++) + if (barrier_cbs_tasks[i]) + set_cpus_allowed_ptr(barrier_cbs_tasks[i], + shuffle_tmp_mask); + if (barrier_task) + set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; -- cgit v1.2.3 From 7b270f609982f68f2433442bf167f735e7364b06 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Jan 2013 13:09:13 +0530 Subject: sched: Bail out of yield_to when source and target runqueue has one task In case of undercomitted scenarios, especially in large guests yield_to overhead is significantly high. when run queue length of source and target is one, take an opportunity to bail out and return -ESRCH. This return condition can be further exploited to quickly come out of PLE handler. (History: Raghavendra initially worked on break out of kvm ple handler upon seeing source runqueue length = 1, but it had to export rq length). Peter came up with the elegant idea of return -ESRCH in scheduler core. Signed-off-by: Peter Zijlstra Raghavendra, Checking the rq length of target vcpu condition added.(thanks Avi) Reviewed-by: Srikar Dronamraju Signed-off-by: Raghavendra K T Acked-by: Andrew Jones Tested-by: Chegu Vinod Signed-off-by: Gleb Natapov --- kernel/sched/core.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0533496b6228..01edad9b5d71 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4316,7 +4316,10 @@ EXPORT_SYMBOL(yield); * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. * - * Returns true if we indeed boosted the target task. + * Returns: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. */ bool __sched yield_to(struct task_struct *p, bool preempt) { @@ -4330,6 +4333,15 @@ bool __sched yield_to(struct task_struct *p, bool preempt) again: p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) { + yielded = -ESRCH; + goto out_irq; + } + double_rq_lock(rq, p_rq); while (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); @@ -4337,13 +4349,13 @@ again: } if (!curr->sched_class->yield_to_task) - goto out; + goto out_unlock; if (curr->sched_class != p->sched_class) - goto out; + goto out_unlock; if (task_running(p_rq, p) || p->state) - goto out; + goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { @@ -4356,11 +4368,12 @@ again: resched_task(p_rq->curr); } -out: +out_unlock: double_rq_unlock(rq, p_rq); +out_irq: local_irq_restore(flags); - if (yielded) + if (yielded > 0) schedule(); return yielded; -- cgit v1.2.3 From 38dbe0b137bfe6ea92be495017885c0785179a02 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Fri, 25 Jan 2013 18:03:07 +0800 Subject: tracing: Remove second iterator initializer The trace iterator is already initialized by trace_init_global_iter(), so there is no need to initialize it again. Link: http://lkml.kernel.org/r/CACV3sb+G1YnO6168JhY3dEadmJi58pA5-2cSZT8E0WVHJNFt9Q@mail.gmail.com Signed-off-by: Jovi Zhang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 07888e15c694..d399592701aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5030,6 +5030,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (disable_tracing) ftrace_kill(); + /* Simulate the iterator */ trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { @@ -5041,10 +5042,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - /* Simulate the iterator */ - iter.tr = &global_trace; - iter.trace = current_trace; - switch (oops_dump_mode) { case DUMP_ALL: iter.cpu_file = TRACE_PIPE_ALL_CPU; -- cgit v1.2.3 From 03274a3ffb449632970fdd35da72ea41cf8474da Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Jan 2013 17:30:31 -0500 Subject: tracing/fgraph: Adjust fgraph depth before calling trace return callback While debugging the virtual cputime with the function graph tracer with a max_depth of 1 (most common use of the max_depth so far), I found that I was missing kernel execution because of a race condition. The code for the return side of the function has a slight race: ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); barrier(); current->curr_ret_stack--; The ftrace_pop_return_trace() initializes the trace structure for the callback. The ftrace_graph_return() uses the trace structure for its own use as that structure is on the stack and is local to this function. Then the curr_ret_stack is decremented which is what the trace.depth is set to. If an interrupt comes in after the ftrace_graph_return() but before the curr_ret_stack, then the called function will get a depth of 2. If max_depth is set to 1 this function will be ignored. The problem is that the trace has already been called, and the timestamp for that trace will not reflect the time the function was about to re-enter userspace. Calls to the interrupt will not be traced because the max_depth has prevented this. To solve this issue, the ftrace_graph_return() can safely be moved after the current->curr_ret_stack has been updated. This way the timestamp for the return callback will reflect the actual time. If an interrupt comes in after the curr_ret_stack update and ftrace_graph_return(), it will be traced. It may look a little confusing to see it within the other function, but at least it will not be lost. Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7008d2e13cf2..39ada66389cc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -191,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); - ftrace_graph_return(&trace); barrier(); current->curr_ret_stack--; + /* + * The trace should run after decrementing the ret counter + * in case an interrupt were to come in. We don't want to + * lose the interrupt if max_depth is set. + */ + ftrace_graph_return(&trace); + if (unlikely(!ret)) { ftrace_graph_stop(); WARN_ON(1); -- cgit v1.2.3 From 6f16eebe1ff82176339a0439c98ebec9768b0ee2 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 25 Jan 2013 17:08:12 -0800 Subject: timekeeping: Switch HAS_PERSISTENT_CLOCK to ALWAYS_USE_PERSISTENT_CLOCK Jason pointed out the HAS_PERSISTENT_CLOCK name isn't quite accurate for the config, as some systems may have the persistent_clock in some cases, but not always. So change the config name to the more clear ALWAYS_USE_PERSISTENT_CLOCK. Signed-off-by: John Stultz --- kernel/time/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f7e45b9b142b..0dddb9d09d0b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -13,7 +13,7 @@ config ARCH_CLOCKSOURCE_DATA bool # Platforms has a persistent clock -config HAS_PERSISTENT_CLOCK +config ALWAYS_USE_PERSISTENT_CLOCK bool default n -- cgit v1.2.3 From 0212f9159694be61c6bc52e925fa76643e0c1abf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:11 -0800 Subject: x86: Add Crash kernel low reservation During kdump kernel's booting stage, it need to find low ram for swiotlb buffer when system does not support intel iommu/dmar remapping. kexed-tools is appending memmap=exactmap and range from /proc/iomem with "Crash kernel", and that range is above 4G for 64bit after boot protocol 2.12. We need to add another range in /proc/iomem like "Crash kernel low", so kexec-tools could find that info and append to kdump kernel command line. Try to reserve some under 4G if the normal "Crash kernel" is above 4G. User could specify the size with crashkernel_low=XX[KMG]. -v2: fix warning that is found by Fengguang's test robot. -v3: move out get_mem_size change to another patch, to solve compiling warning that is found by Borislav Petkov -v4: user must specify crashkernel_low if system does not support intel or amd iommu. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-31-git-send-email-yinghai@kernel.org Cc: Eric Biederman Cc: Rob Landley Signed-off-by: H. Peter Anvin --- kernel/kexec.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 5e4bd7864c5d..2436ffcec91f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -54,6 +54,12 @@ struct resource crashk_res = { .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; +struct resource crashk_low_res = { + .name = "Crash kernel low", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; int kexec_should_crash(struct task_struct *p) { @@ -1369,10 +1375,11 @@ static int __init parse_crashkernel_simple(char *cmdline, * That function is the entry point for command line parsing and should be * called from the arch-specific code. */ -int __init parse_crashkernel(char *cmdline, +static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, - unsigned long long *crash_base) + unsigned long long *crash_base, + const char *name) { char *p = cmdline, *ck_cmdline = NULL; char *first_colon, *first_space; @@ -1382,16 +1389,16 @@ int __init parse_crashkernel(char *cmdline, *crash_base = 0; /* find crashkernel and use the last one if there are more */ - p = strstr(p, "crashkernel="); + p = strstr(p, name); while (p) { ck_cmdline = p; - p = strstr(p+1, "crashkernel="); + p = strstr(p+1, name); } if (!ck_cmdline) return -EINVAL; - ck_cmdline += 12; /* strlen("crashkernel=") */ + ck_cmdline += strlen(name); /* * if the commandline contains a ':', then that's the extended @@ -1409,6 +1416,23 @@ int __init parse_crashkernel(char *cmdline, return 0; } +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel="); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel_low="); +} static void update_vmcoreinfo_note(void) { -- cgit v1.2.3 From ad964704ba9326d027fc10fd0099b7c880e50172 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Jan 2013 17:45:49 -0500 Subject: ring-buffer: Add stats field for amount read from trace ring buffer Add a stat about the number of events read from the ring buffer: # cat /debug/tracing/per_cpu/cpu0/stats entries: 39869 overrun: 870512 commit overrun: 0 bytes: 1449912 oldest event ts: 6561.368690 now ts: 6565.246426 dropped events: 0 read events: 112 <-- Added Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 18 ++++++++++++++++++ kernel/trace/trace.c | 3 +++ 2 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 13950d9027cb..7244acde77b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3102,6 +3102,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); +/** + * ring_buffer_read_events_cpu - get the number of events successfully read + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of events read + */ +unsigned long +ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->read; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); + /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d399592701aa..90a1c71fdbfc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4430,6 +4430,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); + cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); + trace_seq_printf(s, "read events: %ld\n", cnt); + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); -- cgit v1.2.3 From 5e67b51e3fb22ad43faf9589e9019ad9c6a00413 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 27 Dec 2012 11:49:45 +0900 Subject: tracing: Use sched_clock_cpu for trace_clock_global For systems with an unstable sched_clock, all cpu_clock() does is enable/ disable local irq during the call to sched_clock_cpu(). And for stable systems they are same. trace_clock_global() already disables interrupts, so it can call sched_clock_cpu() directly. Link: http://lkml.kernel.org/r/1356576585-28782-2-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 22b638b28e48..24bf48eabfcc 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -84,7 +84,7 @@ u64 notrace trace_clock_global(void) local_irq_save(flags); this_cpu = raw_smp_processor_id(); - now = cpu_clock(this_cpu); + now = sched_clock_cpu(this_cpu); /* * If in an NMI context then dont risk lockups and return the * cpu_clock() time: -- cgit v1.2.3 From 2fd196ec1eab2623096e7fc7e6f3976160392bce Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 26 Dec 2012 11:52:52 +0900 Subject: tracing: Replace static old_tracer check of tracer name Currently the trace buffer read functions use a static variable "old_tracer" for detecting if the current tracer changes. This was suitable for a single trace file ("trace"), but to add a snapshot feature that will use the same function for its file, a check against a static variable is not sufficient. To use the output functions for two different files, instead of storing the current tracer in a static variable, as the trace iterator descriptor contains a pointer to the original current tracer's name, that pointer can now be used to check if the current tracer has changed between different reads of the trace file. Link: http://lkml.kernel.org/r/20121226025252.3252.9276.stgit@liselsia Signed-off-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 90a1c71fdbfc..2c724662a3e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1948,18 +1948,20 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; - static struct tracer *old_tracer; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; - /* copy the tracer to avoid using a global lock all around */ + /* + * copy the tracer to avoid using a global lock all around. + * iter->trace is a copy of current_trace, the pointer to the + * name may be used instead of a strcmp(), as iter->trace->name + * will point to the same string as current_trace->name. + */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(current_trace && iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); atomic_inc(&trace_record_cmdline_disabled); @@ -3494,7 +3496,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - static struct tracer *old_tracer; ssize_t sret; /* return any leftover data */ @@ -3506,10 +3507,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(current_trace && iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); /* @@ -3665,7 +3664,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; - static struct tracer *old_tracer; ssize_t ret; size_t rem; unsigned int i; @@ -3675,10 +3673,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(current_trace && iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); mutex_lock(&iter->mutex); -- cgit v1.2.3 From debdd57f5145f3c6a4b3f8d0126abd1a2def7fc6 Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 26 Dec 2012 11:53:00 +0900 Subject: tracing: Make a snapshot feature available from userspace Ftrace has a snapshot feature available from kernel space and latency tracers (e.g. irqsoff) are using it. This patch enables user applictions to take a snapshot via debugfs. Add "snapshot" debugfs file in "tracing" directory. snapshot: This is used to take a snapshot and to read the output of the snapshot. # echo 1 > snapshot This will allocate the spare buffer for snapshot (if it is not allocated), and take a snapshot. # cat snapshot This will show contents of the snapshot. # echo 0 > snapshot This will free the snapshot if it is allocated. Any other positive values will clear the snapshot contents if the snapshot is allocated, or return EINVAL if it is not allocated. Link: http://lkml.kernel.org/r/20121226025300.3252.86850.stgit@liselsia Cc: Jiri Olsa Cc: David Sharp Signed-off-by: Hiraku Toyooka [ Fixed irqsoff selftest and also a conflict with a change that fixes the update_max_tr. ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 10 ++++ kernel/trace/trace.c | 166 +++++++++++++++++++++++++++++++++++++++++++-------- kernel/trace/trace.h | 1 + 3 files changed, 151 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cdc9d284d24e..36567564e221 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -253,6 +253,16 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. +config TRACER_SNAPSHOT + bool "Create a snapshot trace buffer" + select TRACER_MAX_TRACE + help + Allow tracing users to take snapshot of the current buffer using the + ftrace interface, e.g.: + + echo 1 > /sys/kernel/debug/tracing/snapshot + cat snapshot + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c724662a3e8..70dce64b9ecf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -710,12 +710,11 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(!irqs_disabled()); - /* If we disabled the tracer, stop now */ - if (current_trace == &nop_trace) - return; - - if (WARN_ON_ONCE(!current_trace->use_max_tr)) + if (!current_trace->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(current_trace != &nop_trace); return; + } arch_spin_lock(&ftrace_max_lock); @@ -743,10 +742,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) return; - } arch_spin_lock(&ftrace_max_lock); @@ -866,10 +863,13 @@ int register_tracer(struct tracer *type) current_trace = type; - /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, trace_buf_size, - RING_BUFFER_ALL_CPUS); + if (type->use_max_tr) { + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded) + ring_buffer_resize(max_tr.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); + type->allocated_snapshot = true; + } /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); @@ -885,10 +885,14 @@ int register_tracer(struct tracer *type) /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(tr); - /* Shrink the max buffer again */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, 1, - RING_BUFFER_ALL_CPUS); + if (type->use_max_tr) { + type->allocated_snapshot = false; + + /* Shrink the max buffer again */ + if (ring_buffer_expanded) + ring_buffer_resize(max_tr.buffer, 1, + RING_BUFFER_ALL_CPUS); + } printk(KERN_CONT "PASSED\n"); } @@ -1964,7 +1968,11 @@ static void *s_start(struct seq_file *m, loff_t *pos) *iter->trace = *current_trace; mutex_unlock(&trace_types_lock); - atomic_inc(&trace_record_cmdline_disabled); + if (iter->snapshot && iter->trace->use_max_tr) + return ERR_PTR(-EBUSY); + + if (!iter->snapshot) + atomic_inc(&trace_record_cmdline_disabled); if (*pos != iter->pos) { iter->ent = NULL; @@ -2003,7 +2011,11 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; - atomic_dec(&trace_record_cmdline_disabled); + if (iter->snapshot && iter->trace->use_max_tr) + return; + + if (!iter->snapshot) + atomic_dec(&trace_record_cmdline_disabled); trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } @@ -2438,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = { }; static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file) +__tracing_open(struct inode *inode, struct file *file, bool snapshot) { long cpu_file = (long) inode->i_private; struct trace_iterator *iter; @@ -2471,10 +2483,11 @@ __tracing_open(struct inode *inode, struct file *file) if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if (current_trace && current_trace->print_max) + if ((current_trace && current_trace->print_max) || snapshot) iter->tr = &max_tr; else iter->tr = &global_trace; + iter->snapshot = snapshot; iter->pos = -1; mutex_init(&iter->mutex); iter->cpu_file = cpu_file; @@ -2491,8 +2504,9 @@ __tracing_open(struct inode *inode, struct file *file) if (trace_clocks[trace_clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - /* stop the trace while dumping */ - tracing_stop(); + /* stop the trace while dumping if we are not opening "snapshot" */ + if (!iter->snapshot) + tracing_stop(); if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) { @@ -2555,8 +2569,9 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - /* reenable tracing if it was previously enabled */ - tracing_start(); + if (!iter->snapshot) + /* reenable tracing if it was previously enabled */ + tracing_start(); mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); @@ -2584,7 +2599,7 @@ static int tracing_open(struct inode *inode, struct file *file) } if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file); + iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (trace_flags & TRACE_ITER_LATENCY_FMT) @@ -3219,7 +3234,7 @@ static int tracing_set_tracer(const char *buf) if (current_trace && current_trace->reset) current_trace->reset(tr); - had_max_tr = current_trace && current_trace->use_max_tr; + had_max_tr = current_trace && current_trace->allocated_snapshot; current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { @@ -3238,6 +3253,8 @@ static int tracing_set_tracer(const char *buf) */ ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&max_tr, 1); + tracing_reset_online_cpus(&max_tr); + current_trace->allocated_snapshot = false; } destroy_trace_option_files(topts); @@ -3248,6 +3265,7 @@ static int tracing_set_tracer(const char *buf) RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; + t->allocated_snapshot = true; } if (t->init) { @@ -4066,6 +4084,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file) return single_open(file, tracing_clock_show, NULL); } +#ifdef CONFIG_TRACER_SNAPSHOT +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + int ret = 0; + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, true); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + } + return ret; +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + mutex_lock(&trace_types_lock); + + if (current_trace && current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } + + switch (val) { + case 0: + if (current_trace->allocated_snapshot) { + /* free spare buffer */ + ring_buffer_resize(max_tr.buffer, 1, + RING_BUFFER_ALL_CPUS); + set_buffer_entries(&max_tr, 1); + tracing_reset_online_cpus(&max_tr); + current_trace->allocated_snapshot = false; + } + break; + case 1: + if (!current_trace->allocated_snapshot) { + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&max_tr, + &global_trace, RING_BUFFER_ALL_CPUS); + if (ret < 0) + break; + current_trace->allocated_snapshot = true; + } + + local_irq_disable(); + /* Now, we're going to swap */ + update_max_tr(&global_trace, current, smp_processor_id()); + local_irq_enable(); + break; + default: + if (current_trace->allocated_snapshot) + tracing_reset_online_cpus(&max_tr); + else + ret = -EINVAL; + break; + } + + if (ret >= 0) { + *ppos += cnt; + ret = cnt; + } +out: + mutex_unlock(&trace_types_lock); + return ret; +} +#endif /* CONFIG_TRACER_SNAPSHOT */ + + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -4122,6 +4221,16 @@ static const struct file_operations trace_clock_fops = { .write = tracing_clock_write, }; +#ifdef CONFIG_TRACER_SNAPSHOT +static const struct file_operations snapshot_fops = { + .open = tracing_snapshot_open, + .read = seq_read, + .write = tracing_snapshot_write, + .llseek = tracing_seek, + .release = tracing_release, +}; +#endif /* CONFIG_TRACER_SNAPSHOT */ + struct ftrace_buffer_info { struct trace_array *tr; void *spare; @@ -4921,6 +5030,11 @@ static __init int tracer_init_debugfs(void) &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); +#endif + create_trace_options_dir(); for_each_tracing_cpu(cpu) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 04a2c7ab1735..57d7e5397d56 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -287,6 +287,7 @@ struct tracer { struct tracer_flags *flags; bool print_max; bool use_max_tr; + bool allocated_snapshot; }; -- cgit v1.2.3 From fc79e240be5aa379dd36a62158be5a5ee0e4aec7 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 30 Jan 2013 16:50:36 +0400 Subject: sched/rt: Do not account zero delta_exec in update_curr_rt() There are several places of consecutive calls of dequeue_task_rt() and put_prev_task_rt() in the scheduler. For example, function rt_mutex_setprio() does it. The both calls lead to update_curr_rt(), the second of it receives zeroed delta_exec. The only effective action in this case is call of sched_rt_avg_update(), which can change rq->age_stamp and rq->rt_avg. But it is possible in case of ""floating"" rq->clock. This fact is not reasonable to be accounted. Another actions do nothing. Signed-off-by: Kirill V Tkhai Acked-by: Steven Rostedt Cc: Peter Zijlstra CC: linux-rt-users Link: http://lkml.kernel.org/r/931541359550236@web1g.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2f69ca997826..94abca4d9cf5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -925,8 +925,8 @@ static void update_curr_rt(struct rq *rq) return; delta_exec = rq->clock_task - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; + if (unlikely((s64)delta_exec <= 0)) + return; schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); -- cgit v1.2.3 From 12572dbb53638c6e454ef831c8fee7de3df24389 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 14 Jan 2013 17:05:21 +0000 Subject: clockevents: Add generic timer broadcast receiver Currently the broadcast mechanism used for timers is abstracted by a function pointer on struct clock_event_device. As the fundamental mechanism for broadcast is architecture-specific, this ties each clock_event_device driver to a single architecture, even where the driver is otherwise generic. This patch adds a standard path for the receipt of timer broadcasts, so drivers and/or architecture backends need not manage redundant lists of timers for the purpose of routing broadcast timer ticks. [tglx: Made the implementation depend on the config switch as well ] Signed-off-by: Mark Rutland Reviewed-by: Santosh Shilimkar Cc: linux-arm-kernel@lists.infradead.org Cc: nico@linaro.org Cc: Will.Deacon@arm.com Cc: Marc.Zyngier@arm.com Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1358183124-28461-2-git-send-email-mark.rutland@arm.com Tested-by: Santosh Shilimkar Reviewed-by: Stephen Boyd Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f113755695e2..7cc81c57eb31 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -125,6 +125,23 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) return ret; } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +int tick_receive_broadcast(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + struct clock_event_device *evt = td->evtdev; + + if (!evt) + return -ENODEV; + + if (!evt->event_handler) + return -EINVAL; + + evt->event_handler(evt); + return 0; +} +#endif + /* * Broadcast the event to the cpus, which are set in the mask (mangled). */ -- cgit v1.2.3 From 12ad10004645d38356b14d1fbba379c523a61916 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 14 Jan 2013 17:05:22 +0000 Subject: clockevents: Add generic timer broadcast function Currently, the timer broadcast mechanism is defined by a function pointer on struct clock_event_device. As the fundamental mechanism for broadcast is architecture-specific, this means that clock_event_device drivers cannot be shared across multiple architectures. This patch adds an (optional) architecture-specific function for timer tick broadcast, allowing drivers which may require broadcast functionality to be shared across multiple architectures. Signed-off-by: Mark Rutland Reviewed-by: Santosh Shilimkar Cc: linux-arm-kernel@lists.infradead.org Cc: nico@linaro.org Cc: Will.Deacon@arm.com Cc: Marc.Zyngier@arm.com Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1358183124-28461-3-git-send-email-mark.rutland@arm.com Tested-by: Santosh Shilimkar Reviewed-by: Stephen Boyd Signed-off-by: Thomas Gleixner --- kernel/time/Kconfig | 4 ++++ kernel/time/tick-broadcast.c | 13 +++++++++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8601f0db1261..b69692250af4 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -38,6 +38,10 @@ config GENERIC_CLOCKEVENTS_BUILD default y depends on GENERIC_CLOCKEVENTS +# Architecture can handle broadcast in a driver-agnostic way +config ARCH_HAS_TICK_BROADCAST + bool + # Clockevents broadcasting infrastructure config GENERIC_CLOCKEVENTS_BROADCAST bool diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7cc81c57eb31..f726537d24eb 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -86,6 +87,11 @@ int tick_is_broadcast_device(struct clock_event_device *dev) return (dev && tick_broadcast_device.evtdev == dev); } +static void err_broadcast(const struct cpumask *mask) +{ + pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); +} + /* * Check, if the device is disfunctional and a place holder, which * needs to be handled by the broadcast device. @@ -105,6 +111,13 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; + if (!dev->broadcast) + dev->broadcast = tick_broadcast; + if (!dev->broadcast) { + pr_warn_once("%s depends on broadcast, but no broadcast function available\n", + dev->name); + dev->broadcast = err_broadcast; + } cpumask_set_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; -- cgit v1.2.3 From d840f718d28715a9833c1a8f46c2493ff3fd219b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 1 Feb 2013 18:38:47 -0500 Subject: tracing: Init current_trace to nop_trace and remove NULL checks On early boot up, when the ftrace ring buffer is initialized, the static variable current_trace is initialized to &nop_trace. Before this initialization, current_trace is NULL and will never become NULL again. It is always reassigned to a ftrace tracer. Several places check if current_trace is NULL before it uses it, and this check is frivolous, because at the point in time when the checks are made the only way current_trace could be NULL is if ftrace failed its allocations at boot up, and the paths to these locations would probably not be possible. By initializing current_trace to &nop_trace where it is declared, current_trace will never be NULL, and we can remove all these checks of current_trace being NULL which never needed to be checked in the first place. Cc: Dan Carpenter Cc: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 70dce64b9ecf..5d520b7bb4c5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -249,7 +249,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; static struct tracer *trace_types __read_mostly; /* current_trace points to the tracer that is currently active */ -static struct tracer *current_trace __read_mostly; +static struct tracer *current_trace __read_mostly = &nop_trace; /* * trace_types_lock is used to protect the trace_types list. @@ -2100,8 +2100,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) unsigned long total; const char *name = "preemption"; - if (type) - name = type->name; + name = type->name; get_total_entries(tr, &total, &entries); @@ -2477,13 +2476,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter->trace) goto fail; - if (current_trace) - *iter->trace = *current_trace; + *iter->trace = *current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if ((current_trace && current_trace->print_max) || snapshot) + if (current_trace->print_max || snapshot) iter->tr = &max_tr; else iter->tr = &global_trace; @@ -3037,10 +3035,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int r; mutex_lock(&trace_types_lock); - if (current_trace) - r = sprintf(buf, "%s\n", current_trace->name); - else - r = sprintf(buf, "\n"); + r = sprintf(buf, "%s\n", current_trace->name); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3231,10 +3226,10 @@ static int tracing_set_tracer(const char *buf) goto out; trace_branch_disable(); - if (current_trace && current_trace->reset) + if (current_trace->reset) current_trace->reset(tr); - had_max_tr = current_trace && current_trace->allocated_snapshot; + had_max_tr = current_trace->allocated_snapshot; current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { @@ -3373,8 +3368,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) ret = -ENOMEM; goto fail; } - if (current_trace) - *iter->trace = *current_trace; + *iter->trace = *current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -3525,7 +3519,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(current_trace && iter->trace->name != current_trace->name)) + if (unlikely(iter->trace->name != current_trace->name)) *iter->trace = *current_trace; mutex_unlock(&trace_types_lock); @@ -3691,7 +3685,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(current_trace && iter->trace->name != current_trace->name)) + if (unlikely(iter->trace->name != current_trace->name)) *iter->trace = *current_trace; mutex_unlock(&trace_types_lock); @@ -4115,7 +4109,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&trace_types_lock); - if (current_trace && current_trace->use_max_tr) { + if (current_trace->use_max_tr) { ret = -EBUSY; goto out; } @@ -5299,7 +5293,7 @@ __init static int tracer_alloc_buffers(void) init_irq_work(&trace_work_wakeup, trace_wake_up); register_tracer(&nop_trace); - current_trace = &nop_trace; + /* All seems OK, enable tracing */ tracing_disabled = 0; -- cgit v1.2.3 From 60334caf37dc7c59120b21faa625534a6fffead0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 31 Jan 2013 18:56:17 +0400 Subject: sched/rt: Further simplify pick_rt_task() Function next_prio() has been removed and pull_rt_task() is the only user of pick_next_highest_task_rt() at the moment. pull_rt_task is not interested in p->nr_cpus_allowed, its only interest is the fact that cpu is allowed to execute p. If nr_cpus_allowed == 1, cpu != task_cpu(p) and cpu is allowed then it means that task p is in the middle of the migration techniques; the task waits until it is moved by migration thread. So, lets pull it earlier. Signed-off-by: Kirill V Tkhai Acked-by: Steven Rostedt Cc: Peter Zijlstra CC: linux-rt-users Link: http://lkml.kernel.org/r/70871359644177@web16d.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 94abca4d9cf5..c25de141c576 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1427,8 +1427,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->nr_cpus_allowed > 1)) + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return 1; return 0; } -- cgit v1.2.3 From 377840744bea59aacd524f496dc577463f94584b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Dec 2012 17:28:40 -0500 Subject: switch compat_sys_[gs]etitimer(2) to COMPAT_SYSCALL_DEFINE again, strictly speaking we are in nasal daemon territory on ppc and mips - we need to sign-extend int arguments. Signed-off-by: Al Viro --- kernel/compat.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 36700e9e2be9..adf7646343da 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o, __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); } -asmlinkage long compat_sys_getitimer(int which, - struct compat_itimerval __user *it) +COMPAT_SYSCALL_DEFINE2(getitimer, int, which, + struct compat_itimerval __user *, it) { struct itimerval kit; int error; @@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which, return error; } -asmlinkage long compat_sys_setitimer(int which, - struct compat_itimerval __user *in, - struct compat_itimerval __user *out) +COMPAT_SYSCALL_DEFINE3(setitimer, int, which, + struct compat_itimerval __user *, in, + struct compat_itimerval __user *, out) { struct itimerval kin, kout; int error; -- cgit v1.2.3 From eaca6eae3e0c41d41fcb9d1d70e00934988dff2e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Nov 2012 23:12:10 -0500 Subject: sanitize rt_sigaction() situation a bit Switch from __ARCH_WANT_SYS_RT_SIGACTION to opposite (!CONFIG_ODD_RT_SIGACTION); the only two architectures that need it are alpha and sparc. The reason for use of CONFIG_... instead of __ARCH_... is that it's needed only kernel-side and doing it that way avoids a mess with include order on many architectures. Signed-off-by: Al Viro --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 53cd5c4d1172..00cd1ce998be 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3231,7 +3231,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ -#ifdef __ARCH_WANT_SYS_RT_SIGACTION +#ifndef CONFIG_ODD_RT_SIGACTION /** * sys_rt_sigaction - alter an action taken by a process * @sig: signal to be sent @@ -3265,7 +3265,7 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig, out: return ret; } -#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ +#endif /* !CONFIG_ODD_RT_SIGACTION */ #ifdef __ARCH_WANT_SYS_SGETMASK -- cgit v1.2.3 From ad4b65a434bdd2ff37d095ab1ccd836203e985ba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Dec 2012 21:43:56 -0500 Subject: consolidate rt_sigsuspend() * pull compat version alongside with the native one * make little-endian compat variant just call the native * don't bother with separate conditional for compat (both native and compat are going to become unconditional very soon). Signed-off-by: Al Viro --- kernel/compat.c | 17 ----------------- kernel/signal.c | 24 +++++++++++++++++++++++- 2 files changed, 23 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index adf7646343da..0c07d435254f 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1067,23 +1067,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ -#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND -asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize) -{ - sigset_t newset; - compat_sigset_t newset32; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) - return -EFAULT; - sigset_from_compat(&newset, &newset32); - return sigsuspend(&newset); -} -#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ - asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) { struct timex txc; diff --git a/kernel/signal.c b/kernel/signal.c index 00cd1ce998be..4a0934e03d5c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3352,7 +3352,29 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) return -EFAULT; return sigsuspend(&newset); } -#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) +{ +#ifdef __BIG_ENDIAN + sigset_t newset; + compat_sigset_t newset32; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&newset, &newset32); + return sigsuspend(&newset); +#else + /* on little-endian bitmaps don't care about granularity */ + return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); +#endif +} +#endif +#endif __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) { -- cgit v1.2.3 From 322a56cb1fcbe228eee5cdb8a9c6df9f797d998c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 25 Dec 2012 13:32:58 -0500 Subject: generic compat_sys_rt_sigprocmask() conditional on GENERIC_COMPAT_RT_SIGPROCMASK; by the end of that series it will become the same thing as COMPAT and conditional will die out. Signed-off-by: Al Viro --- kernel/compat.c | 13 ++++++++++++- kernel/signal.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 0c07d435254f..e2a9c4785988 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -971,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, } void -sigset_from_compat (sigset_t *set, compat_sigset_t *compat) +sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) { switch (_NSIG_WORDS) { case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); @@ -982,6 +982,17 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat) } EXPORT_SYMBOL_GPL(sigset_from_compat); +void +sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) +{ + switch (_NSIG_WORDS) { + case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; + case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; + case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; + case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; + } +} + asmlinkage long compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, struct compat_siginfo __user *uinfo, diff --git a/kernel/signal.c b/kernel/signal.c index 4a0934e03d5c..bce1222b7315 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2613,6 +2613,47 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, return 0; } +#ifdef CONFIG_COMPAT +#ifdef CONFIG_GENERIC_COMPAT_RT_SIGPROCMASK +COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, + compat_sigset_t __user *, oset, compat_size_t, sigsetsize) +{ +#ifdef __BIG_ENDIAN + sigset_t old_set = current->blocked; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (nset) { + compat_sigset_t new32; + sigset_t new_set; + int error; + if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) + return -EFAULT; + + sigset_from_compat(&new_set, &new32); + sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + error = sigprocmask(how, &new_set, NULL); + if (error) + return error; + } + if (oset) { + compat_sigset_t old32; + sigset_to_compat(&old32, &old_set); + if (copy_to_user(oset, &old_set, sizeof(sigset_t))) + return -EFAULT; + } + return 0; +#else + return sys_rt_sigprocmask(how, (sigset_t __user *)nset, + (sigset_t __user *)oset, sigsetsize); +#endif +} +#endif +#endif + long do_sigpending(void __user *set, unsigned long sigsetsize) { long error = -EINVAL; -- cgit v1.2.3 From fe9c1db2cfc363cd30ecfe6480481b280abf8c0a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 25 Dec 2012 14:31:38 -0500 Subject: generic compat_sys_rt_sigpending() conditional on GENERIC_COMPAT_RT_SIGPENDING; by the end of that series it will become the same thing as COMPAT and conditional will die out. Signed-off-by: Al Viro --- kernel/signal.c | 52 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index bce1222b7315..3040c349b0e1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2654,28 +2654,19 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, #endif #endif -long do_sigpending(void __user *set, unsigned long sigsetsize) +static int do_sigpending(void *set, unsigned long sigsetsize) { - long error = -EINVAL; - sigset_t pending; - if (sigsetsize > sizeof(sigset_t)) - goto out; + return -EINVAL; spin_lock_irq(¤t->sighand->siglock); - sigorsets(&pending, ¤t->pending.signal, + sigorsets(set, ¤t->pending.signal, ¤t->signal->shared_pending.signal); spin_unlock_irq(¤t->sighand->siglock); /* Outside the lock because only this thread touches it. */ - sigandsets(&pending, ¤t->blocked, &pending); - - error = -EFAULT; - if (!copy_to_user(set, &pending, sigsetsize)) - error = 0; - -out: - return error; + sigandsets(set, ¤t->blocked, set); + return 0; } /** @@ -2684,10 +2675,37 @@ out: * @set: stores pending signals * @sigsetsize: size of sigset_t type or larger */ -SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) +SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) +{ + sigset_t set; + int err = do_sigpending(&set, sigsetsize); + if (!err && copy_to_user(uset, &set, sigsetsize)) + err = -EFAULT; + return err; +} + +#ifdef CONFIG_COMPAT +#ifdef CONFIG_GENERIC_COMPAT_RT_SIGPENDING +COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, + compat_size_t, sigsetsize) { - return do_sigpending(set, sigsetsize); +#ifdef __BIG_ENDIAN + sigset_t set; + int err = do_sigpending(&set, sigsetsize); + if (!err) { + compat_sigset_t set32; + sigset_to_compat(&set32, &set); + /* we can get here only if sigsetsize <= sizeof(set) */ + if (copy_to_user(uset, &set32, sigsetsize)) + err = -EFAULT; + } + return err; +#else + return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); +#endif } +#endif +#endif #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER @@ -3216,7 +3234,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { - return do_sigpending(set, sizeof(*set)); + return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); } #endif -- cgit v1.2.3 From 75907d4d7bc5d79b82d1453d9689efc588de1b43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 25 Dec 2012 15:19:12 -0500 Subject: generic compat_sys_rt_sigqueueinfo() conditional on GENERIC_COMPAT_RT_SIGQUEUEINFO; by the end of that series it will become the same thing as COMPAT and conditional will die out. Signed-off-by: Al Viro --- kernel/signal.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 3040c349b0e1..6cd3023cc66b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2983,6 +2983,22 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) return do_tkill(0, pid, sig); } +static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) +{ + /* Not even root can pretend to send signals from the kernel. + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info->si_code >= 0 || info->si_code == SI_TKILL) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); + return -EPERM; + } + info->si_signo = sig; + + /* POSIX.1b doesn't mention process groups. */ + return kill_proc_info(sig, info, pid); +} + /** * sys_rt_sigqueueinfo - send signal information to a signal * @pid: the PID of the thread @@ -2993,23 +3009,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { siginfo_t info; - if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) return -EFAULT; + return do_rt_sigqueueinfo(pid, sig, &info); +} - /* Not even root can pretend to send signals from the kernel. - * Nor can they impersonate a kill()/tgkill(), which adds source info. - */ - if (info.si_code >= 0 || info.si_code == SI_TKILL) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info.si_code < 0); - return -EPERM; - } - info.si_signo = sig; - - /* POSIX.1b doesn't mention process groups. */ - return kill_proc_info(sig, &info, pid); +#ifdef CONFIG_COMPAT +#ifdef CONFIG_GENERIC_COMPAT_RT_SIGQUEUEINFO +COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, + compat_pid_t, pid, + int, sig, + struct compat_siginfo __user *, uinfo) +{ + siginfo_t info; + int ret = copy_siginfo_from_user32(&info, uinfo); + if (unlikely(ret)) + return ret; + return do_rt_sigqueueinfo(pid, sig, &info); } +#endif +#endif long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) { -- cgit v1.2.3 From 0a0e8cdf734ce723bfc4ca6032ffbc03ce17c642 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 25 Dec 2012 16:04:12 -0500 Subject: old sigsuspend variants in kernel/signal.c conditional on OLD_SIGSUSPEND/OLD_SIGSUSPEND3, depending on which variety of that fossil is needed. Signed-off-by: Al Viro --- kernel/signal.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6cd3023cc66b..93fd4b83d866 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3454,6 +3454,23 @@ COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_ #endif #endif +#ifdef CONFIG_OLD_SIGSUSPEND +SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) +{ + sigset_t blocked; + siginitset(&blocked, mask); + return sigsuspend(&blocked); +} +#endif +#ifdef CONFIG_OLD_SIGSUSPEND3 +SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) +{ + sigset_t blocked; + siginitset(&blocked, mask); + return sigsuspend(&blocked); +} +#endif + __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; -- cgit v1.2.3 From 28d27f2d25d24ac5dd290b789f3fe9b76590fd95 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Nov 2012 19:32:25 -0500 Subject: switch compat_sys_rt_sigtimedwait to COMPAT_SYSCALL_DEFINE Signed-off-by: Al Viro --- kernel/compat.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index e2a9c4785988..460e98f8fbf0 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -993,10 +993,9 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) } } -asmlinkage long -compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, - struct compat_siginfo __user *uinfo, - struct compat_timespec __user *uts, compat_size_t sigsetsize) +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, + struct compat_siginfo __user *, uinfo, + struct compat_timespec __user *, uts, compat_size_t, sigsetsize) { compat_sigset_t s32; sigset_t s; @@ -1024,7 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, } return ret; - } asmlinkage long -- cgit v1.2.3 From 5cf22100229b855bc4559dccfd8d7cb7266f99f5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Nov 2012 19:41:01 -0500 Subject: switch compat_sys_sigprocmask to COMPAT_SYSCALL_DEFINE In principle, C ABI violation on ppc and mips... Signed-off-by: Al Viro --- kernel/compat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 460e98f8fbf0..a53b04a2b5eb 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) memcpy(blocked->sig, &set, sizeof(set)); } -asmlinkage long compat_sys_sigprocmask(int how, - compat_old_sigset_t __user *nset, - compat_old_sigset_t __user *oset) +COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, + compat_old_sigset_t __user *, nset, + compat_old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; sigset_t new_blocked; -- cgit v1.2.3 From 9aae8fc05d2d130797be436eb7cae29c32710193 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Dec 2012 23:12:04 -0500 Subject: switch rt_tgsigqueueinfo to COMPAT_SYSCALL_DEFINE C ABI violations on sparc, ppc and mips Signed-off-by: Al Viro --- kernel/compat.c | 11 ----------- kernel/signal.c | 17 ++++++++++++++++- 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index a53b04a2b5eb..cf75a288f0c0 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1025,17 +1025,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, return ret; } -asmlinkage long -compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, - struct compat_siginfo __user *uinfo) -{ - siginfo_t info; - - if (copy_siginfo_from_user32(&info, uinfo)) - return -EFAULT; - return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); -} - #ifdef __ARCH_WANT_COMPAT_SYS_TIME /* compat_time_t is a 32 bit "long" and needs to get converted. */ diff --git a/kernel/signal.c b/kernel/signal.c index 93fd4b83d866..5a4aed1244fb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3030,7 +3030,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, #endif #endif -long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) @@ -3060,6 +3060,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, + compat_pid_t, tgid, + compat_pid_t, pid, + int, sig, + struct compat_siginfo __user *, uinfo) +{ + siginfo_t info; + + if (copy_siginfo_from_user32(&info, uinfo)) + return -EFAULT; + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} +#endif + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *t = current; -- cgit v1.2.3 From 6883da8c6c15e85e7750c94be49ea156ed341c05 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 25 Dec 2012 16:59:28 -0500 Subject: switch compat_sys_sched_rr_get_interval to COMPAT_SYSCALL_DEFINE ... and make it unconditional - we want the sucker on all biarch platforms, really. All kinds of wrappers and private implementations can go now; fortunately, they don't cause name conflicts, so we can do that one first without any bisect hazards. Signed-off-by: Al Viro --- kernel/compat.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index cf75a288f0c0..de6f3244d325 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1203,9 +1203,9 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } -#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL -asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, - struct compat_timespec __user *interval) +COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, + compat_pid_t, pid, + struct compat_timespec __user *, interval) { struct timespec t; int ret; @@ -1218,7 +1218,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, return -EFAULT; return ret; } -#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ /* * Allocate user-space memory for the duration of a single system call, -- cgit v1.2.3 From 08d32fe504a7670cab3190c624448695adcf70e4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 25 Dec 2012 18:38:15 -0500 Subject: generic sys_compat_rt_sigaction() Again, protected by a temporary config symbol (GENERIC_COMPAT_RT_SIGACTION); will be gone by the end of series. Signed-off-by: Al Viro --- kernel/signal.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5a4aed1244fb..f3665f3de660 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3358,6 +3358,55 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig, out: return ret; } +#ifdef CONFIG_COMPAT +#ifdef CONFIG_GENERIC_COMPAT_RT_SIGACTION +COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, + const struct compat_sigaction __user *, act, + struct compat_sigaction __user *, oact, + compat_size_t, sigsetsize) +{ + struct k_sigaction new_ka, old_ka; + compat_sigset_t mask; +#ifdef __ARCH_HAS_SA_RESTORER + compat_uptr_t restorer; +#endif + int ret; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + + if (act) { + compat_uptr_t handler; + ret = get_user(handler, &act->sa_handler); + new_ka.sa.sa_handler = compat_ptr(handler); +#ifdef __ARCH_HAS_SA_RESTORER + ret |= get_user(restorer, &act->sa_restorer); + new_ka.sa.sa_restorer = compat_ptr(restorer); +#endif + ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); + ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); + if (ret) + return -EFAULT; + sigset_from_compat(&new_ka.sa.sa_mask, &mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + if (!ret && oact) { + sigset_to_compat(&mask, &old_ka.sa.sa_mask); + ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), + &oact->sa_handler); + ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); + ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); +#ifdef __ARCH_HAS_SA_RESTORER + ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), + &oact->sa_restorer); +#endif + } + return ret; +} +#endif +#endif #endif /* !CONFIG_ODD_RT_SIGACTION */ #ifdef __ARCH_WANT_SYS_SGETMASK -- cgit v1.2.3 From 495dfbf767553980dbd40a19a96a8ca5fa1be616 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 25 Dec 2012 19:09:45 -0500 Subject: generic sys_sigaction() and compat_sys_sigaction() conditional on OLD_SIGACTION/COMPAT_OLD_SIGACTION Signed-off-by: Al Viro --- kernel/signal.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f3665f3de660..79998f5b0f11 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3409,6 +3409,84 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, #endif #endif /* !CONFIG_ODD_RT_SIGACTION */ +#ifdef CONFIG_OLD_SIGACTION +SYSCALL_DEFINE3(sigaction, int, sig, + const struct old_sigaction __user *, act, + struct old_sigaction __user *, oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + old_sigset_t mask; + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(new_ka.sa.sa_handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || + __get_user(new_ka.sa.sa_flags, &act->sa_flags) || + __get_user(mask, &act->sa_mask)) + return -EFAULT; +#ifdef __ARCH_HAS_KA_RESTORER + new_ka.ka_restorer = NULL; +#endif + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || + __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) + return -EFAULT; + } + + return ret; +} +#endif +#ifdef CONFIG_COMPAT_OLD_SIGACTION +COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, + const struct compat_old_sigaction __user *, act, + struct compat_old_sigaction __user *, oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + compat_old_sigset_t mask; + compat_uptr_t handler, restorer; + + if (act) { + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(handler, &act->sa_handler) || + __get_user(restorer, &act->sa_restorer) || + __get_user(new_ka.sa.sa_flags, &act->sa_flags) || + __get_user(mask, &act->sa_mask)) + return -EFAULT; + +#ifdef __ARCH_HAS_KA_RESTORER + new_ka.ka_restorer = NULL; +#endif + new_ka.sa.sa_handler = compat_ptr(handler); + new_ka.sa.sa_restorer = compat_ptr(restorer); + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(ptr_to_compat(old_ka.sa.sa_handler), + &oact->sa_handler) || + __put_user(ptr_to_compat(old_ka.sa.sa_restorer), + &oact->sa_restorer) || + __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) + return -EFAULT; + } + return ret; +} +#endif + #ifdef __ARCH_WANT_SYS_SGETMASK /* -- cgit v1.2.3 From 90caf58dad77a4021e9dade5d051756cea2a2cd7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 25 Dec 2012 23:16:10 -0500 Subject: convert futex compat syscalls to COMPAT_SYSCALL_DEFINE ppc is stepping into a nasal daemon territory a bit... Signed-off-by: Al Viro --- kernel/futex_compat.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83e368b005fc..0f016982f975 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr) } } -asmlinkage long -compat_sys_set_robust_list(struct compat_robust_list_head __user *head, - compat_size_t len) +COMPAT_SYSCALL_DEFINE2(set_robust_list, + struct compat_robust_list_head __user *, head, + compat_size_t, len) { if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head, return 0; } -asmlinkage long -compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, - compat_size_t __user *len_ptr) +COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + compat_uptr_t __user *, head_ptr, + compat_size_t __user *, len_ptr) { struct compat_robust_list_head __user *head; unsigned long ret; @@ -172,9 +173,9 @@ err_unlock: return ret; } -asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, - struct compat_timespec __user *utime, u32 __user *uaddr2, - u32 val3) +COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct compat_timespec __user *, utime, u32 __user *, uaddr2, + u32, val3) { struct timespec ts; ktime_t t, *tp = NULL; -- cgit v1.2.3 From 2ce5da17570771330f44ac993b77749debf7954b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Nov 2012 15:11:25 -0500 Subject: new helper: signal_setup_done() usual "call force_sigsegv or signal_delivered" logics. Takes ksignal instead of separate siginfo/k_sigaction. Signed-off-by: Al Viro --- kernel/signal.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 79998f5b0f11..775f5552fa0e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2396,6 +2396,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, tracehook_signal_handler(sig, info, ka, regs, stepping); } +void signal_setup_done(int failed, struct ksignal *ksig, int stepping) +{ + if (failed) + force_sigsegv(ksig->sig, current); + else + signal_delivered(ksig->sig, &ksig->info, &ksig->ka, + signal_pt_regs(), stepping); +} + /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take -- cgit v1.2.3 From bde208d2e10b8e8cf01cadadf203f5abcf1e4fe2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Nov 2012 01:36:15 -0500 Subject: switch mips to generic rt_sigsuspend(), make it unconditional mips was the last architecture not using the generic variant. Both native and compat variants switched to generic, which is made unconditional now. Signed-off-by: Al Viro --- kernel/signal.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 775f5552fa0e..87c09e3061d2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3562,7 +3562,6 @@ int sigsuspend(sigset_t *set) return -ERESTARTNOHAND; } -#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND /** * sys_rt_sigsuspend - replace the signal mask for a value with the * @unewset value until a signal is received @@ -3603,7 +3602,6 @@ COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_ #endif } #endif -#endif #ifdef CONFIG_OLD_SIGSUSPEND SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) -- cgit v1.2.3 From c02cf5f8ed6137e2b3b2f10e0fca336e06e09ba4 Mon Sep 17 00:00:00 2001 From: anish kumar Date: Sun, 3 Feb 2013 22:08:23 +0100 Subject: irq_work: Remove return value from the irq_work_queue() function As no one is using the return value of irq_work_queue(), so it is better to just make it void. Signed-off-by: anish kumar Acked-by: Steven Rostedt [ Fix stale comments, remove now unnecessary __irq_work_queue() intermediate function ] Signed-off-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/1359925703-24304-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 64eddd59ed83..c9d7478e4889 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -63,12 +63,20 @@ void __weak arch_irq_work_raise(void) } /* - * Queue the entry and raise the IPI if needed. + * Enqueue the irq_work @entry unless it's already pending + * somewhere. + * + * Can be re-enqueued while the callback is still in progress. */ -static void __irq_work_queue(struct irq_work *work) +void irq_work_queue(struct irq_work *work) { bool empty; + /* Only queue if not already pending */ + if (!irq_work_claim(work)) + return; + + /* Queue the entry and raise the IPI if needed. */ preempt_disable(); empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); @@ -78,25 +86,6 @@ static void __irq_work_queue(struct irq_work *work) preempt_enable(); } - -/* - * Enqueue the irq_work @entry, returns true on success, failure when the - * @entry was already enqueued by someone else. - * - * Can be re-enqueued while the callback is still in progress. - */ -bool irq_work_queue(struct irq_work *work) -{ - if (!irq_work_claim(work)) { - /* - * Already enqueued, can't do! - */ - return false; - } - - __irq_work_queue(work); - return true; -} EXPORT_SYMBOL_GPL(irq_work_queue); /* -- cgit v1.2.3 From e0a79f529d5ba2507486d498b25da40911d95cf6 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 28 Jan 2013 12:19:25 +0100 Subject: sched: Fix select_idle_sibling() bouncing cow syndrome If the previous CPU is cache affine and idle, select it. The current implementation simply traverses the sd_llc domain, taking the first idle CPU encountered, which walks buddy pairs hand in hand over the package, inflicting excruciating pain. 1 tbench pair (worst case) in a 10 core + SMT package: pre 15.22 MB/sec 1 procs post 252.01 MB/sec 1 procs Signed-off-by: Mike Galbraith Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1359371965.5783.127.camel@marge.simpson.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8dbee9f4ceb2..ed18c74db017 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3252,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ static int select_idle_sibling(struct task_struct *p, int target) { - int cpu = smp_processor_id(); - int prev_cpu = task_cpu(p); struct sched_domain *sd; struct sched_group *sg; - int i; + int i = task_cpu(p); - /* - * If the task is going to be woken-up on this cpu and if it is - * already idle, then it is the right target. - */ - if (target == cpu && idle_cpu(cpu)) - return cpu; + if (idle_cpu(target)) + return target; /* - * If the task is going to be woken-up on the cpu where it previously - * ran and if it is currently idle, then it the right target. + * If the prevous cpu is cache affine and idle, don't be stupid. */ - if (target == prev_cpu && idle_cpu(prev_cpu)) - return prev_cpu; + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; /* * Otherwise, iterate the domains and find an elegible idle cpu. @@ -3284,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target) goto next; for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) + if (i == target || !idle_cpu(i)) goto next; } -- cgit v1.2.3 From e4aa0da39b6a69f3442ebc33500c21176c5eb560 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 4 Feb 2013 13:36:13 -0500 Subject: rcu: Allow rcutorture to be built at low optimization levels The uses of trace_clock_local() are dead code when CONFIG_RCU_TRACE=n, but some compilers might nevertheless generate code calling this function. This commit therefore ensures that trace_clock_local() is invoked only when CONFIG_RCU_TRACE=y. Reported-by: Tetsuo Handa Signed-off-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/rcutorture.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index cd4c35d097a4..e1f3a8c96724 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -208,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); #define rcu_can_boost() 0 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#ifdef CONFIG_RCU_TRACE +static u64 notrace rcu_trace_clock_local(void) +{ + u64 ts = trace_clock_local(); + unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + return ts; +} +#else /* #ifdef CONFIG_RCU_TRACE */ +static u64 notrace rcu_trace_clock_local(void) +{ + return 0ULL; +} +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ @@ -1051,7 +1065,7 @@ static void rcu_torture_timer(unsigned long unused) idx = cur_ops->readlock(); completed = cur_ops->completed(); - ts = trace_clock_local(); + ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || @@ -1075,8 +1089,6 @@ static void rcu_torture_timer(unsigned long unused) } completed_end = cur_ops->completed(); if (pipe_count > 1) { - unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, completed, completed_end); rcutorture_trace_dump(); @@ -1122,7 +1134,7 @@ rcu_torture_reader(void *arg) } idx = cur_ops->readlock(); completed = cur_ops->completed(); - ts = trace_clock_local(); + ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || @@ -1144,9 +1156,6 @@ rcu_torture_reader(void *arg) } completed_end = cur_ops->completed(); if (pipe_count > 1) { - unsigned long __maybe_unused ts_rem = - do_div(ts, NSEC_PER_USEC); - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, completed, completed_end); rcutorture_trace_dump(); -- cgit v1.2.3 From 16559ae48c76f1ceb970b9719dea62b77eb5d06b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 4 Feb 2013 15:35:26 -0800 Subject: kgdb: remove #include from kgdb.h There's no reason kgdb.h itself needs to include the 8250 serial port header file. So push it down to the _very_ limited number of individual drivers that need the values in that file, and fix up the places where people really wanted serial_core.h and platform_device.h. Signed-off-by: Greg Kroah-Hartman --- kernel/debug/debug_core.c | 1 + kernel/debug/gdbstub.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9a61738cefc8..c26278fd4851 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -29,6 +29,7 @@ */ #include #include +#include #include #include #include diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index ce615e064482..38573f35a5ad 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From b22affe0aef429d657bc6505aacb1c569340ddd2 Mon Sep 17 00:00:00 2001 From: Leonid Shatz Date: Mon, 4 Feb 2013 14:33:37 +0200 Subject: hrtimer: Prevent hrtimer_enqueue_reprogram race hrtimer_enqueue_reprogram contains a race which could result in timer.base switch during unlock/lock sequence. hrtimer_enqueue_reprogram is releasing the lock protecting the timer base for calling raise_softirq_irqsoff() due to a lock ordering issue versus rq->lock. If during that time another CPU calls __hrtimer_start_range_ns() on the same hrtimer, the timer base might switch, before the current CPU can lock base->lock again and therefor the unlock_timer_base() call will unlock the wrong lock. [ tglx: Added comment and massaged changelog ] Signed-off-by: Leonid Shatz Signed-off-by: Izik Eidus Cc: Andrea Arcangeli Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1359981217-389-1-git-send-email-izik.eidus@ravellosystems.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6db7a5ed52b5..cdd5607c0ceb 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -640,21 +640,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) * and expiry check is done in the hrtimer_interrupt or in the softirq. */ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) + struct hrtimer_clock_base *base) { - if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - if (wakeup) { - raw_spin_unlock(&base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - raw_spin_lock(&base->cpu_base->lock); - } else - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); - - return 1; - } - - return 0; + return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) @@ -735,8 +723,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) + struct hrtimer_clock_base *base) { return 0; } @@ -995,8 +982,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * * XXX send_remote_softirq() ? */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) - hrtimer_enqueue_reprogram(timer, new_base, wakeup); + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) + && hrtimer_enqueue_reprogram(timer, new_base)) { + if (wakeup) { + /* + * We need to drop cpu_base->lock to avoid a + * lock ordering issue vs. rq->lock. + */ + raw_spin_unlock(&new_base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + local_irq_restore(flags); + return ret; + } else { + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + } unlock_hrtimer_base(timer, &flags); -- cgit v1.2.3 From c3c186403c6abd32e719f005f0af950155a9e54d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 5 Feb 2013 14:37:51 +0300 Subject: sched: Fix signedness bug in yield_to() In 7b270f6099 "sched: Bail out of yield_to when source and target runqueue has one task" we changed this to store -ESRCH so it needs to be signed. Signed-off-by: Dan Carpenter Cc: Peter Zijlstra Cc: kbuild@01.org Cc: Steven Rostedt Cc: Mike Galbraith Link: http://lkml.kernel.org/r/20130205113751.GA20521@elgon.mountain Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 26058d0bebba..c5b089df7ea8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4371,7 +4371,7 @@ bool __sched yield_to(struct task_struct *p, bool preempt) struct task_struct *curr = current; struct rq *rq, *p_rq; unsigned long flags; - bool yielded = 0; + int yielded = 0; local_irq_save(flags); rq = this_rq(); -- cgit v1.2.3 From ca2eb5679f8ddffff60156af42595df44a315ef0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 5 Feb 2013 07:25:17 +0000 Subject: tcp: remove Appropriate Byte Count support TCP Appropriate Byte Count was added by me, but later disabled. There is no point in maintaining it since it is a potential source of bugs and Linux already implements other better window protection heuristics. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- kernel/sysctl_binary.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 5a6384450501..b669ca1fa103 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { CTL_INT, NET_TCP_ABC, "tcp_abc" }, { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, -- cgit v1.2.3 From 9f3b795a626ee79574595e06d1437fe0c7d51d29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 1 Feb 2013 20:40:17 +0100 Subject: driver-core: constify data for class_find_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All in-kernel users of class_find_device() don't really need mutable data for match callback. In two places (kernel/power/suspend_test.c, drivers/scsi/osd/osd_uld.c) this patch changes match callbacks to use const search data. The const is propagated to rtc_class_open() and power_supply_get_by_name() parameters. Note that there's a dev reference leak in suspend_test.c that's not touched in this patch. Signed-off-by: MichaĹ‚ MirosĹ‚aw Acked-by: Grant Likely Signed-off-by: Greg Kroah-Hartman --- kernel/power/suspend_test.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 25596e450ac7..9b2a1d58558d 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) rtc_set_alarm(rtc, &alm); } -static int __init has_wakealarm(struct device *dev, void *name_ptr) +static int __init has_wakealarm(struct device *dev, const void *data) { struct rtc_device *candidate = to_rtc_device(dev); @@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) if (!device_may_wakeup(candidate->dev.parent)) return 0; - *(const char **)name_ptr = dev_name(dev); return 1; } @@ -159,8 +158,8 @@ static int __init test_suspend(void) static char warn_no_rtc[] __initdata = KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; - char *pony = NULL; struct rtc_device *rtc = NULL; + struct device *dev; /* PM is initialized by now; is that state testable? */ if (test_state == PM_SUSPEND_ON) @@ -171,9 +170,9 @@ static int __init test_suspend(void) } /* RTCs have initialized by now too ... can we use one? */ - class_find_device(rtc_class, NULL, &pony, has_wakealarm); - if (pony) - rtc = rtc_class_open(pony); + dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); + if (dev) + rtc = rtc_class_open(dev_name(dev)); if (!rtc) { printk(warn_no_rtc); goto done; -- cgit v1.2.3 From 6be195886ac26abe0194ed1bc7a9224f8a97c310 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 6 Feb 2013 18:04:53 -0800 Subject: workqueue: replace WORK_CPU_NONE/LAST with WORK_CPU_END Now that workqueue has moved away from gcwqs, workqueue no longer has the need to have a CPU identifier indicating "no cpu associated" - we now use WORK_OFFQ_POOL_NONE instead - and most uses of WORK_CPU_NONE are gone. The only left usage is as the end marker for for_each_*wq*() iterators, where the name WORK_CPU_NONE is confusing w/o actual WORK_CPU_NONE usages. Similarly, WORK_CPU_LAST which equals WORK_CPU_NONE no longer makes sense. Replace both WORK_CPU_NONE and LAST with WORK_CPU_END. This patch doesn't introduce any functional difference. tj: s/WORK_CPU_LAST/WORK_CPU_END/ and rewrote the description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 577de1073f24..7e11334a119f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -258,7 +258,7 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, if (sw & 2) return WORK_CPU_UNBOUND; } - return WORK_CPU_NONE; + return WORK_CPU_END; } static inline int __next_cwq_cpu(int cpu, const struct cpumask *mask, @@ -282,17 +282,17 @@ static inline int __next_cwq_cpu(int cpu, const struct cpumask *mask, */ #define for_each_wq_cpu(cpu) \ for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ - (cpu) < WORK_CPU_NONE; \ + (cpu) < WORK_CPU_END; \ (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) #define for_each_online_wq_cpu(cpu) \ for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ - (cpu) < WORK_CPU_NONE; \ + (cpu) < WORK_CPU_END; \ (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) #define for_each_cwq_cpu(cpu, wq) \ for ((cpu) = __next_cwq_cpu(-1, cpu_possible_mask, (wq)); \ - (cpu) < WORK_CPU_NONE; \ + (cpu) < WORK_CPU_END; \ (cpu) = __next_cwq_cpu((cpu), cpu_possible_mask, (wq))) #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -3796,7 +3796,7 @@ static int __init init_workqueues(void) /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < - WORK_CPU_LAST * NR_STD_WORKER_POOLS); + WORK_CPU_END * NR_STD_WORKER_POOLS); cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); -- cgit v1.2.3 From 038366c5cf23ae737b9f72169dd8ade2d105755b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 6 Feb 2013 18:04:53 -0800 Subject: workqueue: make work_busy() test WORK_STRUCT_PENDING first Currently, work_busy() first tests whether the work has a pool associated with it and if not, considers it idle. This works fine even for delayed_work.work queued on timer, as __queue_delayed_work() sets cwq on delayed_work.work - a queued delayed_work always has its cwq and thus pool associated with it. However, we're about to update delayed_work queueing and this won't hold. Update work_busy() such that it tests WORK_STRUCT_PENDING before the associated pool. This doesn't make any noticeable behavior difference now. With work_pending() test moved, the function read a lot better with "if (!pool)" test flipped to positive. Flip it. While at it, lose the comment about now non-existent reentrant workqueues. tj: Reorganized the function and rewrote the description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7e11334a119f..a229a56f3a32 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3443,8 +3443,6 @@ EXPORT_SYMBOL_GPL(workqueue_congested); * Test whether @work is currently pending or running. There is no * synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. - * Especially for reentrant wqs, the pending state might hide the - * running state. * * RETURNS: * OR'd bitmask of WORK_BUSY_* bits. @@ -3455,17 +3453,15 @@ unsigned int work_busy(struct work_struct *work) unsigned long flags; unsigned int ret = 0; - if (!pool) - return 0; - - spin_lock_irqsave(&pool->lock, flags); - if (work_pending(work)) ret |= WORK_BUSY_PENDING; - if (find_worker_executing_work(pool, work)) - ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&pool->lock, flags); + if (pool) { + spin_lock_irqsave(&pool->lock, flags); + if (find_worker_executing_work(pool, work)) + ret |= WORK_BUSY_RUNNING; + spin_unlock_irqrestore(&pool->lock, flags); + } return ret; } -- cgit v1.2.3 From 60c057bca22285efefbba033624763a778f243bf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 6 Feb 2013 18:04:53 -0800 Subject: workqueue: add delayed_work->wq to simplify reentrancy handling To avoid executing the same work item from multiple CPUs concurrently, a work_struct records the last pool it was on in its ->data so that, on the next queueing, the pool can be queried to determine whether the work item is still executing or not. A delayed_work goes through timer before actually being queued on the target workqueue and the timer needs to know the target workqueue and CPU. This is currently achieved by modifying delayed_work->work.data such that it points to the cwq which points to the target workqueue and the last CPU the work item was on. __queue_delayed_work() extracts the last CPU from delayed_work->work.data and then combines it with the target workqueue to create new work.data. The only thing this rather ugly hack achieves is encoding the target workqueue into delayed_work->work.data without using a separate field, which could be a trade off one can make; unfortunately, this entangles work->data management between regular workqueue and delayed_work code by setting cwq pointer before the work item is actually queued and becomes a hindrance for further improvements of work->data handling. This can be easily made sane by adding a target workqueue field to delayed_work. While delayed_work is used widely in the kernel and this does make it a bit larger (<5%), I think this is the right trade-off especially given the prospect of much saner handling of work->data which currently involves quite tricky memory barrier dancing, and don't expect to see any measureable effect. Add delayed_work->wq and drop the delayed_work->work.data overloading. tj: Rewrote the description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 32 +++----------------------------- 1 file changed, 3 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a229a56f3a32..41a502ce3802 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1339,10 +1339,9 @@ EXPORT_SYMBOL_GPL(queue_work); void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); /* should have been called from irqsafe timer with irq already off */ - __queue_work(dwork->cpu, cwq->wq, &dwork->work); + __queue_work(dwork->cpu, dwork->wq, &dwork->work); } EXPORT_SYMBOL_GPL(delayed_work_timer_fn); @@ -1351,7 +1350,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, { struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - unsigned int lcpu; WARN_ON_ONCE(timer->function != delayed_work_timer_fn || timer->data != (unsigned long)dwork); @@ -1371,30 +1369,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); - /* - * This stores cwq for the moment, for the timer_fn. Note that the - * work's pool is preserved to allow reentrance detection for - * delayed works. - */ - if (!(wq->flags & WQ_UNBOUND)) { - struct worker_pool *pool = get_work_pool(work); - - /* - * If we cannot get the last pool from @work directly, - * select the last CPU such that it avoids unnecessarily - * triggering non-reentrancy check in __queue_work(). - */ - lcpu = cpu; - if (pool) - lcpu = pool->cpu; - if (lcpu == WORK_CPU_UNBOUND) - lcpu = raw_smp_processor_id(); - } else { - lcpu = WORK_CPU_UNBOUND; - } - - set_work_cwq(work, get_cwq(lcpu, wq), 0); - + dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; @@ -2944,8 +2919,7 @@ bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); if (del_timer_sync(&dwork->timer)) - __queue_work(dwork->cpu, - get_work_cwq(&dwork->work)->wq, &dwork->work); + __queue_work(dwork->cpu, dwork->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); } -- cgit v1.2.3 From 4468a00fd9a274fe1b30c886370d662e4a439efb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 6 Feb 2013 18:04:53 -0800 Subject: workqueue: make work->data point to pool after try_to_grab_pending() We plan to use work->data pointing to cwq as the synchronization invariant when determining whether a given work item is on a locked pool or not, which requires work->data pointing to cwq only while the work item is queued on the associated pool. With delayed_work updated not to overload work->data for target workqueue recording, the only case where we still have off-queue work->data pointing to cwq is try_to_grab_pending() which doesn't update work->data after stealing a queued work item. There's no reason for try_to_grab_pending() to not update work->data to point to the pool instead of cwq, like the normal execution does. This patch adds set_work_pool_and_keep_pending() which makes work->data point to pool instead of cwq but keeps the pending bit unlike set_work_pool_and_clear_pending() (surprise!). After this patch, it's guaranteed that only queued work items point to cwqs. This patch doesn't introduce any visible behavior change. tj: Renamed the new helper function to match set_work_pool_and_clear_pending() and rewrote the description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41a502ce3802..1a442c301ddb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -556,6 +556,13 @@ static void set_work_cwq(struct work_struct *work, WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); } +static void set_work_pool_and_keep_pending(struct work_struct *work, + int pool_id) +{ + set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, + WORK_STRUCT_PENDING); +} + static void set_work_pool_and_clear_pending(struct work_struct *work, int pool_id) { @@ -1115,6 +1122,9 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, cwq_dec_nr_in_flight(get_work_cwq(work), get_work_color(work)); + /* work->data points to cwq iff queued, point to pool */ + set_work_pool_and_keep_pending(work, pool->id); + spin_unlock(&pool->lock); return 1; } -- cgit v1.2.3 From 0b3dae68ac199fac224fea9a31907b44f0d257b3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 6 Feb 2013 18:04:53 -0800 Subject: workqueue: simplify is-work-item-queued-here test Currently, determining whether a work item is queued on a locked pool involves somewhat convoluted memory barrier dancing. It goes like the following. * When a work item is queued on a pool, work->data is updated before work->entry is linked to the pending list with a wmb() inbetween. * When trying to determine whether a work item is currently queued on a pool pointed to by work->data, it locks the pool and looks at work->entry. If work->entry is linked, we then do rmb() and then check whether work->data points to the current pool. This works because, work->data can only point to a pool if it currently is or were on the pool and, * If it currently is on the pool, the tests would obviously succeed. * It it left the pool, its work->entry was cleared under pool->lock, so if we're seeing non-empty work->entry, it has to be from the work item being linked on another pool. Because work->data is updated before work->entry is linked with wmb() inbetween, work->data update from another pool is guaranteed to be visible if we do rmb() after seeing non-empty work->entry. So, we either see empty work->entry or we see updated work->data pointin to another pool. While this works, it's convoluted, to put it mildly. With recent updates, it's now guaranteed that work->data points to cwq only while the work item is queued and that updating work->data to point to cwq or back to pool is done under pool->lock, so we can simply test whether work->data points to cwq which is associated with the currently locked pool instead of the convoluted memory barrier dancing. This patch replaces the memory barrier based "are you still here, really?" test with much simpler "does work->data points to me?" test - if work->data points to a cwq which is associated with the currently locked pool, the work item is guaranteed to be queued on the pool as work->data can start and stop pointing to such cwq only under pool->lock and the start and stop coincide with queue and dequeue. tj: Rewrote the comments and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1a442c301ddb..251f00914295 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1068,6 +1068,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, unsigned long *flags) { struct worker_pool *pool; + struct cpu_workqueue_struct *cwq; local_irq_save(*flags); @@ -1097,14 +1098,17 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, goto fail; spin_lock(&pool->lock); - if (!list_empty(&work->entry)) { - /* - * This work is queued, but perhaps we locked the wrong - * pool. In that case we must see the new value after - * rmb(), see insert_work()->wmb(). - */ - smp_rmb(); - if (pool == get_work_pool(work)) { + /* + * work->data is guaranteed to point to cwq only while the work + * item is queued on cwq->wq, and both updating work->data to point + * to cwq on queueing and to pool on dequeueing are done under + * cwq->pool->lock. This in turn guarantees that, if work->data + * points to cwq which is associated with a locked pool, the work + * item is currently queued on that pool. + */ + cwq = get_work_cwq(work); + if (cwq) { + if (cwq->pool == pool) { debug_work_deactivate(work); /* @@ -1159,13 +1163,6 @@ static void insert_work(struct cpu_workqueue_struct *cwq, /* we own @work, set data and link */ set_work_cwq(work, cwq, extra_flags); - - /* - * Ensure that we get the right work->data if we see the - * result of list_add() below, see try_to_grab_pending(). - */ - smp_wmb(); - list_add_tail(&work->entry, head); /* @@ -2799,15 +2796,10 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) return false; spin_lock_irq(&pool->lock); - if (!list_empty(&work->entry)) { - /* - * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued to a different pool under us, we - * are not going to wait. - */ - smp_rmb(); - cwq = get_work_cwq(work); - if (unlikely(!cwq || pool != cwq->pool)) + /* see the comment in try_to_grab_pending() with the same code */ + cwq = get_work_cwq(work); + if (cwq) { + if (unlikely(cwq->pool != pool)) goto already_gone; } else { worker = find_worker_executing_work(pool, work); -- cgit v1.2.3 From 1606283622689bdc460052b4a1281c36de13fe49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 6 Feb 2013 18:04:53 -0800 Subject: workqueue: cosmetic update in try_to_grab_pending() With the recent is-work-queued-here test simplification, the nested if() in try_to_grab_pending() can be collapsed. Collapse it. This patch is purely cosmetic. Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 251f00914295..e2dd61861fbd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1107,31 +1107,27 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, * item is currently queued on that pool. */ cwq = get_work_cwq(work); - if (cwq) { - if (cwq->pool == pool) { - debug_work_deactivate(work); + if (cwq && cwq->pool == pool) { + debug_work_deactivate(work); - /* - * A delayed work item cannot be grabbed directly - * because it might have linked NO_COLOR work items - * which, if left on the delayed_list, will confuse - * cwq->nr_active management later on and cause - * stall. Make sure the work item is activated - * before grabbing. - */ - if (*work_data_bits(work) & WORK_STRUCT_DELAYED) - cwq_activate_delayed_work(work); + /* + * A delayed work item cannot be grabbed directly because + * it might have linked NO_COLOR work items which, if left + * on the delayed_list, will confuse cwq->nr_active + * management later on and cause stall. Make sure the work + * item is activated before grabbing. + */ + if (*work_data_bits(work) & WORK_STRUCT_DELAYED) + cwq_activate_delayed_work(work); - list_del_init(&work->entry); - cwq_dec_nr_in_flight(get_work_cwq(work), - get_work_color(work)); + list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), get_work_color(work)); - /* work->data points to cwq iff queued, point to pool */ - set_work_pool_and_keep_pending(work, pool->id); + /* work->data points to cwq iff queued, point to pool */ + set_work_pool_and_keep_pending(work, pool->id); - spin_unlock(&pool->lock); - return 1; - } + spin_unlock(&pool->lock); + return 1; } spin_unlock(&pool->lock); fail: -- cgit v1.2.3 From cf4aebc292fac7f34f8345664320e9d4a42ca76c Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Thu, 7 Feb 2013 09:46:59 -0600 Subject: sched: Move sched.h sysctl bits into separate header Move the sysctl-related bits from include/linux/sched.h into a new file: include/linux/sched/sysctl.h. Then update source files requiring access to those bits by including the new header file. Signed-off-by: Clark Williams Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20130207094659.06dced96@riff.lan Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 1 + kernel/sched/sched.h | 1 + kernel/sysctl.c | 1 + kernel/timer.c | 1 + 4 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6db7a5ed52b5..8a9aa59d0d61 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fc886441436a..ed8de30a040e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,5 +1,6 @@ #include +#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491e..7357e23aaf68 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include diff --git a/kernel/timer.c b/kernel/timer.c index 367d00858482..3e13baf3f0ea 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From ce0dbbbb30aee6a835511d5be446462388ba9eee Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Thu, 7 Feb 2013 09:47:04 -0600 Subject: sched/rt: Add a tuning knob to allow changing SCHED_RR timeslice Add a /proc/sys/kernel scheduler knob named sched_rr_timeslice_ms that allows global changing of the SCHED_RR timeslice value. User visable value is in milliseconds but is stored as jiffies. Setting to 0 (zero) resets to the default (currently 100ms). Signed-off-by: Clark Williams Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20130207094704.13751796@riff.lan Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++++++++ kernel/sched/rt.c | 6 ++++-- kernel/sysctl.c | 7 +++++++ 3 files changed, 30 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1dff78a9e2ab..4a88f1d51563 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7509,6 +7509,25 @@ static int sched_rt_global_constraints(void) } #endif /* CONFIG_RT_GROUP_SCHED */ +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* make sure that internally we keep jiffies */ + /* also, writing zero resets timeslice to default */ + if (!ret && write) { + sched_rr_timeslice = sched_rr_timeslice <= 0 ? + RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c25de141c576..fb0f77e402b6 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -7,6 +7,8 @@ #include +int sched_rr_timeslice = RR_TIMESLICE; + static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; @@ -2016,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) if (--p->rt.time_slice) return; - p->rt.time_slice = RR_TIMESLICE; + p->rt.time_slice = sched_rr_timeslice; /* * Requeue to the end of queue if we (and all of our ancestors) are the @@ -2047,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) * Time slice is 0 for SCHED_FIFO tasks */ if (task->policy == SCHED_RR) - return RR_TIMESLICE; + return sched_rr_timeslice; else return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7357e23aaf68..4fc9be955c71 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -404,6 +404,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", -- cgit v1.2.3 From 8bd75c77b7c6a3954140dd2e20346aef3efe4a35 Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Thu, 7 Feb 2013 09:47:07 -0600 Subject: sched/rt: Move rt specific bits into new header file Move rt scheduler definitions out of include/linux/sched.h into new file include/linux/sched/rt.h Signed-off-by: Clark Williams Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20130207094707.7b9f825f@riff.lan Signed-off-by: Ingo Molnar --- kernel/futex.c | 1 + kernel/hrtimer.c | 1 + kernel/irq/manage.c | 1 + kernel/mutex.c | 1 + kernel/rtmutex-debug.c | 1 + kernel/rtmutex-tester.c | 1 + kernel/rtmutex.c | 1 + kernel/sched/cpupri.c | 2 ++ kernel/sched/sched.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace_sched_wakeup.c | 2 +- kernel/watchdog.c | 1 + 12 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 19eb089ca003..9618b6e9fb36 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -60,6 +60,7 @@ #include #include #include +#include #include diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8a9aa59d0d61..c5dde988c0ce 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e49a288fa479..02115d9592ec 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "internals.h" diff --git a/kernel/mutex.c b/kernel/mutex.c index a307cc9c9526..52f23011b6e0 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -19,6 +19,7 @@ */ #include #include +#include #include #include #include diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 16502d3a71c8..13b243a323fa 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -17,6 +17,7 @@ * See rt.c in preempt-rt for proper credits and further information */ #include +#include #include #include #include diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 98ec49475460..7890b10084a7 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a242e691c993..1e09308bf2a1 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "rtmutex_common.h" diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 23aa789c53ee..1095e878a46f 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -28,6 +28,8 @@ */ #include +#include +#include #include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ed8de30a040e..cc03cfdf469f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,6 +1,7 @@ #include #include +#include #include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c13e46d7d24..4d2e4afd956e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9fe45fcefca0..75aa97fbe1a1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,8 +15,8 @@ #include #include #include +#include #include - #include "trace.h" static struct trace_array *wakeup_trace; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 75a2ab3d0b02..27689422aa92 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From e19e397a85f33100bfa4210e256bec82fe22e167 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Jan 2013 11:39:44 -0800 Subject: workqueue: move nr_running into worker_pool As nr_running is likely to be accessed from other CPUs during try_to_wake_up(), it was kept outside worker_pool; however, while less frequent, other fields in worker_pool are accessed from other CPUs for, e.g., non-reentrancy check. Also, with recent pool related changes, accessing nr_running matching the worker_pool isn't as simple as it used to be. Move nr_running inside worker_pool. Keep it aligned to cacheline and define CPU pools using DEFINE_PER_CPU_SHARED_ALIGNED(). This should give at least the same cacheline behavior. get_pool_nr_running() is replaced with direct pool->nr_running accesses. Signed-off-by: Tejun Heo Cc: Joonsoo Kim --- kernel/workqueue.c | 63 +++++++++++++++++++----------------------------------- 1 file changed, 22 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e2dd61861fbd..91ce7a984c22 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -144,6 +144,13 @@ struct worker_pool { struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ + + /* + * The current concurrency level. As it's likely to be accessed + * from other CPUs during try_to_wake_up(), put it in a separate + * cacheline. + */ + atomic_t nr_running ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp; /* @@ -417,23 +424,12 @@ static LIST_HEAD(workqueues); static bool workqueue_freezing; /* W: have wqs started freezing? */ /* - * The CPU standard worker pools. nr_running is the only field which is - * expected to be used frequently by other cpus via try_to_wake_up(). Put - * it in a separate cacheline. - */ -static DEFINE_PER_CPU(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_std_worker_pools); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t [NR_STD_WORKER_POOLS], - cpu_std_pool_nr_running); - -/* - * Standard worker pools and nr_running counter for unbound CPU. The pools - * have POOL_DISASSOCIATED set, and all workers have WORKER_UNBOUND set. + * The CPU and unbound standard worker pools. The unbound ones have + * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + cpu_std_worker_pools); static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; -static atomic_t unbound_std_pool_nr_running[NR_STD_WORKER_POOLS] = { - [0 ... NR_STD_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ -}; /* idr of all pools */ static DEFINE_MUTEX(worker_pool_idr_mutex); @@ -483,17 +479,6 @@ static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) return &pools[highpri]; } -static atomic_t *get_pool_nr_running(struct worker_pool *pool) -{ - int cpu = pool->cpu; - int idx = std_worker_pool_pri(pool); - - if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(cpu_std_pool_nr_running, cpu)[idx]; - else - return &unbound_std_pool_nr_running[idx]; -} - static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, struct workqueue_struct *wq) { @@ -654,7 +639,7 @@ static bool work_is_canceling(struct work_struct *work) static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(get_pool_nr_running(pool)); + return !atomic_read(&pool->nr_running); } /* @@ -679,9 +664,8 @@ static bool may_start_working(struct worker_pool *pool) /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { - atomic_t *nr_running = get_pool_nr_running(pool); - - return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; + return !list_empty(&pool->worklist) && + atomic_read(&pool->nr_running) <= 1; } /* Do we need a new worker? Called from manager. */ @@ -761,7 +745,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) if (!(worker->flags & WORKER_NOT_RUNNING)) { WARN_ON_ONCE(worker->pool->cpu != cpu); - atomic_inc(get_pool_nr_running(worker->pool)); + atomic_inc(&worker->pool->nr_running); } } @@ -785,7 +769,6 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; - atomic_t *nr_running; /* * Rescuers, which may not have all the fields set up like normal @@ -796,7 +779,6 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, return NULL; pool = worker->pool; - nr_running = get_pool_nr_running(pool); /* this can only happen on the local cpu */ BUG_ON(cpu != raw_smp_processor_id()); @@ -812,7 +794,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * manipulating idle_list, so dereferencing idle_list without pool * lock is safe. */ - if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) + if (atomic_dec_and_test(&pool->nr_running) && + !list_empty(&pool->worklist)) to_wakeup = first_worker(pool); return to_wakeup ? to_wakeup->task : NULL; } @@ -844,14 +827,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_t *nr_running = get_pool_nr_running(pool); - if (wakeup) { - if (atomic_dec_and_test(nr_running) && + if (atomic_dec_and_test(&pool->nr_running) && !list_empty(&pool->worklist)) wake_up_worker(pool); } else - atomic_dec(nr_running); + atomic_dec(&pool->nr_running); } worker->flags |= flags; @@ -883,7 +864,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_pool_nr_running(pool)); + atomic_inc(&pool->nr_running); } /** @@ -1518,7 +1499,7 @@ static void worker_enter_idle(struct worker *worker) */ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && pool->nr_workers == pool->nr_idle && - atomic_read(get_pool_nr_running(pool))); + atomic_read(&pool->nr_running)); } /** @@ -3506,7 +3487,7 @@ static void wq_unbind_fn(struct work_struct *work) * didn't already. */ for_each_std_worker_pool(pool, cpu) - atomic_set(get_pool_nr_running(pool), 0); + atomic_set(&pool->nr_running, 0); } /* -- cgit v1.2.3 From 54d5b7d079dffa74597715a892473b474babd5b5 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 7 Feb 2013 13:14:20 -0800 Subject: workqueue: make get_work_pool_id() cheaper get_work_pool_id() currently first obtains pool using get_work_pool() and then return pool->id. For an off-queue work item, this involves obtaining pool ID from worker->data, performing idr_find() to find the matching pool and then returning its pool->id which of course is the same as the one which went into idr_find(). Just open code WORK_STRUCT_CWQ case and directly return pool ID from work->data. tj: The original patch dropped on-queue work item handling and renamed the function to offq_work_pool_id(). There isn't much benefit in doing so. Handling it only requires a single if() and we need at least BUG_ON(), which is also a branch, even if we drop on-queue handling. Open code WORK_STRUCT_CWQ case and keep the function in line with get_work_pool(). Rewrote the description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 91ce7a984c22..1801c37b28c4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -611,9 +611,13 @@ static struct worker_pool *get_work_pool(struct work_struct *work) */ static int get_work_pool_id(struct work_struct *work) { - struct worker_pool *pool = get_work_pool(work); + unsigned long data = atomic_long_read(&work->data); + + if (data & WORK_STRUCT_CWQ) + return ((struct cpu_workqueue_struct *) + (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; - return pool ? pool->id : WORK_OFFQ_POOL_NONE; + return data >> WORK_OFFQ_POOL_SHIFT; } static void mark_work_canceling(struct work_struct *work) -- cgit v1.2.3 From 8594fade39d3ad02ef856b8c53b5d7cc538a55f5 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 7 Feb 2013 13:14:20 -0800 Subject: workqueue: pick cwq instead of pool in __queue_work() Currently, __queue_work() chooses the pool to queue a work item to and then determines cwq from the target wq and the chosen pool. This is a bit backwards in that we can determine cwq first and simply use cwq->pool. This way, we can skip get_std_worker_pool() in queueing path which will be a hurdle when implementing custom worker pools. Update __queue_work() such that it chooses the target cwq and then use cwq->pool instead of the other way around. While at it, add missing {} in an if statement. This patch doesn't introduce any functional changes. tj: The original patch had two get_cwq() calls - the first to determine the pool by doing get_cwq(cpu, wq)->pool and the second to determine the matching cwq from get_cwq(pool->cpu, wq). Updated the function such that it chooses cwq instead of pool and removed the second call. Rewrote the description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1801c37b28c4..d6fdce12ca7e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1193,8 +1193,6 @@ static bool is_chained_work(struct workqueue_struct *wq) static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { - bool highpri = wq->flags & WQ_HIGHPRI; - struct worker_pool *pool; struct cpu_workqueue_struct *cwq; struct list_head *worklist; unsigned int work_flags; @@ -1215,7 +1213,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, WARN_ON_ONCE(!is_chained_work(wq))) return; - /* determine pool to use */ + /* determine the cwq to use */ if (!(wq->flags & WQ_UNBOUND)) { struct worker_pool *last_pool; @@ -1228,37 +1226,36 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, * work needs to be queued on that cpu to guarantee * non-reentrancy. */ - pool = get_std_worker_pool(cpu, highpri); + cwq = get_cwq(cpu, wq); last_pool = get_work_pool(work); - if (last_pool && last_pool != pool) { + if (last_pool && last_pool != cwq->pool) { struct worker *worker; spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); - if (worker && worker->current_cwq->wq == wq) - pool = last_pool; - else { + if (worker && worker->current_cwq->wq == wq) { + cwq = get_cwq(last_pool->cpu, wq); + } else { /* meh... not running there, queue here */ spin_unlock(&last_pool->lock); - spin_lock(&pool->lock); + spin_lock(&cwq->pool->lock); } } else { - spin_lock(&pool->lock); + spin_lock(&cwq->pool->lock); } } else { - pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri); - spin_lock(&pool->lock); + cwq = get_cwq(WORK_CPU_UNBOUND, wq); + spin_lock(&cwq->pool->lock); } - /* pool determined, get cwq and queue */ - cwq = get_cwq(pool->cpu, wq); + /* cwq determined, queue */ trace_workqueue_queue_work(req_cpu, cwq, work); if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&pool->lock); + spin_unlock(&cwq->pool->lock); return; } @@ -1276,7 +1273,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, insert_work(cwq, work, worklist, work_flags); - spin_unlock(&pool->lock); + spin_unlock(&cwq->pool->lock); } /** -- cgit v1.2.3 From 5a41344a3d83ef2c08e40bfce1efa5795def9b82 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 29 Nov 2012 16:46:02 +0800 Subject: srcu: Simplify __srcu_read_unlock() via this_cpu_dec() This commit replaces disabling of preemption and decrement of a per-CPU variable with this_cpu_dec(), which avoids preemption disabling on x86 and shortens the code on all platforms. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/srcu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index 2b859828cdc3..c9d003bc6c49 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -321,10 +321,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); */ void __srcu_read_unlock(struct srcu_struct *sp, int idx) { - preempt_disable(); smp_mb(); /* C */ /* Avoid leaking the critical section. */ - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; - preempt_enable(); + this_cpu_dec(sp->per_cpu_ref->c[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); -- cgit v1.2.3 From 6e6f1b307e23201fb3e7aaf16322e80355d2a3d5 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 29 Nov 2012 16:46:03 +0800 Subject: srcu: Add might_sleep() annotation to synchronize_srcu() Although synchronize_srcu() can sleep, it will not sleep if the fast path succeeds, which means that illegal use of synchronize_rcu() might go unnoticed. This commit therefore adds might_sleep(), which unconditionally catches illegal use of synchronize_rcu() from atomic context. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/srcu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index c9d003bc6c49..3e43a214b4dc 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -421,6 +421,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) !lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); + might_sleep(); init_completion(&rcu.completion); head->next = NULL; -- cgit v1.2.3 From ab4d2986e44c589aa1b647d7da5e21c2707babea Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 29 Nov 2012 16:46:04 +0800 Subject: srcu: Simple cleanup for cleanup_srcu_struct() Pack six lines of code into two lines. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/srcu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index 3e43a214b4dc..7cf5baba96f9 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -282,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp) */ void cleanup_srcu_struct(struct srcu_struct *sp) { - int sum; - - sum = srcu_readers_active(sp); - WARN_ON(sum); /* Leakage unless caller handles error. */ - if (sum != 0) - return; + if (WARN_ON(srcu_readers_active(sp))) + return; /* Leakage unless caller handles error. */ free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; } -- cgit v1.2.3 From 34a64b6bb64b5cf193932e2b4394c5284732e008 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 29 Nov 2012 16:46:07 +0800 Subject: srcu: Update synchronize_srcu()'s comments The core of SRCU is changed, but synchronize_srcu()'s comments describe the old algorithm. This commit therefore updates them to match the new algorithm. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/srcu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index 7cf5baba96f9..f098f1768215 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -450,10 +450,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) * synchronize_srcu - wait for prior SRCU read-side critical-section completion * @sp: srcu_struct with which to synchronize. * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. + * Wait for the count to drain to zero of both indexes. To avoid the + * possible starvation of synchronize_srcu(), it waits for the count of + * the index=((->completed & 1) ^ 1) to drain to zero at first, + * and then flip the completed and wait for the count of the other index. + * + * Can block; must be called from process context. * * Note that it is illegal to call synchronize_srcu() from the corresponding * SRCU read-side critical section; doing so will result in deadlock. -- cgit v1.2.3 From 49271ca60645d64197b28c0835fed39f74b1a2d7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 29 Nov 2012 16:46:08 +0800 Subject: srcu: Update synchronize_srcu_expedited()'s comments Because synchronize_srcu_expedited() no longer uses synchronize_rcu_sched_expedited(), synchronize_srcu_expedited() no longer indirectly acquires any CPU-hotplug-related locks. This commit therefore updates the comments accordingly. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/srcu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index f098f1768215..e34f2991ed41 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -477,12 +477,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); * Wait for an SRCU grace period to elapse, but be more aggressive about * spinning rather than blocking when waiting. * - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. It is also illegal to call - * synchronize_srcu_expedited() from the corresponding SRCU read-side - * critical section; doing so will result in deadlock. However, it is - * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct - * from some other srcu_struct's read-side critical section, as long as + * Note that it is also illegal to call synchronize_srcu_expedited() + * from the corresponding SRCU read-side critical section; + * doing so will result in deadlock. However, it is perfectly legal + * to call synchronize_srcu_expedited() on one srcu_struct from some + * other srcu_struct's read-side critical section, as long as * the resulting graph of srcu_structs is acyclic. */ void synchronize_srcu_expedited(struct srcu_struct *sp) -- cgit v1.2.3 From 7a6b55e7108b3476d13ee9501ec69dbe1605d774 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 29 Nov 2012 16:46:09 +0800 Subject: srcu: use ACCESS_ONCE() to access sp->completed in srcu_read_lock() The old SRCU implementation loads sp->completed within an RCU-sched section, courtesy of preempt_disable(). This was required due to the use of synchronize_sched() in the old implemenation's synchronize_srcu(). However, the new implementation does not rely on synchronize_sched(), so it in turn does not require the load of sp->completed and the ->c[] counter to be in a single preempt-disabled region of code. This commit therefore moves the sp->completed access outside of the preempt-disabled region and applies ACCESS_ONCE(). The resulting code is almost as the same as before, but it removes the now-misleading rcu_dereference_index_check() call. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/srcu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index e34f2991ed41..01d5ccb8bfe3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -298,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp) { int idx; + idx = ACCESS_ONCE(sp->completed) & 0x1; preempt_disable(); - idx = rcu_dereference_index_check(sp->completed, - rcu_read_lock_sched_held()) & 0x1; ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; smp_mb(); /* B */ /* Avoid leaking the critical section. */ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; -- cgit v1.2.3 From 63a3f603413ffe82ad775f2d62a5afff87fd94a0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 Feb 2013 17:14:08 -0800 Subject: timeconst.pl: Eliminate Perl warning defined(@array) is deprecated in Perl and gives off a warning. Restructure the code to remove that warning. [ hpa: it would be interesting to revert to the timeconst.bc script. It appears that the failures reported by akpm during testing of that script was due to a known broken version of make, not a problem with bc. The Makefile rules could probably be restructured to avoid the make bug, or it is probably old enough that it doesn't matter. ] Reported-by: Andi Kleen Signed-off-by: H. Peter Anvin Cc: Andrew Morton Cc: --- kernel/timeconst.pl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl index eb51d76e058a..3f42652a6a37 100644 --- a/kernel/timeconst.pl +++ b/kernel/timeconst.pl @@ -369,10 +369,8 @@ if ($hz eq '--can') { die "Usage: $0 HZ\n"; } - @val = @{$canned_values{$hz}}; - if (!defined(@val)) { - @val = compute_values($hz); - } + $cv = $canned_values{$hz}; + @val = defined($cv) ? @$cv : compute_values($hz); output($hz, @val); } exit 0; -- cgit v1.2.3 From 5845b81bdad374f98f809a658ec747d92c9595c4 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 8 Feb 2013 12:07:01 +1000 Subject: Revert "Revert "console: implement lockdep support for console_lock"" This reverts commit ff0d05bf73620eb7dc8aee7423e992ef87870bdf. Now that we have all the locking fixes in place, we can revert the revert. This re-enables lockdep tracking for the console lock, daee779718a319ff9f83e1ba3339334ac650bb22. Signed-off-by: Dave Airlie --- kernel/printk.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 267ce780abe8..357f714ddd49 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -87,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map console_lock_dep_map = { + .name = "console_lock" +}; +#endif + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -1918,6 +1924,7 @@ void console_lock(void) return; console_locked = 1; console_may_schedule = 1; + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -1939,6 +1946,7 @@ int console_trylock(void) } console_locked = 1; console_may_schedule = 0; + mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2099,6 +2107,7 @@ skip: local_irq_restore(flags); } console_locked = 0; + mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) -- cgit v1.2.3 From bbc33d05930f870ea049eae5ed980f8b827d0813 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 16:55:38 +0100 Subject: uprobes: Move __set_bit(UPROBE_SKIP_SSTEP) into alloc_uprobe() Cosmetic. __set_bit(UPROBE_SKIP_SSTEP) is the part of initialization, it is not clear why it is set in insert_uprobe(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 30ea9a4f4ab4..afbab2cb2742 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -430,9 +430,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) u = __insert_uprobe(uprobe); spin_unlock(&uprobes_treelock); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - return u; } @@ -454,6 +451,8 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); mutex_init(&uprobe->copy_mutex); + /* For now assume that the instruction need not be single-stepped */ + __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); -- cgit v1.2.3 From f0744af7d0fde190674064c54e2ff60b34ac71fe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 18:01:43 +0100 Subject: uprobes: Kill the pointless inode/uc checks in register/unregister register/unregister verifies that inode/uc != NULL. For what? This really looks like "hide the potential problem", the caller should pass the valid data. register() also checks uc->next == NULL, probably to prevent the double-register but the caller can do other stupid/wrong things. If we do this check, then we should document that uc->next should be cleared before register() and add BUG_ON(). Also add the small comment about the i_size_read() check. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index afbab2cb2742..a39d8163b713 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -844,9 +844,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * struct uprobe *uprobe; int ret; - if (!inode || !uc || uc->next) - return -EINVAL; - + /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; @@ -883,9 +881,6 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume { struct uprobe *uprobe; - if (!inode || !uc) - return; - uprobe = find_uprobe(inode, offset); if (!uprobe) return; -- cgit v1.2.3 From fe20d71f25400cccc8bffef865f79250be7dbc81 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 17:32:30 +0100 Subject: uprobes: Kill uprobe_consumer->filter() uprobe_consumer->filter() is pointless in its current form, kill it. We will add it back, but with the different signature/semantics. Perhaps we will even re-introduce the callsite in handler_chain(), but not to just skip uc->handler(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 6 ++---- kernel/trace/trace_uprobe.c | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a39d8163b713..5cbebac27c01 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -477,10 +477,8 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) return; down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - if (!uc->filter || uc->filter(uc, current)) - uc->handler(uc, regs); - } + for (uc = uprobe->consumers; uc; uc = uc->next) + uc->handler(uc, regs); up_read(&uprobe->consumer_rwsem); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 87b6db4ccbc5..e668024773d4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -550,7 +550,6 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) return -EINTR; utc->cons.handler = uprobe_dispatcher; - utc->cons.filter = NULL; ret = uprobe_register(tu->inode, tu->offset, &utc->cons); if (ret) { kfree(utc); -- cgit v1.2.3 From 63633cbf82840d972248f11d2122b261d0d4779a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 22 Nov 2012 18:30:15 +0100 Subject: uprobes: Introduce filter_chain() Add the new helper filter_chain(). Currently it is only placeholder, the comment explains what is should do. We will change it later to consult every consumer to decide whether we need to install the swbp. Until then it works as if any consumer returns true, this matches the current behavior. Change install_breakpoint() to call filter_chain() instead of checking uprobe->consumers != NULL. We obviously need this, and this equally closes the race with _unregister(). Change remove_breakpoint() to call this helper too. Currently this is pointless because remove_breakpoint() is only called when the last consumer goes away, but we will change this. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5cbebac27c01..c38bf37d0aca 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -614,6 +614,18 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } +static bool filter_chain(struct uprobe *uprobe) +{ + /* + * TODO: + * for_each_consumer(uc) + * if (uc->filter(...)) + * return true; + * return false; + */ + return uprobe->consumers != NULL; +} + static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) @@ -624,11 +636,10 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, /* * If probe is being deleted, unregister thread could be done with * the vma-rmap-walk through. Adding a probe now can be fatal since - * nobody will be able to cleanup. Also we could be from fork or - * mremap path, where the probe might have already been inserted. - * Hence behave as if probe already existed. + * nobody will be able to cleanup. But in this case filter_chain() + * must return false, all consumers have gone away. */ - if (!uprobe->consumers) + if (!filter_chain(uprobe)) return 0; ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); @@ -655,10 +666,12 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - /* can happen if uprobe_register() fails */ if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) return 0; + if (filter_chain(uprobe)) + return 0; + set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } @@ -1382,6 +1395,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. + * Or this uprobe can be filtered out. */ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; -- cgit v1.2.3 From 04aab9b2006bbdeff78dc162f206fdfebeca97d9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Nov 2012 19:43:50 +0100 Subject: uprobes: _unregister() should always do register_for_each_vma(false) uprobe_unregister() removes the breakpoints only if the last consumer goes away. To support the filtering it should do this every time, we want to remove the breakpoints which nobody else want to keep. Note: given that filter_chain() is not actually implemented, this patch itself doesn't change the behaviour yet, register_for_each_vma(false) is a heavy "nop" unless there are no more consumers. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c38bf37d0aca..940199084639 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -825,12 +825,20 @@ static int __uprobe_register(struct uprobe *uprobe) return register_for_each_vma(uprobe, true); } -static void __uprobe_unregister(struct uprobe *uprobe) +static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { - if (!register_for_each_vma(uprobe, false)) - delete_uprobe(uprobe); + int err; + + if (!consumer_del(uprobe, uc)) /* WARN? */ + return; - /* TODO : cant unregister? schedule a worker thread */ + err = register_for_each_vma(uprobe, false); + if (!uprobe->consumers) { + clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); + /* TODO : cant unregister? schedule a worker thread */ + if (!err) + delete_uprobe(uprobe); + } } /* @@ -868,8 +876,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * } else if (!consumer_add(uprobe, uc)) { ret = __uprobe_register(uprobe); if (ret) { - uprobe->consumers = NULL; - __uprobe_unregister(uprobe); + __uprobe_unregister(uprobe, uc); } else { set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); } @@ -897,14 +904,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume return; mutex_lock(uprobes_hash(inode)); - - if (consumer_del(uprobe, uc)) { - if (!uprobe->consumers) { - __uprobe_unregister(uprobe); - clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - } - } - + __uprobe_unregister(uprobe, uc); mutex_unlock(uprobes_hash(inode)); put_uprobe(uprobe); } -- cgit v1.2.3 From 9a98e03cc145c994da824dac7602334f50feb670 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Nov 2012 20:15:17 +0100 Subject: uprobes: _register() should always do register_for_each_vma(true) To support the filtering uprobe_register() should do register_for_each_vma(true) every time the new consumer comes, we need to install the previously nacked breakpoints. Note: - uprobes_mutex[] should die, what it actually protects is alloc_uprobe(). - UPROBE_RUN_HANDLER should die too, obviously it can't work unless uprobe has a single consumer. The consumer should serialize with _register/_unregister itself. Or this flag should live in uprobe_consumer->state. - Perhaps we can do some optimizations later. For example, if filter_chain() never returns false uprobe can record this fact and avoid the unnecessary register_for_each_vma(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 940199084639..d1d1394bca8b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -482,16 +482,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) up_read(&uprobe->consumer_rwsem); } -/* Returns the previous consumer */ -static struct uprobe_consumer * -consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); uc->next = uprobe->consumers; uprobe->consumers = uc; up_write(&uprobe->consumer_rwsem); - - return uc->next; } /* @@ -820,9 +816,15 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) return err; } -static int __uprobe_register(struct uprobe *uprobe) +static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { - return register_for_each_vma(uprobe, true); + int err; + + consumer_add(uprobe, uc); + err = register_for_each_vma(uprobe, true); + if (!err) /* TODO: pointless unless the first consumer */ + set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); + return err; } static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -867,21 +869,14 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (offset > i_size_read(inode)) return -EINVAL; - ret = 0; + ret = -ENOMEM; mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); - - if (!uprobe) { - ret = -ENOMEM; - } else if (!consumer_add(uprobe, uc)) { - ret = __uprobe_register(uprobe); - if (ret) { + if (uprobe) { + ret = __uprobe_register(uprobe, uc); + if (ret) __uprobe_unregister(uprobe, uc); - } else { - set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - } } - mutex_unlock(uprobes_hash(inode)); if (uprobe) put_uprobe(uprobe); -- cgit v1.2.3 From e591c8d78e49e6206935cf31c4d2b603bbb29166 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 17:29:40 +0100 Subject: uprobes: Introduce uprobe->register_rwsem Introduce uprobe->register_rwsem. It is taken for writing around __uprobe_register/unregister. Change handler_chain() to use this sem rather than consumer_rwsem. The main reason for this change is that we have the nasty problem with mmap_sem/consumer_rwsem dependency. filter_chain() needs to protect uprobe->consumers like handler_chain(), but they can not use the same lock. filter_chain() can be called under ->mmap_sem (currently this is always true), but we want to allow ->handler() to play with the probed task's memory, and this needs ->mmap_sem. Alternatively we could use srcu, but synchronize_srcu() is very slow and ->register_rwsem allows us to do more. In particular, we can teach handler_chain() to do remove_breakpoint() if this bp is "nacked" by all consumers, we know that we can't race with the new consumer which does uprobe_register(). See also the next patches. uprobes_mutex[] is almost ready to die. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1d1394bca8b..61d0fa6b5012 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -91,6 +91,7 @@ static atomic_t uprobe_events = ATOMIC_INIT(0); struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; + struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; @@ -449,6 +450,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; + init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); mutex_init(&uprobe->copy_mutex); /* For now assume that the instruction need not be single-stepped */ @@ -476,10 +478,10 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) return; - down_read(&uprobe->consumer_rwsem); + down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) uc->handler(uc, regs); - up_read(&uprobe->consumer_rwsem); + up_read(&uprobe->register_rwsem); } static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -873,9 +875,11 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); if (uprobe) { + down_write(&uprobe->register_rwsem); ret = __uprobe_register(uprobe, uc); if (ret) __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); } mutex_unlock(uprobes_hash(inode)); if (uprobe) @@ -899,7 +903,9 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume return; mutex_lock(uprobes_hash(inode)); + down_write(&uprobe->register_rwsem); __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); mutex_unlock(uprobes_hash(inode)); put_uprobe(uprobe); } -- cgit v1.2.3 From 1ff6fee5e62c57d5923b805bb4206acb7953f16e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 18:15:46 +0100 Subject: uprobes: Change filter_chain() to iterate ->consumers list Now that it safe to use ->consumer_rwsem under ->mmap_sem we can almost finish the implementation of filter_chain(). It still lacks the actual uc->filter(...) call but othewrwise it is ready, just it pretends that ->filter() always returns true. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 61d0fa6b5012..4d0452363686 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -614,14 +614,19 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, static bool filter_chain(struct uprobe *uprobe) { - /* - * TODO: - * for_each_consumer(uc) - * if (uc->filter(...)) - * return true; - * return false; - */ - return uprobe->consumers != NULL; + struct uprobe_consumer *uc; + bool ret = false; + + down_read(&uprobe->consumer_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + /* TODO: ret = uc->filter(...) */ + ret = true; + if (ret) + break; + } + up_read(&uprobe->consumer_rwsem); + + return ret; } static int -- cgit v1.2.3 From bb929284be40cbbdb347690742557d708fd504a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 18:27:08 +0100 Subject: uprobes: Kill UPROBE_RUN_HANDLER flag Simply remove UPROBE_RUN_HANDLER and the corresponding code. It can only help if uprobe has a single consumer, and in fact it is no longer needed after handler_chain() was changed to use ->register_rwsem, we simply can not race with uprobe_register(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4d0452363686..7ec2eb278634 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -83,10 +83,8 @@ static atomic_t uprobe_events = ATOMIC_INIT(0); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Dont run handlers when first register/ last unregister in progress*/ -#define UPROBE_RUN_HANDLER 1 /* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 2 +#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ @@ -475,9 +473,6 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) - return; - down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) uc->handler(uc, regs); @@ -825,13 +820,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { - int err; - consumer_add(uprobe, uc); - err = register_for_each_vma(uprobe, true); - if (!err) /* TODO: pointless unless the first consumer */ - set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - return err; + return register_for_each_vma(uprobe, true); } static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -842,12 +832,9 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u return; err = register_for_each_vma(uprobe, false); - if (!uprobe->consumers) { - clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - /* TODO : cant unregister? schedule a worker thread */ - if (!err) - delete_uprobe(uprobe); - } + /* TODO : cant unregister? schedule a worker thread */ + if (!uprobe->consumers && !err) + delete_uprobe(uprobe); } /* -- cgit v1.2.3 From d4d3ccc6d1eb74bd315d49a3829c5ad6c48d21b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 18:51:34 +0100 Subject: uprobes: Kill uprobe->copy_mutex Now that ->register_rwsem is safe under ->mmap_sem we can kill ->copy_mutex and abuse down_write(&uprobe->consumer_rwsem). This makes prepare_uprobe() even more ugly, but we should kill it anyway. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ec2eb278634..40ced98b2bc8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -91,7 +91,6 @@ struct uprobe { atomic_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; - struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ @@ -450,7 +449,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - mutex_init(&uprobe->copy_mutex); /* For now assume that the instruction need not be single-stepped */ __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); @@ -578,7 +576,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; - mutex_lock(&uprobe->copy_mutex); + /* TODO: move this into _register, until then we abuse this sem. */ + down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; @@ -602,7 +601,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: - mutex_unlock(&uprobe->copy_mutex); + up_write(&uprobe->consumer_rwsem); return ret; } -- cgit v1.2.3 From 441f1eb7db8babe2b6b4bc805f023739dbb70e33 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Nov 2012 19:54:29 +0100 Subject: uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead uprobe_events counts the number of uprobes in uprobes_tree but it is used as a boolean. We can use RB_EMPTY_ROOT() instead. Probably no_uprobe_events() added by this patch can have more callers, say, mmf_recalc_uprobes(). Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 40ced98b2bc8..5d38b40644b8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -41,6 +41,11 @@ #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE static struct rb_root uprobes_tree = RB_ROOT; +/* + * allows us to skip the uprobe_mmap if there are no uprobe events active + * at this time. Probably a fine grained per inode count is better? + */ +#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ @@ -74,13 +79,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; static struct percpu_rw_semaphore dup_mmap_sem; -/* - * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe - * events active at this time. Probably a fine grained per inode count is - * better? - */ -static atomic_t uprobe_events = ATOMIC_INIT(0); - /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 /* Can skip singlestep */ @@ -460,8 +458,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) kfree(uprobe); uprobe = cur_uprobe; iput(inode); - } else { - atomic_inc(&uprobe_events); } return uprobe; @@ -685,7 +681,6 @@ static void delete_uprobe(struct uprobe *uprobe) spin_unlock(&uprobes_treelock); iput(uprobe->inode); put_uprobe(uprobe); - atomic_dec(&uprobe_events); } struct map_info { @@ -975,7 +970,7 @@ int uprobe_mmap(struct vm_area_struct *vma) struct uprobe *uprobe, *u; struct inode *inode; - if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) + if (no_uprobe_events() || !valid_vma(vma, true)) return 0; inode = vma->vm_file->f_mapping->host; @@ -1021,7 +1016,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) + if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ -- cgit v1.2.3 From 06b7bcd8cbd7eb1af331e437ec3d8f5182ae1b7e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Nov 2012 22:01:42 +0100 Subject: uprobes: Introduce uprobe_is_active() The lifetime of uprobe->rb_node and uprobe->inode is not refcounted, delete_uprobe() is called when we detect that uprobe has no consumers, and it would be deadly wrong to do this twice. Change delete_uprobe() to WARN() if it was already called. We use RB_CLEAR_NODE() to mark uprobe "inactive", then RB_EMPTY_NODE() can be used to detect this case. RB_EMPTY_NODE() is not used directly, we add the trivial helper for the next change. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5d38b40644b8..358baddc8ac2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -669,6 +669,10 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad return set_orig_insn(&uprobe->arch, mm, vaddr); } +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ + return !RB_EMPTY_NODE(&uprobe->rb_node); +} /* * There could be threads that have already hit the breakpoint. They * will recheck the current insn and restart if find_uprobe() fails. @@ -676,9 +680,13 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad */ static void delete_uprobe(struct uprobe *uprobe) { + if (WARN_ON(!uprobe_is_active(uprobe))) + return; + spin_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock(&uprobes_treelock); + RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ iput(uprobe->inode); put_uprobe(uprobe); } -- cgit v1.2.3 From 66d06dffa5ef6f3544997440af63a91ef36a2171 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Nov 2012 22:48:37 +0100 Subject: uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register() uprobe_register() and uprobe_unregister() are the only users of mutex_lock(uprobes_hash(inode)), and the only reason why we can't simply remove it is that we need to ensure that delete_uprobe() is not possible after alloc_uprobe() and before consumer_add(). IOW, we need to ensure that when we take uprobe->register_rwsem this uprobe is still valid and we didn't race with _unregister() which called delete_uprobe() in between. With this patch uprobe_register() simply checks uprobe_is_active() and retries if it hits this very unlikely race. uprobes_mutex[] is no longer needed and can be removed. There is another reason for this change, prepare_uprobe() should be folded into alloc_uprobe() and we do not want to hold the extra locks around read_mapping_page/etc. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 51 +++++++++++++++---------------------------------- 1 file changed, 15 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 358baddc8ac2..c3b65d1c8443 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -50,29 +50,6 @@ static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 - -/* - * We need separate register/unregister and mmap/munmap lock hashes because - * of mmap_sem nesting. - * - * uprobe_register() needs to install probes on (potentially) all processes - * and thus needs to acquire multiple mmap_sems (consequtively, not - * concurrently), whereas uprobe_mmap() is called while holding mmap_sem - * for the particular process doing the mmap. - * - * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem - * because of lock order against i_mmap_mutex. This means there's a hole in - * the register vma iteration where a mmap() can happen. - * - * Thus uprobe_register() can race with uprobe_mmap() and we can try and - * install a probe where one is already installed. - */ - -/* serialize (un)register */ -static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; - -#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) - /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) @@ -865,20 +842,26 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (offset > i_size_read(inode)) return -EINVAL; - ret = -ENOMEM; - mutex_lock(uprobes_hash(inode)); + retry: uprobe = alloc_uprobe(inode, offset); - if (uprobe) { - down_write(&uprobe->register_rwsem); + if (!uprobe) + return -ENOMEM; + /* + * We can race with uprobe_unregister()->delete_uprobe(). + * Check uprobe_is_active() and retry if it is false. + */ + down_write(&uprobe->register_rwsem); + ret = -EAGAIN; + if (likely(uprobe_is_active(uprobe))) { ret = __uprobe_register(uprobe, uc); if (ret) __uprobe_unregister(uprobe, uc); - up_write(&uprobe->register_rwsem); } - mutex_unlock(uprobes_hash(inode)); - if (uprobe) - put_uprobe(uprobe); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); + if (unlikely(ret == -EAGAIN)) + goto retry; return ret; } @@ -896,11 +879,9 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume if (!uprobe) return; - mutex_lock(uprobes_hash(inode)); down_write(&uprobe->register_rwsem); __uprobe_unregister(uprobe, uc); up_write(&uprobe->register_rwsem); - mutex_unlock(uprobes_hash(inode)); put_uprobe(uprobe); } @@ -1609,10 +1590,8 @@ static int __init init_uprobes(void) { int i; - for (i = 0; i < UPROBES_HASH_SZ; i++) { - mutex_init(&uprobes_mutex[i]); + for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - } if (percpu_init_rwsem(&dup_mmap_sem)) return -ENOMEM; -- cgit v1.2.3 From 806a98bdf2a862fef0fc880399d677b35ba525ff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 27 Dec 2012 18:21:11 +0100 Subject: uprobes: Rationalize the usage of filter_chain() filter_chain() was added into install_breakpoint/remove_breakpoint to simplify the initial changes but this is sub-optimal. This patch shifts the callsite to the callers, register_for_each_vma() and uprobe_mmap(). This way: - It will be easier to add the new arguments. This is the main reason, we can do more optimizations later. - register_for_each_vma(is_register => true) can be optimized, we only need to consult the new consumer. The previous consumers were already asked when they called uprobe_register(). This patch also moves the MMF_HAS_UPROBES check from remove_breakpoint(), this allows to avoid the potentionally costly filter_chain(). Note that register_for_each_vma(is_register => false) doesn't really need to take ->consumer_rwsem, but I don't think it makes sense to optimize this and introduce filter_chain_lockless(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c3b65d1c8443..33912086d54e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -579,6 +579,11 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } +static inline bool consumer_filter(struct uprobe_consumer *uc) +{ + return true; /* TODO: !uc->filter || uc->filter(...) */ +} + static bool filter_chain(struct uprobe *uprobe) { struct uprobe_consumer *uc; @@ -586,8 +591,7 @@ static bool filter_chain(struct uprobe *uprobe) down_read(&uprobe->consumer_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - /* TODO: ret = uc->filter(...) */ - ret = true; + ret = consumer_filter(uc); if (ret) break; } @@ -603,15 +607,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, bool first_uprobe; int ret; - /* - * If probe is being deleted, unregister thread could be done with - * the vma-rmap-walk through. Adding a probe now can be fatal since - * nobody will be able to cleanup. But in this case filter_chain() - * must return false, all consumers have gone away. - */ - if (!filter_chain(uprobe)) - return 0; - ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; @@ -636,12 +631,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) - return 0; - - if (filter_chain(uprobe)) - return 0; - set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } @@ -781,10 +770,14 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; - if (is_register) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); - else - err |= remove_breakpoint(uprobe, mm, info->vaddr); + if (is_register) { + /* consult only the "caller", new consumer. */ + if (consumer_filter(uprobe->consumers)) + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { + if (!filter_chain(uprobe)) + err |= remove_breakpoint(uprobe, mm, info->vaddr); + } unlock: up_write(&mm->mmap_sem); @@ -968,9 +961,14 @@ int uprobe_mmap(struct vm_area_struct *vma) mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); - + /* + * We can race with uprobe_unregister(), this uprobe can be already + * removed. But in this case filter_chain() must return false, all + * consumers have gone away. + */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - if (!fatal_signal_pending(current)) { + if (!fatal_signal_pending(current) && + filter_chain(uprobe)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } -- cgit v1.2.3 From 8a7f2fa0dea3b019500961b86d765e6fdd4bffb2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 28 Dec 2012 17:58:38 +0100 Subject: uprobes: Reintroduce uprobe_consumer->filter() Finally add uprobe_consumer->filter() and change consumer_filter() to actually call this method. Note that ->filter() accepts mm_struct, not task_struct. Because: 1. We do not have for_each_mm_user(mm, task). 2. Even if we implement for_each_mm_user(), ->filter() can use it itself. 3. It is not clear who will actually need this interface to do the "nontrivial" filtering. Another argument is "enum uprobe_filter_ctx", consumer->filter() can use it to figure out why/where it was called. For example, perhaps we can add UPROBE_FILTER_PRE_REGISTER used by build_map_info() to quickly "nack" the unwanted mm's. In this case consumer should know that it is called under ->i_mmap_mutex. See the previous discussion at http://marc.info/?t=135214229700002 Perhaps we should pass more arguments, vma/vaddr? Note: this patch obviously can't help to filter out the child created by fork(), this will be addressed later. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 33912086d54e..c2737be3c4b8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -579,19 +579,21 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } -static inline bool consumer_filter(struct uprobe_consumer *uc) +static inline bool consumer_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) { - return true; /* TODO: !uc->filter || uc->filter(...) */ + return !uc->filter || uc->filter(uc, ctx, mm); } -static bool filter_chain(struct uprobe *uprobe) +static bool filter_chain(struct uprobe *uprobe, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - ret = consumer_filter(uc); + ret = consumer_filter(uc, ctx, mm); if (ret) break; } @@ -772,10 +774,12 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) { /* consult only the "caller", new consumer. */ - if (consumer_filter(uprobe->consumers)) + if (consumer_filter(uprobe->consumers, + UPROBE_FILTER_REGISTER, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { - if (!filter_chain(uprobe)) + if (!filter_chain(uprobe, + UPROBE_FILTER_UNREGISTER, mm)) err |= remove_breakpoint(uprobe, mm, info->vaddr); } @@ -968,7 +972,7 @@ int uprobe_mmap(struct vm_area_struct *vma) */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && - filter_chain(uprobe)) { + filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } -- cgit v1.2.3 From da1816b1caeccdff04531e763bb35d7caa3ed19f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 29 Dec 2012 17:49:11 +0100 Subject: uprobes: Teach handler_chain() to filter out the probed task Currrently the are 2 problems with pre-filtering: 1. It is not possible to add/remove a task (mm) after uprobe_register() 2. A forked child inherits all breakpoints and uprobe_consumer can not control this. This patch does the first step to improve the filtering. handler_chain() removes the breakpoints installed by this uprobe from current->mm if all handlers return UPROBE_HANDLER_REMOVE. Note that handler_chain() relies on ->register_rwsem to avoid the race with uprobe_register/unregister which can add/del a consumer, or even remove and then insert the new uprobe at the same address. Perhaps we will add uprobe_apply_mm(uprobe, mm, is_register) and teach copy_mm() to do filter(UPROBE_FILTER_FORK), but I think this change makes sense anyway. Note: instead of checking the retcode from uc->handler, we could add uc->filter(UPROBE_FILTER_BPHIT). But I think this is not optimal to call 2 hooks in a row. This buys nothing, and if handler/filter do something nontrivial they will probably do the same work twice. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 58 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c2737be3c4b8..04c104ad9522 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -440,16 +440,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) return uprobe; } -static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) -{ - struct uprobe_consumer *uc; - - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) - uc->handler(uc, regs); - up_read(&uprobe->register_rwsem); -} - static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); @@ -882,6 +872,33 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume put_uprobe(uprobe); } +static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int err = 0; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long vaddr; + loff_t offset; + + if (!valid_vma(vma, false) || + vma->vm_file->f_mapping->host != uprobe->inode) + continue; + + offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + if (uprobe->offset < offset || + uprobe->offset >= offset + vma->vm_end - vma->vm_start) + continue; + + vaddr = offset_to_vaddr(vma, uprobe->offset); + err |= remove_breakpoint(uprobe, mm, vaddr); + } + up_read(&mm->mmap_sem); + + return err; +} + static struct rb_node * find_node_in_range(struct inode *inode, loff_t min, loff_t max) { @@ -1435,6 +1452,27 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) return uprobe; } +static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct uprobe_consumer *uc; + int remove = UPROBE_HANDLER_REMOVE; + + down_read(&uprobe->register_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + int rc = uc->handler(uc, regs); + + WARN(rc & ~UPROBE_HANDLER_MASK, + "bad rc=0x%x from %pf()\n", rc, uc->handler); + remove &= rc; + } + + if (remove && uprobe->consumers) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } + up_read(&uprobe->register_rwsem); +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. -- cgit v1.2.3 From 74e59dfc6b19e3472a7c16ad57bc831e6e647895 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Dec 2012 15:54:08 +0100 Subject: uprobes: Change handle_swbp() to expose bp_vaddr to handler_chain() Change handle_swbp() to set regs->ip = bp_vaddr in advance, this is what consumer->handler() needs but uprobe_get_swbp_addr() is not exported. This also simplifies the code and makes it more consistent across the supported architectures. handle_swbp() becomes the only caller of uprobe_get_swbp_addr(). Signed-off-by: Oleg Nesterov Acked-by: Ananth N Mavinakayanahalli --- kernel/events/uprobes.c | 15 +++++++-------- kernel/trace/trace_uprobe.c | 4 ++-- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04c104ad9522..f1b807831fc2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1504,6 +1504,10 @@ static void handle_swbp(struct pt_regs *regs) } return; } + + /* change it in advance for ->handler() and restart */ + instruction_pointer_set(regs, bp_vaddr); + /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the @@ -1511,14 +1515,14 @@ static void handle_swbp(struct pt_regs *regs) */ smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) - goto restart; + goto out; utask = current->utask; if (!utask) { utask = add_utask(); /* Cannot allocate; re-execute the instruction. */ if (!utask) - goto restart; + goto out; } handler_chain(uprobe, regs); @@ -1531,12 +1535,7 @@ static void handle_swbp(struct pt_regs *regs) return; } -restart: - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. - */ - instruction_pointer_set(regs, bp_vaddr); + /* can_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e668024773d4..17d9b2bcc28d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -492,7 +492,7 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -667,7 +667,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!entry) goto out; - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); -- cgit v1.2.3 From c8a82538001e1a68f4a319d5a75de90d1f284731 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Dec 2012 17:40:39 +0100 Subject: uprobes: Move alloc_page() from xol_add_vma() to xol_alloc_area() Move alloc_page() from xol_add_vma() to xol_alloc_area() to cleanup the code. This separates the memory allocations and consolidates the -EALREADY cleanups and the error handling. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f1b807831fc2..ea2e2a85479a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1041,22 +1041,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct xol_area *area) { - struct mm_struct *mm; - int ret; - - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) - return -ENOMEM; - - ret = -EALREADY; - mm = current->mm; + struct mm_struct *mm = current->mm; + int ret = -EALREADY; down_write(&mm->mmap_sem); if (mm->uprobes_state.xol_area) goto fail; ret = -ENOMEM; - /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (area->vaddr & ~PAGE_MASK) { @@ -1072,11 +1064,8 @@ static int xol_add_vma(struct xol_area *area) smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; ret = 0; - -fail: + fail: up_write(&mm->mmap_sem); - if (ret) - __free_page(area->page); return ret; } @@ -1104,21 +1093,26 @@ static struct xol_area *xol_alloc_area(void) area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) - return NULL; + goto out; area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); - if (!area->bitmap) - goto fail; + goto free_area; + + area->page = alloc_page(GFP_HIGHUSER); + if (!area->page) + goto free_bitmap; init_waitqueue_head(&area->wq); if (!xol_add_vma(area)) return area; -fail: + __free_page(area->page); + free_bitmap: kfree(area->bitmap); + free_area: kfree(area); - + out: return get_xol_area(current->mm); } -- cgit v1.2.3 From 9b545df809644912552360054c7bbe8b8a9e01fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 16:39:49 +0100 Subject: uprobes: Fold xol_alloc_area() into get_xol_area() Currently only xol_get_insn_slot() does get_xol_area() + xol_alloc_area(), but this will have more users and we do not want to copy-and-paste this code. This patch simply moves xol_alloc_area() into get_xol_area() to simplify the current and future code. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ea2e2a85479a..7e3e5c5b0d88 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1070,27 +1070,21 @@ static int xol_add_vma(struct xol_area *area) return ret; } -static struct xol_area *get_xol_area(struct mm_struct *mm) -{ - struct xol_area *area; - - area = mm->uprobes_state.xol_area; - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ - - return area; -} - /* - * xol_alloc_area - Allocate process's xol_area. - * This area will be used for storing instructions for execution out of - * line. + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ -static struct xol_area *xol_alloc_area(void) +static struct xol_area *get_xol_area(void) { + struct mm_struct *mm = current->mm; struct xol_area *area; + area = mm->uprobes_state.xol_area; + if (area) + goto ret; + area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1113,7 +1107,10 @@ static struct xol_area *xol_alloc_area(void) free_area: kfree(area); out: - return get_xol_area(current->mm); + area = mm->uprobes_state.xol_area; + ret: + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + return area; } /* @@ -1189,14 +1186,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot unsigned long offset; void *vaddr; - area = get_xol_area(current->mm); - if (!area) { - area = xol_alloc_area(); - if (!area) - return 0; - } - current->utask->xol_vaddr = xol_take_insn_slot(area); + area = get_xol_area(); + if (!area) + return 0; + current->utask->xol_vaddr = xol_take_insn_slot(area); /* * Initialize the slot if xol_vaddr points to valid * instruction slot. -- cgit v1.2.3 From 5a2df662aafdabffb2cf3adb780a5adf66dfb3bc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 17:03:32 +0100 Subject: uprobes: Turn add_utask() into get_utask() Rename add_utask() into get_utask() and change it to allocate on demand to simplify the caller. Like get_xol_area() it will have more users. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7e3e5c5b0d88..16e54d63a9fd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1290,23 +1290,18 @@ void uprobe_copy_process(struct task_struct *t) } /* - * Allocate a uprobe_task object for the task. - * Called when the thread hits a breakpoint for the first time. + * Allocate a uprobe_task object for the task if if necessary. + * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ -static struct uprobe_task *add_utask(void) +static struct uprobe_task *get_utask(void) { - struct uprobe_task *utask; - - utask = kzalloc(sizeof *utask, GFP_KERNEL); - if (unlikely(!utask)) - return NULL; - - current->utask = utask; - return utask; + if (!current->utask) + current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + return current->utask; } /* Prepare to single-step probed instruction out of line. */ @@ -1505,13 +1500,9 @@ static void handle_swbp(struct pt_regs *regs) if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; - utask = current->utask; - if (!utask) { - utask = add_utask(); - /* Cannot allocate; re-execute the instruction. */ - if (!utask) - goto out; - } + utask = get_utask(); + if (!utask) + goto out; /* re-execute the instruction. */ handler_chain(uprobe, regs); if (can_skip_sstep(uprobe, regs)) -- cgit v1.2.3 From a6cb3f6d51253e9cf21a38b17c025018117809d7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:00:06 +0100 Subject: uprobes: Do not play with utask in xol_get_insn_slot() pre_ssout()->xol_get_insn_slot() path is confusing and buggy. This patch cleanups the code, the next one fixes the bug. Change xol_get_insn_slot() to only allocate the slot and do nothing more, move the initialization of utask->xol_vaddr/vaddr into pre_ssout(). Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 16e54d63a9fd..8d9c5bcb110e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1176,30 +1176,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) } /* - * xol_get_insn_slot - If was not allocated a slot, then - * allocate a slot. + * xol_get_insn_slot - allocate a slot for xol. * Returns the allocated slot address or 0. */ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) +static unsigned long xol_get_insn_slot(struct uprobe *uprobe) { struct xol_area *area; unsigned long offset; + unsigned long xol_vaddr; void *vaddr; area = get_xol_area(); if (!area) return 0; - current->utask->xol_vaddr = xol_take_insn_slot(area); - /* - * Initialize the slot if xol_vaddr points to valid - * instruction slot. - */ - if (unlikely(!current->utask->xol_vaddr)) + xol_vaddr = xol_take_insn_slot(area); + if (unlikely(!xol_vaddr)) return 0; - current->utask->vaddr = slot_addr; - offset = current->utask->xol_vaddr & ~PAGE_MASK; + /* Initialize the slot */ + offset = xol_vaddr & ~PAGE_MASK; vaddr = kmap_atomic(area->page); memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); kunmap_atomic(vaddr); @@ -1209,7 +1205,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot */ flush_dcache_page(area->page); - return current->utask->xol_vaddr; + return xol_vaddr; } /* @@ -1306,12 +1302,21 @@ static struct uprobe_task *get_utask(void) /* Prepare to single-step probed instruction out of line. */ static int -pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) +pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { - if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) - return 0; + struct uprobe_task *utask; + unsigned long xol_vaddr; + + utask = current->utask; + + xol_vaddr = xol_get_insn_slot(uprobe); + if (!xol_vaddr) + return -ENOMEM; + + utask->xol_vaddr = xol_vaddr; + utask->vaddr = bp_vaddr; - return -EFAULT; + return arch_uprobe_pre_xol(&uprobe->arch, regs); } /* -- cgit v1.2.3 From aba51024e7159c93914557caaa2b8cda26331091 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:12:48 +0100 Subject: uprobes: Fix utask->xol_vaddr leak in pre_ssout() pre_ssout() should do xol_free_insn_slot() if arch_uprobe_pre_xol() fails, otherwise nobody will free the allocated slot. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8d9c5bcb110e..0527379dac5b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1306,6 +1306,7 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe_task *utask; unsigned long xol_vaddr; + int err; utask = current->utask; @@ -1316,7 +1317,13 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; - return arch_uprobe_pre_xol(&uprobe->arch, regs); + err = arch_uprobe_pre_xol(&uprobe->arch, regs); + if (unlikely(err)) { + xol_free_insn_slot(current); + return err; + } + + return 0; } /* -- cgit v1.2.3 From 608e7427c0a06de0d70374a9fd7defc8eb228b7e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:20:42 +0100 Subject: uprobes: Do not allocate current->utask unnecessary handle_swbp() does get_utask() before can_skip_sstep() for no reason, we do not need ->utask if can_skip_sstep() succeeds. Move get_utask() to pre_ssout() who actually starts to use it. Move the initialization of utask->active_uprobe/state as well. This way the whole initialization is consolidated in pre_ssout(). Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov --- kernel/events/uprobes.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0527379dac5b..071edcb3e62d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1308,7 +1308,9 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) unsigned long xol_vaddr; int err; - utask = current->utask; + utask = get_utask(); + if (!utask) + return -ENOMEM; xol_vaddr = xol_get_insn_slot(uprobe); if (!xol_vaddr) @@ -1323,6 +1325,8 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) return err; } + utask->active_uprobe = uprobe; + utask->state = UTASK_SSTEP; return 0; } @@ -1474,7 +1478,6 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) */ static void handle_swbp(struct pt_regs *regs) { - struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; int uninitialized_var(is_swbp); @@ -1512,19 +1515,12 @@ static void handle_swbp(struct pt_regs *regs) if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; - utask = get_utask(); - if (!utask) - goto out; /* re-execute the instruction. */ - handler_chain(uprobe, regs); if (can_skip_sstep(uprobe, regs)) goto out; - if (!pre_ssout(uprobe, regs, bp_vaddr)) { - utask->active_uprobe = uprobe; - utask->state = UTASK_SSTEP; + if (!pre_ssout(uprobe, regs, bp_vaddr)) return; - } /* can_skip_sstep() succeeded, or restart if can't singlestep */ out: -- cgit v1.2.3 From af4355e91f15812df8608925738c91be57c580dd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:37:11 +0100 Subject: uprobes: Kill the bogus IS_ERR_VALUE(xol_vaddr) check utask->xol_vaddr is either zero or valid, remove the bogus IS_ERR_VALUE() check in xol_free_insn_slot(). Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 071edcb3e62d..f6c7062fb950 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1223,8 +1223,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) return; slot_addr = tsk->utask->xol_vaddr; - - if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) + if (unlikely(!slot_addr)) return; area = tsk->mm->uprobes_state.xol_area; -- cgit v1.2.3 From e8440c1458ba571bc3fac8a6beb53ff604199f5b Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Sun, 13 Jan 2013 19:03:34 +0100 Subject: uprobes: Add exports for module use The original pull message for uprobes (commit 654443e2) noted: This tree includes uprobes support in 'perf probe' - but SystemTap (and other tools) can take advantage of user probe points as well. In order to actually be usable in module-based tools like SystemTap, the interface needs to be exported. This patch first adds the obvious exports for uprobe_register and uprobe_unregister. Then it also adds one for task_user_regset_view, which is necessary to get the correct state of userspace registers. Signed-off-by: Josh Stone Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 3 +++ kernel/ptrace.c | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f6c7062fb950..221fc58f59e3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -27,6 +27,7 @@ #include /* read_mapping_page */ #include #include +#include #include /* anon_vma_prepare */ #include /* set_pte_at_notify */ #include /* try_to_free_swap */ @@ -851,6 +852,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * goto retry; return ret; } +EXPORT_SYMBOL_GPL(uprobe_register); /* * uprobe_unregister - unregister a already registered probe. @@ -871,6 +873,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume up_write(&uprobe->register_rwsem); put_uprobe(uprobe); } +EXPORT_SYMBOL_GPL(uprobe_unregister); static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6cbeaae4406d..acbd28424d81 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, kiov->iov_len, kiov->iov_base); } +/* + * This is declared in linux/regset.h and defined in machine-dependent + * code. We put the export here, near the primary machine-neutral use, + * to ensure no machine forgets it. + */ +EXPORT_SYMBOL_GPL(task_user_regset_view); #endif int ptrace_request(struct task_struct *child, long request, -- cgit v1.2.3 From 84d7ed799fd6c1366547d88ddb8188c65de3b94f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Jan 2013 18:20:45 +0100 Subject: uprobes/tracing: Fix dentry/mount leak in create_trace_uprobe() create_trace_uprobe() does kern_path() to find ->d_inode, but forgets to do path_put(). We can do this right after igrab(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 17d9b2bcc28d..06c22bad776a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -253,16 +253,18 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - ret = kstrtoul(arg, 0, &offset); - if (ret) - goto fail_address_parse; - inode = igrab(path.dentry->d_inode); + path_put(&path); + if (!S_ISREG(inode->i_mode)) { ret = -EINVAL; goto fail_address_parse; } + ret = kstrtoul(arg, 0, &offset); + if (ret) + goto fail_address_parse; + argc -= 2; argv += 2; -- cgit v1.2.3 From 4161824f18ff4f56f46595a4016c7315dd0d24f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Jan 2013 18:36:24 +0100 Subject: uprobes/tracing: Fully initialize uprobe_trace_consumer before uprobe_register() probe_event_enable() does uprobe_register() and only after that sets utc->tu and tu->consumer/flags. This can race with uprobe_dispatcher() which can miss these assignments or see them out of order. Nothing really bad can happen, but this doesn't look clean/safe. And this does not allow to use uprobe_consumer->filter() we are going to add, it is called by uprobe_register() and it needs utc->tu. Change this code to initialize everything before uprobe_register(), and reset tu->consumer/flags if it fails. We can't race with event_disable(), the caller holds event_mutex, and if we could the code would be wrong anyway. In fact I think uprobe_trace_consumer should die, it buys nothing but complicates the code. We can simply add uprobe_consumer into trace_uprobe. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 06c22bad776a..15b8eceeddc5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -552,17 +552,18 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) return -EINTR; utc->cons.handler = uprobe_dispatcher; + utc->tu = tu; + tu->consumer = utc; + tu->flags |= flag; + ret = uprobe_register(tu->inode, tu->offset, &utc->cons); if (ret) { + tu->consumer = NULL; + tu->flags &= ~flag; kfree(utc); - return ret; } - tu->flags |= flag; - utc->tu = tu; - tu->consumer = utc; - - return 0; + return ret; } static void probe_event_disable(struct trace_uprobe *tu, int flag) -- cgit v1.2.3 From 7e4e28c53963e6cfa94d8109bb8f5233c5659048 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Jan 2013 17:08:47 +0100 Subject: uprobes/tracing: Ensure inode != NULL in create_trace_uprobe() probe_event_enable/disable() check tu->inode != NULL at the start. This is ugly, if igrab() can fail create_trace_uprobe() should not succeed and "postpone" the failure. And S_ISREG(inode->i_mode) check added by d24d7dbf is not safe. Note: alloc_uprobe() should probably check igrab() != NULL as well. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 15b8eceeddc5..f7838cfd61b9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -256,7 +256,7 @@ static int create_trace_uprobe(int argc, char **argv) inode = igrab(path.dentry->d_inode); path_put(&path); - if (!S_ISREG(inode->i_mode)) { + if (!inode || !S_ISREG(inode->i_mode)) { ret = -EINVAL; goto fail_address_parse; } @@ -544,7 +544,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) struct uprobe_trace_consumer *utc; int ret = 0; - if (!tu->inode || tu->consumer) + if (tu->consumer) return -EINTR; utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); @@ -568,7 +568,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->inode || !tu->consumer) + if (!tu->consumer) return; uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); -- cgit v1.2.3 From b64b007797c1e6d6b745c93c296ba1d5f4d72d86 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:15:30 +0100 Subject: uprobes/tracing: Introduce is_trace_uprobe_enabled() probe_event_enable/disable() check tu->consumer != NULL to avoid the wrong uprobe_register/unregister(). We are going to kill this pointer and "struct uprobe_trace_consumer", so we add the new helper, is_trace_uprobe_enabled(), which can rely on TP_FLAG_TRACE/TP_FLAG_PROFILE instead. Note: the current logic doesn't look optimal, it is not clear why TP_FLAG_TRACE/TP_FLAG_PROFILE are mutually exclusive, we will probably change this later. Also kill the unused TP_FLAG_UPROBE. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_probe.h | 1 - kernel/trace/trace_uprobe.c | 9 +++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 933708677814..5c7e09d10d74 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -66,7 +66,6 @@ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 #define TP_FLAG_REGISTERED 4 -#define TP_FLAG_UPROBE 8 /* data_rloc: data relative location, compatible with u32 */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f7838cfd61b9..d6c6e2a345a7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -539,12 +539,17 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } +static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) +{ + return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); +} + static int probe_event_enable(struct trace_uprobe *tu, int flag) { struct uprobe_trace_consumer *utc; int ret = 0; - if (tu->consumer) + if (is_trace_uprobe_enabled(tu)) return -EINTR; utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); @@ -568,7 +573,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->consumer) + if (!is_trace_uprobe_enabled(tu)) return; uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); -- cgit v1.2.3 From a932b7381f81235530c3d0acbd3ba2c7537d78e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:47:23 +0100 Subject: uprobes/tracing: Kill uprobe_trace_consumer, embed uprobe_consumer into trace_uprobe trace_uprobe->consumer and "struct uprobe_trace_consumer" add the unnecessary indirection and complicate the code for no reason. This patch simply embeds uprobe_consumer into "struct trace_uprobe", all other changes only fix the compilation errors. Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d6c6e2a345a7..9c8babbfd11b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -31,17 +31,11 @@ /* * uprobe event core functions */ -struct trace_uprobe; -struct uprobe_trace_consumer { - struct uprobe_consumer cons; - struct trace_uprobe *tu; -}; - struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; - struct uprobe_trace_consumer *consumer; + struct uprobe_consumer consumer; struct inode *inode; char *filename; unsigned long offset; @@ -92,6 +86,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) goto error; INIT_LIST_HEAD(&tu->list); + tu->consumer.handler = uprobe_dispatcher; return tu; error: @@ -546,27 +541,15 @@ static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) static int probe_event_enable(struct trace_uprobe *tu, int flag) { - struct uprobe_trace_consumer *utc; int ret = 0; if (is_trace_uprobe_enabled(tu)) return -EINTR; - utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); - if (!utc) - return -EINTR; - - utc->cons.handler = uprobe_dispatcher; - utc->tu = tu; - tu->consumer = utc; tu->flags |= flag; - - ret = uprobe_register(tu->inode, tu->offset, &utc->cons); - if (ret) { - tu->consumer = NULL; + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (ret) tu->flags &= ~flag; - kfree(utc); - } return ret; } @@ -576,10 +559,8 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) if (!is_trace_uprobe_enabled(tu)) return; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; - kfree(tu->consumer); - tu->consumer = NULL; } static int uprobe_event_define_fields(struct ftrace_event_call *event_call) @@ -717,13 +698,9 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { - struct uprobe_trace_consumer *utc; struct trace_uprobe *tu; - utc = container_of(con, struct uprobe_trace_consumer, cons); - tu = utc->tu; - if (!tu || tu->consumer != utc) - return 0; + tu = container_of(con, struct trace_uprobe, consumer); if (tu->flags & TP_FLAG_TRACE) uprobe_trace_func(tu, regs); -- cgit v1.2.3 From 1b47aefd9b6bd439a4be43c47acd22987ac22db8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:55:27 +0100 Subject: uprobes/perf: Always increment trace_uprobe->nhit Move tu->nhit++ from uprobe_trace_func() to uprobe_dispatcher(). ->nhit counts how many time we hit the breakpoint inserted by this uprobe, we do not want to loose this info if uprobe was enabled by sys_perf_event_open(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9c8babbfd11b..c4e29e19fdd7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -476,8 +476,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) unsigned long irq_flags; struct ftrace_event_call *call = &tu->call; - tu->nhit++; - local_save_flags(irq_flags); pc = preempt_count(); @@ -701,6 +699,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) struct trace_uprobe *tu; tu = container_of(con, struct trace_uprobe, consumer); + tu->nhit++; if (tu->flags & TP_FLAG_TRACE) uprobe_trace_func(tu, regs); -- cgit v1.2.3 From f22c1bb6b4706be3502b378cb14564449b15f983 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 2 Feb 2013 16:27:52 +0100 Subject: perf: Introduce hw_perf_event->tp_target and ->tp_list sys_perf_event_open()->perf_init_event(event) is called before find_get_context(event), this means that event->ctx == NULL when class->reg(TRACE_REG_PERF_REGISTER/OPEN) is called and thus it can't know if this event is per-task or system-wide. This patch adds hw_perf_event->tp_target for PERF_TYPE_TRACEPOINT, this is analogous to PERF_TYPE_BREAKPOINT/bp_target we already have. The patch also moves ->bp_target up so that it can overlap with the new member, this can help the compiler to generate the better code. trace_uprobe_register() will use it for prefiltering to avoid the unnecessary breakpoints in mm's we do not want to trace. ->tp_target doesn't have its own reference, but we can rely on the fact that either sys_perf_event_open() holds a reference, or it is equal to event->ctx->task. So this pointer is always valid until free_event(). Also add the "struct list_head tp_list" into this union. It is not strictly necessary, but it can simplify the next changes and we can add it for free. Signed-off-by: Oleg Nesterov --- kernel/events/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 301079d06f24..e2d4323c6ae6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6162,11 +6162,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; + + if (attr->type == PERF_TYPE_TRACEPOINT) + event->hw.tp_target = task; #ifdef CONFIG_HAVE_HW_BREAKPOINT /* * hw_breakpoint is a bit difficult here.. */ - if (attr->type == PERF_TYPE_BREAKPOINT) + else if (attr->type == PERF_TYPE_BREAKPOINT) event->hw.bp_target = task; #endif } -- cgit v1.2.3 From bdf8647c44766590ed02f9a84a450a796558b753 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 Feb 2013 19:21:12 +0100 Subject: uprobes: Introduce uprobe_apply() Currently it is not possible to change the filtering constraints after uprobe_register(), so a consumer can not, say, start to trace a task/mm which was previously filtered out, or remove the no longer needed bp's. Introduce uprobe_apply() which simply does register_for_each_vma() again to consult uprobe_consumer->filter() and install/remove the breakpoints. The only complication is that register_for_each_vma() can no longer assume that uprobe->consumers should be consulter if is_register == T, so we change it to accept "struct uprobe_consumer *new" instead. Unlike uprobe_register(), uprobe_apply(true) doesn't do "unregister" if register_for_each_vma() fails, it is up to caller to handle the error. Note: we probably need to cleanup the current interface, it is strange that uprobe_apply/unregister need inode/offset. We should either change uprobe_register() to return "struct uprobe *", or add a private ->uprobe member in uprobe_consumer. And in the long term uprobe_apply() should take a single argument, uprobe or consumer, even "bool add" should go away. Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 221fc58f59e3..a567c8c7ef31 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -733,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) return curr; } -static int register_for_each_vma(struct uprobe *uprobe, bool is_register) +static int +register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { + bool is_register = !!new; struct map_info *info; int err = 0; @@ -765,7 +767,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) { /* consult only the "caller", new consumer. */ - if (consumer_filter(uprobe->consumers, + if (consumer_filter(new, UPROBE_FILTER_REGISTER, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { @@ -788,7 +790,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { consumer_add(uprobe, uc); - return register_for_each_vma(uprobe, true); + return register_for_each_vma(uprobe, uc); } static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -798,7 +800,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u if (!consumer_del(uprobe, uc)) /* WARN? */ return; - err = register_for_each_vma(uprobe, false); + err = register_for_each_vma(uprobe, NULL); /* TODO : cant unregister? schedule a worker thread */ if (!uprobe->consumers && !err) delete_uprobe(uprobe); @@ -854,6 +856,35 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * } EXPORT_SYMBOL_GPL(uprobe_register); +/* + * uprobe_apply - unregister a already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: consumer which wants to add more or remove some breakpoints + * @add: add or remove the breakpoints + */ +int uprobe_apply(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc, bool add) +{ + struct uprobe *uprobe; + struct uprobe_consumer *con; + int ret = -ENOENT; + + uprobe = find_uprobe(inode, offset); + if (!uprobe) + return ret; + + down_write(&uprobe->register_rwsem); + for (con = uprobe->consumers; con && con != uc ; con = con->next) + ; + if (con) + ret = register_for_each_vma(uprobe, add ? uc : NULL); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); + + return ret; +} + /* * uprobe_unregister - unregister a already registered probe. * @inode: the file in which the probe has to be removed. -- cgit v1.2.3 From 736288ba5016e255869c26296014eeff649971c2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 Feb 2013 20:58:35 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to track the active perf_event's Introduce "struct trace_uprobe_filter" which records the "active" perf_event's attached to ftrace_event_call. For the start we simply use list_head, we can optimize this later if needed. For example, we do not really need to record an event with ->parent != NULL, we can rely on parent->child_list. And we can certainly do some optimizations for the case when 2 events have the same ->tp_target or tp_target->mm. Change trace_uprobe_register() to process TRACE_REG_PERF_OPEN/CLOSE and add/del this perf_event to the list. We can probably avoid any locking, but lets start with the "obvioulsy correct" trace_uprobe_filter->rwlock which protects everything. Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c4e29e19fdd7..2a74a93afdae 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,6 +28,12 @@ #define UPROBE_EVENT_SYSTEM "uprobes" +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + /* * uprobe event core functions */ @@ -35,6 +41,7 @@ struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; + struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct inode *inode; char *filename; @@ -58,6 +65,18 @@ static LIST_HEAD(uprobe_list); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ + rwlock_init(&filter->rwlock); + filter->nr_systemwide = 0; + INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ + return !filter->nr_systemwide && list_empty(&filter->perf_events); +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). */ @@ -87,6 +106,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) INIT_LIST_HEAD(&tu->list); tu->consumer.handler = uprobe_dispatcher; + init_trace_uprobe_filter(&tu->filter); return tu; error: @@ -544,6 +564,8 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) if (is_trace_uprobe_enabled(tu)) return -EINTR; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + tu->flags |= flag; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) @@ -557,6 +579,8 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) if (!is_trace_uprobe_enabled(tu)) return; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; } @@ -632,6 +656,30 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +{ + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) + list_add(&event->hw.tp_list, &tu->filter.perf_events); + else + tu->filter.nr_systemwide++; + write_unlock(&tu->filter.rwlock); + + return 0; +} + +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +{ + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) + list_del(&event->hw.tp_list); + else + tu->filter.nr_systemwide--; + write_unlock(&tu->filter.rwlock); + + return 0; +} + /* uprobe profile handler */ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { @@ -687,6 +735,13 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); return 0; + + case TRACE_REG_PERF_OPEN: + return uprobe_perf_open(tu, data); + + case TRACE_REG_PERF_CLOSE: + return uprobe_perf_close(tu, data); + #endif default: return 0; -- cgit v1.2.3 From 31ba334836c0ac0039084859f14a5b96858493dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 17:11:58 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to pre-filter Finally implement uprobe_perf_filter() which checks ->nr_systemwide or ->perf_events to figure out whether we need to insert the breakpoint. uprobe_perf_open/close are changed to do uprobe_apply(true/false) when the new perf event comes or goes away. Note that currently this is very suboptimal: - uprobe_register() called by TRACE_REG_PERF_REGISTER becomes a heavy nop, consumer->filter() always returns F at this stage. As it was already discussed we need uprobe_register_only() to avoid the costly register_for_each_vma() when possible. - uprobe_apply() is oftenly overkill. Unless "nr_systemwide != 0" changes we need uprobe_apply_mm(), unapply_uprobe() is almost what we need. - uprobe_apply() can be simply avoided sometimes, see the next changes. Testing: # perf probe -x /lib/libc.so.6 syscall # perl -e 'syscall -1 while 1' & [1] 530 # perf record -e probe_libc:syscall perl -e 'syscall -1 for 1..10; sleep 1' # perf report --show-total-period 100.00% 10 perl libc-2.8.so [.] syscall Before this patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 79291 A huge ->nrhit == 79291 reflects the fact that the background process 530 constantly hits this breakpoint too, even if doesn't contribute to the output. After the patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 10 This shows that only the target process was punished by int3. Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 46 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2a74a93afdae..b7850f535acf 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -557,7 +557,12 @@ static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); } -static int probe_event_enable(struct trace_uprobe *tu, int flag) +typedef bool (*filter_func_t)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); + +static int +probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) { int ret = 0; @@ -567,6 +572,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); tu->flags |= flag; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) tu->flags &= ~flag; @@ -656,6 +662,22 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ + struct perf_event *event; + + if (filter->nr_systemwide) + return true; + + list_for_each_entry(event, &filter->perf_events, hw.tp_list) { + if (event->hw.tp_target->mm == mm) + return true; + } + + return false; +} + static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { write_lock(&tu->filter.rwlock); @@ -665,6 +687,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) tu->filter.nr_systemwide++; write_unlock(&tu->filter.rwlock); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + return 0; } @@ -677,9 +701,25 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) tu->filter.nr_systemwide--; write_unlock(&tu->filter.rwlock); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + return 0; } +static bool uprobe_perf_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct trace_uprobe *tu; + int ret; + + tu = container_of(uc, struct trace_uprobe, consumer); + read_lock(&tu->filter.rwlock); + ret = __uprobe_perf_filter(&tu->filter, mm); + read_unlock(&tu->filter.rwlock); + + return ret; +} + /* uprobe profile handler */ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { @@ -722,7 +762,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, TP_FLAG_TRACE); + return probe_event_enable(tu, TP_FLAG_TRACE, NULL); case TRACE_REG_UNREGISTER: probe_event_disable(tu, TP_FLAG_TRACE); @@ -730,7 +770,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, TP_FLAG_PROFILE); + return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); -- cgit v1.2.3 From f42d24a1d20d2e72d1e5d48930f18b138dfad117 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 17:48:34 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to use UPROBE_HANDLER_REMOVE Change uprobe_trace_func() and uprobe_perf_func() to return "int". Change uprobe_dispatcher() to return "trace_ret | perf_ret" although this is not needed, currently TP_FLAG_TRACE/TP_FLAG_PROFILE are mutually exclusive. The only functional change is that uprobe_perf_func() checks the filtering too and returns UPROBE_HANDLER_REMOVE if nobody wants to trace current. Testing: # perf probe -x /lib/libc.so.6 syscall # perf record -e probe_libc:syscall -i perl -e 'fork; syscall -1 for 1..10; wait' # perf report --show-total-period 100.00% 10 perl libc-2.8.so [.] syscall Before this patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 20 A child process doesn't have a counter, but still it hits this breakoint "copied" by dup_mmap(). After the patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 11 The child process hits this int3 only once and does unapply_uprobe(). Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b7850f535acf..2399f1416555 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -486,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = { }; /* uprobe handler */ -static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -504,7 +504,7 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); if (!event) - return; + return 0; entry = ring_buffer_event_data(event); entry->ip = instruction_pointer(task_pt_regs(current)); @@ -514,6 +514,8 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!filter_current_check_discard(buffer, call, entry, event)) trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; } /* Event entry printers */ @@ -721,7 +723,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, } /* uprobe profile handler */ -static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; @@ -730,11 +732,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) int size, __size, i; int rctx; + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + __size = sizeof(*entry) + tu->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return; + return 0; preempt_disable(); @@ -752,6 +757,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) out: preempt_enable(); + return 0; } #endif /* CONFIG_PERF_EVENTS */ @@ -792,18 +798,19 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; + int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; if (tu->flags & TP_FLAG_TRACE) - uprobe_trace_func(tu, regs); + ret |= uprobe_trace_func(tu, regs); #ifdef CONFIG_PERF_EVENTS if (tu->flags & TP_FLAG_PROFILE) - uprobe_perf_func(tu, regs); + ret |= uprobe_perf_func(tu, regs); #endif - return 0; + return ret; } static struct trace_event_functions uprobe_funcs = { -- cgit v1.2.3 From b2fe8ba674e8acbb9e8e63510b802c6d054d88a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 19:05:43 +0100 Subject: uprobes/perf: Avoid uprobe_apply() whenever possible uprobe_perf_open/close call the costly uprobe_apply() every time, we can avoid it if: - "nr_systemwide != 0" is not changed. - There is another process/thread with the same ->mm. - copy_proccess() does inherit_event(). dup_mmap() preserves the inserted breakpoints. - event->attr.enable_on_exec == T, we can rely on uprobe_mmap() called by exec/mmap paths. - tp_target is exiting. Only _close() checks PF_EXITING, I don't think TRACE_REG_PERF_OPEN can hit the dying task too often. Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2399f1416555..8dad2a92dee9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -680,30 +680,60 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return false; } +static inline bool +uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +{ + return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); +} + static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { + bool done; + write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) + if (event->hw.tp_target) { + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. + */ + done = tu->filter.nr_systemwide || + event->parent || event->attr.enable_on_exec || + uprobe_filter_event(tu, event); list_add(&event->hw.tp_list, &tu->filter.perf_events); - else + } else { + done = tu->filter.nr_systemwide; tu->filter.nr_systemwide++; + } write_unlock(&tu->filter.rwlock); - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); return 0; } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) { + bool done; + write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) + if (event->hw.tp_target) { list_del(&event->hw.tp_list); - else + done = tu->filter.nr_systemwide || + (event->hw.tp_target->flags & PF_EXITING) || + uprobe_filter_event(tu, event); + } else { tu->filter.nr_systemwide--; + done = tu->filter.nr_systemwide; + } write_unlock(&tu->filter.rwlock); - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } -- cgit v1.2.3 From 84e345e4e209cbe796c88fa2ad1732d7121ec100 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Fri, 8 Feb 2013 17:59:53 -0500 Subject: time, Fix setting of hardware clock in NTP code At init time, if the system time is "warped" forward in warp_clock() it will differ from the hardware clock by sys_tz.tz_minuteswest. This time difference is not taken into account when ntp updates the hardware clock, and this causes the system time to jump forward by this offset every reboot. The kernel must take this offset into account when writing the system time to the hardware clock in the ntp code. This patch adds persistent_clock_is_local which indicates that an offset has been applied in warp_clock() and accounts for the "warp" before writing the hardware clock. x86 does not have this problem as rtc writes are software limited to a +/-15 minute window relative to the current rtc time. Other arches, such as powerpc, however do a full synchronization of the system time to the rtc and will see this problem. [v2]: generated against tip/timers/core Signed-off-by: Prarit Bhargava Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time.c | 8 ++++++++ kernel/time/ntp.c | 8 ++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index d226c6a3fd28..c2a27dd93142 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -114,6 +114,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, return 0; } +/* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + /* * Adjust the time obtained from the CMOS to be UTC time instead of * local time. @@ -135,6 +141,8 @@ static inline void warp_clock(void) struct timespec adjust; adjust = current_kernel_time(); + if (sys_tz.tz_minuteswest != 0) + persistent_clock_is_local = 1; adjust.tv_sec += sys_tz.tz_minuteswest * 60; do_settimeofday(&adjust); } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 313b161504b7..b10a42bb0165 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -511,13 +511,17 @@ static void sync_cmos_clock(struct work_struct *work) getnstimeofday(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { + struct timespec adjust = now; + fail = -ENODEV; + if (persistent_clock_is_local) + adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); #ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock(now); + fail = update_persistent_clock(adjust); #endif #ifdef CONFIG_RTC_SYSTOHC if (fail == -ENODEV) - fail = rtc_set_ntp_time(now); + fail = rtc_set_ntp_time(adjust); #endif } -- cgit v1.2.3 From ad72b3bea744b4db01c89af0f86f3e8920d354df Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 21 Dec 2012 17:57:00 -0800 Subject: kprobes: fix wait_for_kprobe_optimizer() wait_for_kprobe_optimizer() seems largely broken. It uses optimizer_comp which is never re-initialized, so wait_for_kprobe_optimizer() will never wait for anything once kprobe_optimizer() finishes all pending jobs for the first time. Also, aside from completion, delayed_work_pending() is %false once kprobe_optimizer() starts execution and wait_for_kprobe_optimizer() won't wait for it. Reimplement it so that it flushes optimizing_work until [un]optimizing_lists are empty. Note that this also makes optimizing_work execute immediately if someone's waiting for it, which is the nicer behavior. Only compile tested. Signed-off-by: Tejun Heo Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: "David S. Miller" --- kernel/kprobes.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 098f396aa409..f230e81a9db6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -471,7 +471,6 @@ static LIST_HEAD(unoptimizing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); -static DECLARE_COMPLETION(optimizer_comp); #define OPTIMIZE_DELAY 5 /* @@ -552,8 +551,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) /* Start optimizer after OPTIMIZE_DELAY passed */ static __kprobes void kick_kprobe_optimizer(void) { - if (!delayed_work_pending(&optimizing_work)) - schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); + schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); } /* Kprobe jump optimizer */ @@ -592,16 +590,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) kick_kprobe_optimizer(); - else - /* Wake up all waiters */ - complete_all(&optimizer_comp); } /* Wait for completing optimization and unoptimization */ static __kprobes void wait_for_kprobe_optimizer(void) { - if (delayed_work_pending(&optimizing_work)) - wait_for_completion(&optimizer_comp); + mutex_lock(&kprobe_mutex); + + while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { + mutex_unlock(&kprobe_mutex); + + /* this will also make optimizing_work execute immmediately */ + flush_delayed_work(&optimizing_work); + /* @optimizing_work might not have been queued yet, relax */ + cpu_relax(); + + mutex_lock(&kprobe_mutex); + } + + mutex_unlock(&kprobe_mutex); } /* Optimize kprobe if p is ready to be optimized */ -- cgit v1.2.3 From 7e73c5ae6e7991a6c01f6d096ff8afaef4458c36 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 6 Feb 2013 13:00:36 +0100 Subject: PM: Introduce suspend state PM_SUSPEND_FREEZE PM_SUSPEND_FREEZE state is a general state that does not need any platform specific support, it equals frozen processes + suspended devices + idle processors. Compared with PM_SUSPEND_MEMORY, PM_SUSPEND_FREEZE saves less power because the system is still in a running state. PM_SUSPEND_FREEZE has less resume latency because it does not touch BIOS, and the processors are in idle state. Compared with RTPM/idle, PM_SUSPEND_FREEZE saves more power as 1. the processor has longer sleep time because processes are frozen. The deeper c-state the processor supports, more power saving we can get. 2. PM_SUSPEND_FREEZE uses system suspend code path, thus we can get more power saving from the devices that does not have good RTPM support. This state is useful for 1) platforms that do not have STR, or have a broken STR. 2) platforms that have an extremely low power idle state, which can be used to replace STR. The following describes how PM_SUSPEND_FREEZE state works. 1. echo freeze > /sys/power/state 2. the processes are frozen. 3. all the devices are suspended. 4. all the processors are blocked by a wait queue 5. all the processors idles and enters (Deep) c-state. 6. an interrupt fires. 7. a processor is woken up and handles the irq. 8. if it is a general event, a) the irq handler runs and quites. b) goto step 4. 9. if it is a real wake event, say, power button pressing, keyboard touch, mouse moving, a) the irq handler runs and activate the wakeup source b) wakeup_source_activate() notifies the wait queue. c) system starts resuming from PM_SUSPEND_FREEZE 10. all the devices are resumed. 11. all the processes are unfrozen. 12. system is back to working. Known Issue: The wakeup of this new PM_SUSPEND_FREEZE state may behave differently from the previous suspend state. Take ACPI platform for example, there are some GPEs that only enabled when the system is in sleep state, to wake the system backk from S3/S4. But we are not touching these GPEs during transition to PM_SUSPEND_FREEZE. This means we may lose some wake event. But on the other hand, as we do not disable all the Interrupts during PM_SUSPEND_FREEZE, we may get some extra "wakeup" Interrupts, that are not available for S3/S4. The patches has been tested on an old Sony laptop, and here are the results: Average Power: 1. RPTM/idle for half an hour: 14.8W, 12.6W, 14.1W, 12.5W, 14.4W, 13.2W, 12.9W 2. Freeze for half an hour: 11W, 10.4W, 9.4W, 11.3W 10.5W 3. RTPM/idle for three hours: 11.6W 4. Freeze for three hours: 10W 5. Suspend to Memory: 0.5~0.9W Average Resume Latency: 1. RTPM/idle with a black screen: (From pressing keyboard to screen back) Less than 0.2s 2. Freeze: (From pressing power button to screen back) 2.50s 3. Suspend to Memory: (From pressing power button to screen back) 4.33s >From the results, we can see that all the platforms should benefit from this patch, even if it does not have Low Power S0. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 2 +- kernel/power/suspend.c | 69 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 57 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c16f9167de1..b1c26a92ca9f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_STANDBY; + suspend_state_t state = PM_SUSPEND_MIN; const char * const *s; #endif char *p; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446b27df..d4feda084a3a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,12 +30,38 @@ #include "power.h" const char *const pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_FREEZE] = "freeze", [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", }; static const struct platform_suspend_ops *suspend_ops; +static bool need_suspend_ops(suspend_state_t state) +{ + return !!(state > PM_SUSPEND_FREEZE); +} + +static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); +static bool suspend_freeze_wake; + +static void freeze_begin(void) +{ + suspend_freeze_wake = false; +} + +static void freeze_enter(void) +{ + wait_event(suspend_freeze_wait_head, suspend_freeze_wake); +} + +void freeze_wake(void) +{ + suspend_freeze_wake = true; + wake_up(&suspend_freeze_wait_head); +} +EXPORT_SYMBOL_GPL(freeze_wake); + /** * suspend_set_ops - Set the global suspend method table. * @ops: Suspend operations to use. @@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); bool valid_state(suspend_state_t state) { + if (state == PM_SUSPEND_FREEZE) + return true; /* - * All states need lowlevel support and need to be valid to the lowlevel + * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel + * support and need to be valid to the lowlevel * implementation, no valid callback implies that none are valid. */ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); @@ -89,11 +118,11 @@ static int suspend_test(int level) * hibernation). Run suspend notifiers, allocate the "suspend" console and * freeze processes. */ -static int suspend_prepare(void) +static int suspend_prepare(suspend_state_t state) { int error; - if (!suspend_ops || !suspend_ops->enter) + if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) return -EPERM; pm_prepare_console(); @@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; - if (suspend_ops->prepare) { + if (need_suspend_ops(state) && suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) goto Platform_finish; @@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - if (suspend_ops->prepare_late) { + if (need_suspend_ops(state) && suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) goto Platform_wake; } + /* + * PM_SUSPEND_FREEZE equals + * frozen processes + suspended devices + idle processors. + * Thus we should invoke freeze_enter() soon after + * all the devices are suspended. + */ + if (state == PM_SUSPEND_FREEZE) { + freeze_enter(); + goto Platform_wake; + } + if (suspend_test(TEST_PLATFORM)) goto Platform_wake; @@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) enable_nonboot_cpus(); Platform_wake: - if (suspend_ops->wake) + if (need_suspend_ops(state) && suspend_ops->wake) suspend_ops->wake(); dpm_resume_start(PMSG_RESUME); Platform_finish: - if (suspend_ops->finish) + if (need_suspend_ops(state) && suspend_ops->finish) suspend_ops->finish(); return error; @@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state) int error; bool wakeup = false; - if (!suspend_ops) + if (need_suspend_ops(state) && !suspend_ops) return -ENOSYS; trace_machine_suspend(state); - if (suspend_ops->begin) { + if (need_suspend_ops(state) && suspend_ops->begin) { error = suspend_ops->begin(state); if (error) goto Close; @@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state) do { error = suspend_enter(state, &wakeup); - } while (!error && !wakeup + } while (!error && !wakeup && need_suspend_ops(state) && suspend_ops->suspend_again && suspend_ops->suspend_again()); Resume_devices: @@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state) ftrace_start(); resume_console(); Close: - if (suspend_ops->end) + if (need_suspend_ops(state) && suspend_ops->end) suspend_ops->end(); trace_machine_suspend(PWR_EVENT_EXIT); return error; Recover_platform: - if (suspend_ops->recover) + if (need_suspend_ops(state) && suspend_ops->recover) suspend_ops->recover(); goto Resume_devices; } @@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state) if (!mutex_trylock(&pm_mutex)) return -EBUSY; + if (state == PM_SUSPEND_FREEZE) + freeze_begin(); + printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - error = suspend_prepare(); + error = suspend_prepare(state); if (error) goto Unlock; -- cgit v1.2.3 From 957d1282bb8c07e682e142b9237cd9fcb8348a0b Mon Sep 17 00:00:00 2001 From: Li Fei Date: Fri, 1 Feb 2013 08:56:03 +0000 Subject: suspend: enable freeze timeout configuration through sys At present, the value of timeout for freezing is 20s, which is meaningless in case that one thread is frozen with mutex locked and another thread is trying to lock the mutex, as this time of freezing will fail unavoidably. And if there is no new wakeup event registered, the system will waste at most 20s for such meaningless trying of freezing. With this patch, the value of timeout can be configured to smaller value, so such meaningless trying of freezing will be aborted in earlier time, and later freezing can be also triggered in earlier time. And more power will be saved. In normal case on mobile phone, it costs real little time to freeze processes. On some platform, it only costs about 20ms to freeze user space processes and 10ms to freeze kernel freezable threads. Signed-off-by: Liu Chuansheng Signed-off-by: Li Fei Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 27 +++++++++++++++++++++++++++ kernel/power/process.c | 4 ++-- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index b1c26a92ca9f..d77663bfedeb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ +#ifdef CONFIG_FREEZER +static ssize_t pm_freeze_timeout_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", freeze_timeout_msecs); +} + +static ssize_t pm_freeze_timeout_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + freeze_timeout_msecs = val; + return n; +} + +power_attr(pm_freeze_timeout); + +#endif /* CONFIG_FREEZER*/ + static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE @@ -575,6 +599,9 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP_DEBUG &pm_print_times_attr.attr, #endif +#endif +#ifdef CONFIG_FREEZER + &pm_freeze_timeout_attr.attr, #endif NULL, }; diff --git a/kernel/power/process.c b/kernel/power/process.c index d5a258b60c6f..98088e0e71e8 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -21,7 +21,7 @@ /* * Timeout for stopping processes */ -#define TIMEOUT (20 * HZ) +unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { @@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only) do_gettimeofday(&start); - end_time = jiffies + TIMEOUT; + end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); if (!user_only) freeze_workqueues_begin(); -- cgit v1.2.3 From 5d1d9a29bc0772abee765f09513779a2ef0ebbfd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 8 Feb 2013 15:24:07 +0000 Subject: clockevents: Fix generic broadcast for FEAT_C3STOP Commit 12ad100046: "clockevents: Add generic timer broadcast function" made tick_device_uses_broadcast set up the generic broadcast function for dummy devices (where !tick_device_is_functional(dev)), but neglected to set up the broadcast function for devices that stop in low power states (with the CLOCK_EVT_FEAT_C3STOP flag). When these devices enter low power states they will not have the generic broadcast function assigned, and will bring down the system when an attempt is made to broadcast to them. This patch ensures that the broadcast function is also assigned for devices which require broadcast in low power states. Reported-by: Stephen Warren Signed-off-by: Mark Rutland Tested-by: Stephen Warren Cc: linux-arm-kernel@lists.infradead.org Cc: nico@linaro.org Cc: Marc.Zyngier@arm.com Cc: Will.Deacon@arm.com Cc: santosh.shilimkar@ti.com Cc: john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f726537d24eb..2fb8cb88df8d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -92,6 +92,17 @@ static void err_broadcast(const struct cpumask *mask) pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); } +static void tick_device_setup_broadcast_func(struct clock_event_device *dev) +{ + if (!dev->broadcast) + dev->broadcast = tick_broadcast; + if (!dev->broadcast) { + pr_warn_once("%s depends on broadcast, but no broadcast function available\n", + dev->name); + dev->broadcast = err_broadcast; + } +} + /* * Check, if the device is disfunctional and a place holder, which * needs to be handled by the broadcast device. @@ -111,13 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; - if (!dev->broadcast) - dev->broadcast = tick_broadcast; - if (!dev->broadcast) { - pr_warn_once("%s depends on broadcast, but no broadcast function available\n", - dev->name); - dev->broadcast = err_broadcast; - } + tick_device_setup_broadcast_func(dev); cpumask_set_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; @@ -129,9 +134,10 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_clear_oneshot(cpu); + } else { + tick_device_setup_broadcast_func(dev); } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -- cgit v1.2.3 From f431b634f24d099872e78acc356c7fd35913b36b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 Feb 2013 16:18:59 -0500 Subject: tracing/syscalls: Allow archs to ignore tracing compat syscalls The tracing of ia32 compat system calls has been a bit of a pain as they use different system call numbers than the 64bit equivalents. I wrote a simple 'lls' program that lists files. I compiled it as a i686 ELF binary and ran it under a x86_64 box. This is the result: echo 0 > /debug/tracing/tracing_on echo 1 > /debug/tracing/events/syscalls/enable echo 1 > /debug/tracing/tracing_on ; ./lls ; echo 0 > /debug/tracing/tracing_on grep lls /debug/tracing/trace [.. skipping calls before TS_COMPAT is set ...] lls-1127 [005] d... 936.409188: sys_recvfrom(fd: 0, ubuf: 4d560fc4, size: 0, flags: 8048034, addr: 8, addr_len: f7700420) lls-1127 [005] d... 936.409190: sys_recvfrom -> 0x8a77000 lls-1127 [005] d... 936.409211: sys_lgetxattr(pathname: 0, name: 1000, value: 3, size: 22) lls-1127 [005] d... 936.409215: sys_lgetxattr -> 0xf76ff000 lls-1127 [005] d... 936.409223: sys_dup2(oldfd: 4d55ae9b, newfd: 4) lls-1127 [005] d... 936.409228: sys_dup2 -> 0xfffffffffffffffe lls-1127 [005] d... 936.409236: sys_newfstat(fd: 4d55b085, statbuf: 80000) lls-1127 [005] d... 936.409242: sys_newfstat -> 0x3 lls-1127 [005] d... 936.409243: sys_removexattr(pathname: 3, name: ffcd0060) lls-1127 [005] d... 936.409244: sys_removexattr -> 0x0 lls-1127 [005] d... 936.409245: sys_lgetxattr(pathname: 0, name: 19614, value: 1, size: 2) lls-1127 [005] d... 936.409248: sys_lgetxattr -> 0xf76e5000 lls-1127 [005] d... 936.409248: sys_newlstat(filename: 3, statbuf: 19614) lls-1127 [005] d... 936.409249: sys_newlstat -> 0x0 lls-1127 [005] d... 936.409262: sys_newfstat(fd: f76fb588, statbuf: 80000) lls-1127 [005] d... 936.409279: sys_newfstat -> 0x3 lls-1127 [005] d... 936.409279: sys_close(fd: 3) lls-1127 [005] d... 936.421550: sys_close -> 0x200 lls-1127 [005] d... 936.421558: sys_removexattr(pathname: 3, name: ffcd00d0) lls-1127 [005] d... 936.421560: sys_removexattr -> 0x0 lls-1127 [005] d... 936.421569: sys_lgetxattr(pathname: 4d564000, name: 1b1abc, value: 5, size: 802) lls-1127 [005] d... 936.421574: sys_lgetxattr -> 0x4d564000 lls-1127 [005] d... 936.421575: sys_capget(header: 4d70f000, dataptr: 1000) lls-1127 [005] d... 936.421580: sys_capget -> 0x0 lls-1127 [005] d... 936.421580: sys_lgetxattr(pathname: 4d710000, name: 3000, value: 3, size: 812) lls-1127 [005] d... 936.421589: sys_lgetxattr -> 0x4d710000 lls-1127 [005] d... 936.426130: sys_lgetxattr(pathname: 4d713000, name: 2abc, value: 3, size: 32) lls-1127 [005] d... 936.426141: sys_lgetxattr -> 0x4d713000 lls-1127 [005] d... 936.426145: sys_newlstat(filename: 3, statbuf: f76ff3f0) lls-1127 [005] d... 936.426146: sys_newlstat -> 0x0 lls-1127 [005] d... 936.431748: sys_lgetxattr(pathname: 0, name: 1000, value: 3, size: 22) Obviously I'm not calling newfstat with a fd of 4d55b085. The calls are obviously incorrect, and confusing. Other efforts have been made to fix this: https://lkml.org/lkml/2012/3/26/367 But the real solution is to rewrite the syscall internals and come up with a fixed solution. One that doesn't require all the kluge that the current solution has. Thus for now, instead of outputting incorrect data, simply ignore them. With this patch the changes now have: #> grep lls /debug/tracing/trace #> Compat system calls simply are not traced. If users need compat syscalls, then they should just use the raw syscall tracepoints. For an architecture to make their compat syscalls ignored, it must define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS (done in asm/ftrace.h) and also define an arch_trace_is_compat_syscall() function that will return true if the current task should ignore tracing the syscall. I want to stress that this change does not affect actual syscalls in any way, shape or form. It is only used within the tracing system and doesn't interfere with the syscall logic at all. The changes are consolidated nicely into trace_syscalls.c and asm/ftrace.h. I had to make one small modification to asm/thread_info.h and that was to remove the include of asm/ftrace.h. As asm/ftrace.h required the current_thread_info() it was causing include hell. That include was added back in 2008 when the function graph tracer was added: commit caf4b323 "tracing, x86: add low level support for ftrace return tracing" It does not need to be included there. Link: http://lkml.kernel.org/r/1360703939.21867.99.camel@gandalf.local.home Acked-by: H. Peter Anvin Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5329e13e74a1..7a809e321058 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ @@ -47,6 +48,38 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name } #endif +#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +/* + * Some architectures that allow for 32bit applications + * to run on a 64bit kernel, do not map the syscalls for + * the 32bit tasks the same as they do for 64bit tasks. + * + * *cough*x86*cough* + * + * In such a case, instead of reporting the wrong syscalls, + * simply ignore them. + * + * For an arch to ignore the compat syscalls it needs to + * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as + * define the function arch_trace_is_compat_syscall() to let + * the tracing system know that it should ignore it. + */ +static int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + if (unlikely(arch_trace_is_compat_syscall(regs))) + return -1; + + return syscall_get_nr(task, regs); +} +#else +static inline int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + return syscall_get_nr(task, regs); +} +#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { @@ -276,10 +309,10 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; - int size; int syscall_nr; + int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_enter_syscalls)) @@ -313,7 +346,7 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) struct ring_buffer *buffer; int syscall_nr; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_exit_syscalls)) @@ -502,7 +535,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int rctx; int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) @@ -578,7 +611,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int rctx; int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) -- cgit v1.2.3 From 1dd638149f1f9d7d7dbb32591d5c7c2a0ea36264 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Feb 2013 19:29:07 -0800 Subject: workqueue: fix is_chained_work() regression c9e7cf273f ("workqueue: move busy_hash from global_cwq to worker_pool") incorrectly converted is_chained_work() to use get_gcwq() inside for_each_gcwq_cpu() while removing get_gcwq(). As cwq might not exist for all possible workqueue CPUs, @cwq can be NULL and the following cwq deferences can lead to oops. Fix it by using for_each_cwq_cpu() instead, which is the better one to use anyway as we only need to check pools that the wq is associated with. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d6fdce12ca7e..0d26ab3aee59 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1167,7 +1167,7 @@ static bool is_chained_work(struct workqueue_struct *wq) unsigned long flags; unsigned int cpu; - for_each_wq_cpu(cpu) { + for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct worker_pool *pool = cwq->pool; struct worker *worker; -- cgit v1.2.3 From 8d03ecfe471802d6afe97da97722b6924533aa82 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Feb 2013 19:29:10 -0800 Subject: workqueue: reimplement is_chained_work() using current_wq_worker() is_chained_work() was added before current_wq_worker() and implemented its own ham-fisted way of finding out whether %current is a workqueue worker - it iterates through all possible workers. Drop the custom implementation and reimplement using current_wq_worker(). Signed-off-by: Tejun Heo --- kernel/workqueue.c | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0d26ab3aee59..ea7f696f1060 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1159,35 +1159,18 @@ static void insert_work(struct cpu_workqueue_struct *cwq, /* * Test whether @work is being queued from another work executing on the - * same workqueue. This is rather expensive and should only be used from - * cold paths. + * same workqueue. */ static bool is_chained_work(struct workqueue_struct *wq) { - unsigned long flags; - unsigned int cpu; - - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct worker_pool *pool = cwq->pool; - struct worker *worker; - struct hlist_node *pos; - int i; + struct worker *worker; - spin_lock_irqsave(&pool->lock, flags); - for_each_busy_worker(worker, i, pos, pool) { - if (worker->task != current) - continue; - spin_unlock_irqrestore(&pool->lock, flags); - /* - * I'm @worker, no locking necessary. See if @work - * is headed to the same workqueue. - */ - return worker->current_cwq->wq == wq; - } - spin_unlock_irqrestore(&pool->lock, flags); - } - return false; + worker = current_wq_worker(); + /* + * Return %true iff I'm a worker execuing a work item on @wq. If + * I'm @worker, it's safe to dereference it without locking. + */ + return worker && worker->current_cwq->wq == wq; } static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, -- cgit v1.2.3 From 112202d9098aae2c36436e5178c0cf3ced423c7b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Feb 2013 19:29:12 -0800 Subject: workqueue: rename cpu_workqueue to pool_workqueue workqueue has moved away from global_cwqs to worker_pools and with the scheduled custom worker pools, wforkqueues will be associated with pools which don't have anything to do with CPUs. The workqueue code went through significant amount of changes recently and mass renaming isn't likely to hurt much additionally. Let's replace 'cpu' with 'pool' so that it reflects the current design. * s/struct cpu_workqueue_struct/struct pool_workqueue/ * s/cpu_wq/pool_wq/ * s/cwq/pwq/ This patch is purely cosmetic. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 433 ++++++++++++++++++++++---------------------- kernel/workqueue_internal.h | 2 +- 2 files changed, 217 insertions(+), 218 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ea7f696f1060..0f1a264d0f22 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -154,11 +154,12 @@ struct worker_pool { } ____cacheline_aligned_in_smp; /* - * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of - * work_struct->data are used for flags and thus cwqs need to be - * aligned at two's power of the number of flag bits. + * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS + * of work_struct->data are used for flags and the remaining high bits + * point to the pwq; thus, pwqs need to be aligned at two's power of the + * number of flag bits. */ -struct cpu_workqueue_struct { +struct pool_workqueue { struct worker_pool *pool; /* I: the associated pool */ struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ @@ -207,16 +208,16 @@ typedef unsigned long mayday_mask_t; struct workqueue_struct { unsigned int flags; /* W: WQ_* flags */ union { - struct cpu_workqueue_struct __percpu *pcpu; - struct cpu_workqueue_struct *single; + struct pool_workqueue __percpu *pcpu; + struct pool_workqueue *single; unsigned long v; - } cpu_wq; /* I: cwq's */ + } pool_wq; /* I: pwq's */ struct list_head list; /* W: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ int work_color; /* F: current work color */ int flush_color; /* F: current flush color */ - atomic_t nr_cwqs_to_flush; /* flush in progress */ + atomic_t nr_pwqs_to_flush; /* flush in progress */ struct wq_flusher *first_flusher; /* F: first flusher */ struct list_head flusher_queue; /* F: flush waiters */ struct list_head flusher_overflow; /* F: flush overflow list */ @@ -225,7 +226,7 @@ struct workqueue_struct { struct worker *rescuer; /* I: rescue worker */ int nr_drainers; /* W: drain in progress */ - int saved_max_active; /* W: saved cwq max_active */ + int saved_max_active; /* W: saved pwq max_active */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -268,7 +269,7 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, return WORK_CPU_END; } -static inline int __next_cwq_cpu(int cpu, const struct cpumask *mask, +static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, struct workqueue_struct *wq) { return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); @@ -284,7 +285,7 @@ static inline int __next_cwq_cpu(int cpu, const struct cpumask *mask, * * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND - * for_each_cwq_cpu() : possible CPUs for bound workqueues, + * for_each_pwq_cpu() : possible CPUs for bound workqueues, * WORK_CPU_UNBOUND for unbound workqueues */ #define for_each_wq_cpu(cpu) \ @@ -297,10 +298,10 @@ static inline int __next_cwq_cpu(int cpu, const struct cpumask *mask, (cpu) < WORK_CPU_END; \ (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) -#define for_each_cwq_cpu(cpu, wq) \ - for ((cpu) = __next_cwq_cpu(-1, cpu_possible_mask, (wq)); \ +#define for_each_pwq_cpu(cpu, wq) \ + for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ (cpu) < WORK_CPU_END; \ - (cpu) = __next_cwq_cpu((cpu), cpu_possible_mask, (wq))) + (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -479,14 +480,14 @@ static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) return &pools[highpri]; } -static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, - struct workqueue_struct *wq) +static struct pool_workqueue *get_pwq(unsigned int cpu, + struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { if (likely(cpu < nr_cpu_ids)) - return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); + return per_cpu_ptr(wq->pool_wq.pcpu, cpu); } else if (likely(cpu == WORK_CPU_UNBOUND)) - return wq->cpu_wq.single; + return wq->pool_wq.single; return NULL; } @@ -507,18 +508,18 @@ static int work_next_color(int color) } /* - * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data - * contain the pointer to the queued cwq. Once execution starts, the flag + * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data + * contain the pointer to the queued pwq. Once execution starts, the flag * is cleared and the high bits contain OFFQ flags and pool ID. * - * set_work_cwq(), set_work_pool_and_clear_pending(), mark_work_canceling() - * and clear_work_data() can be used to set the cwq, pool or clear + * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling() + * and clear_work_data() can be used to set the pwq, pool or clear * work->data. These functions should only be called while the work is * owned - ie. while the PENDING bit is set. * - * get_work_pool() and get_work_cwq() can be used to obtain the pool or cwq + * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq * corresponding to a work. Pool is available once the work has been - * queued anywhere after initialization until it is sync canceled. cwq is + * queued anywhere after initialization until it is sync canceled. pwq is * available only while the work item is queued. * * %WORK_OFFQ_CANCELING is used to mark a work item which is being @@ -533,12 +534,11 @@ static inline void set_work_data(struct work_struct *work, unsigned long data, atomic_long_set(&work->data, data | flags | work_static(work)); } -static void set_work_cwq(struct work_struct *work, - struct cpu_workqueue_struct *cwq, +static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, unsigned long extra_flags) { - set_work_data(work, (unsigned long)cwq, - WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); + set_work_data(work, (unsigned long)pwq, + WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags); } static void set_work_pool_and_keep_pending(struct work_struct *work, @@ -567,11 +567,11 @@ static void clear_work_data(struct work_struct *work) set_work_data(work, WORK_STRUCT_NO_POOL, 0); } -static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) +static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - if (data & WORK_STRUCT_CWQ) + if (data & WORK_STRUCT_PWQ) return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); else return NULL; @@ -589,8 +589,8 @@ static struct worker_pool *get_work_pool(struct work_struct *work) struct worker_pool *pool; int pool_id; - if (data & WORK_STRUCT_CWQ) - return ((struct cpu_workqueue_struct *) + if (data & WORK_STRUCT_PWQ) + return ((struct pool_workqueue *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool; pool_id = data >> WORK_OFFQ_POOL_SHIFT; @@ -613,8 +613,8 @@ static int get_work_pool_id(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - if (data & WORK_STRUCT_CWQ) - return ((struct cpu_workqueue_struct *) + if (data & WORK_STRUCT_PWQ) + return ((struct pool_workqueue *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; return data >> WORK_OFFQ_POOL_SHIFT; @@ -632,7 +632,7 @@ static bool work_is_canceling(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); + return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING); } /* @@ -961,67 +961,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, *nextp = n; } -static void cwq_activate_delayed_work(struct work_struct *work) +static void pwq_activate_delayed_work(struct work_struct *work) { - struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); - move_linked_works(work, &cwq->pool->worklist, NULL); + move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); - cwq->nr_active++; + pwq->nr_active++; } -static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +static void pwq_activate_first_delayed(struct pool_workqueue *pwq) { - struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct *work = list_first_entry(&pwq->delayed_works, struct work_struct, entry); - cwq_activate_delayed_work(work); + pwq_activate_delayed_work(work); } /** - * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight - * @cwq: cwq of interest + * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight + * @pwq: pwq of interest * @color: color of work which left the queue * * A work either has completed or is removed from pending queue, - * decrement nr_in_flight of its cwq and handle workqueue flushing. + * decrement nr_in_flight of its pwq and handle workqueue flushing. * * CONTEXT: * spin_lock_irq(pool->lock). */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { /* ignore uncolored works */ if (color == WORK_NO_COLOR) return; - cwq->nr_in_flight[color]--; + pwq->nr_in_flight[color]--; - cwq->nr_active--; - if (!list_empty(&cwq->delayed_works)) { + pwq->nr_active--; + if (!list_empty(&pwq->delayed_works)) { /* one down, submit a delayed one */ - if (cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + if (pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); } /* is flush in progress and are we at the flushing tip? */ - if (likely(cwq->flush_color != color)) + if (likely(pwq->flush_color != color)) return; /* are there still in-flight works? */ - if (cwq->nr_in_flight[color]) + if (pwq->nr_in_flight[color]) return; - /* this cwq is done, clear flush_color */ - cwq->flush_color = -1; + /* this pwq is done, clear flush_color */ + pwq->flush_color = -1; /* - * If this was the last cwq, wake up the first flusher. It + * If this was the last pwq, wake up the first flusher. It * will handle the rest. */ - if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) - complete(&cwq->wq->first_flusher->done); + if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) + complete(&pwq->wq->first_flusher->done); } /** @@ -1053,7 +1053,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, unsigned long *flags) { struct worker_pool *pool; - struct cpu_workqueue_struct *cwq; + struct pool_workqueue *pwq; local_irq_save(*flags); @@ -1084,31 +1084,31 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, spin_lock(&pool->lock); /* - * work->data is guaranteed to point to cwq only while the work - * item is queued on cwq->wq, and both updating work->data to point - * to cwq on queueing and to pool on dequeueing are done under - * cwq->pool->lock. This in turn guarantees that, if work->data - * points to cwq which is associated with a locked pool, the work + * work->data is guaranteed to point to pwq only while the work + * item is queued on pwq->wq, and both updating work->data to point + * to pwq on queueing and to pool on dequeueing are done under + * pwq->pool->lock. This in turn guarantees that, if work->data + * points to pwq which is associated with a locked pool, the work * item is currently queued on that pool. */ - cwq = get_work_cwq(work); - if (cwq && cwq->pool == pool) { + pwq = get_work_pwq(work); + if (pwq && pwq->pool == pool) { debug_work_deactivate(work); /* * A delayed work item cannot be grabbed directly because * it might have linked NO_COLOR work items which, if left - * on the delayed_list, will confuse cwq->nr_active + * on the delayed_list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. */ if (*work_data_bits(work) & WORK_STRUCT_DELAYED) - cwq_activate_delayed_work(work); + pwq_activate_delayed_work(work); list_del_init(&work->entry); - cwq_dec_nr_in_flight(get_work_cwq(work), get_work_color(work)); + pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); - /* work->data points to cwq iff queued, point to pool */ + /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); spin_unlock(&pool->lock); @@ -1125,25 +1125,24 @@ fail: /** * insert_work - insert a work into a pool - * @cwq: cwq @work belongs to + * @pwq: pwq @work belongs to * @work: work to insert * @head: insertion point * @extra_flags: extra WORK_STRUCT_* flags to set * - * Insert @work which belongs to @cwq after @head. @extra_flags is or'd to + * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to * work_struct flags. * * CONTEXT: * spin_lock_irq(pool->lock). */ -static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head, - unsigned int extra_flags) +static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, + struct list_head *head, unsigned int extra_flags) { - struct worker_pool *pool = cwq->pool; + struct worker_pool *pool = pwq->pool; /* we own @work, set data and link */ - set_work_cwq(work, cwq, extra_flags); + set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); /* @@ -1170,13 +1169,13 @@ static bool is_chained_work(struct workqueue_struct *wq) * Return %true iff I'm a worker execuing a work item on @wq. If * I'm @worker, it's safe to dereference it without locking. */ - return worker && worker->current_cwq->wq == wq; + return worker && worker->current_pwq->wq == wq; } static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { - struct cpu_workqueue_struct *cwq; + struct pool_workqueue *pwq; struct list_head *worklist; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1196,7 +1195,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, WARN_ON_ONCE(!is_chained_work(wq))) return; - /* determine the cwq to use */ + /* determine the pwq to use */ if (!(wq->flags & WQ_UNBOUND)) { struct worker_pool *last_pool; @@ -1209,54 +1208,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, * work needs to be queued on that cpu to guarantee * non-reentrancy. */ - cwq = get_cwq(cpu, wq); + pwq = get_pwq(cpu, wq); last_pool = get_work_pool(work); - if (last_pool && last_pool != cwq->pool) { + if (last_pool && last_pool != pwq->pool) { struct worker *worker; spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); - if (worker && worker->current_cwq->wq == wq) { - cwq = get_cwq(last_pool->cpu, wq); + if (worker && worker->current_pwq->wq == wq) { + pwq = get_pwq(last_pool->cpu, wq); } else { /* meh... not running there, queue here */ spin_unlock(&last_pool->lock); - spin_lock(&cwq->pool->lock); + spin_lock(&pwq->pool->lock); } } else { - spin_lock(&cwq->pool->lock); + spin_lock(&pwq->pool->lock); } } else { - cwq = get_cwq(WORK_CPU_UNBOUND, wq); - spin_lock(&cwq->pool->lock); + pwq = get_pwq(WORK_CPU_UNBOUND, wq); + spin_lock(&pwq->pool->lock); } - /* cwq determined, queue */ - trace_workqueue_queue_work(req_cpu, cwq, work); + /* pwq determined, queue */ + trace_workqueue_queue_work(req_cpu, pwq, work); if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&cwq->pool->lock); + spin_unlock(&pwq->pool->lock); return; } - cwq->nr_in_flight[cwq->work_color]++; - work_flags = work_color_to_flags(cwq->work_color); + pwq->nr_in_flight[pwq->work_color]++; + work_flags = work_color_to_flags(pwq->work_color); - if (likely(cwq->nr_active < cwq->max_active)) { + if (likely(pwq->nr_active < pwq->max_active)) { trace_workqueue_activate_work(work); - cwq->nr_active++; - worklist = &cwq->pool->worklist; + pwq->nr_active++; + worklist = &pwq->pool->worklist; } else { work_flags |= WORK_STRUCT_DELAYED; - worklist = &cwq->delayed_works; + worklist = &pwq->delayed_works; } - insert_work(cwq, work, worklist, work_flags); + insert_work(pwq, work, worklist, work_flags); - spin_unlock(&cwq->pool->lock); + spin_unlock(&pwq->pool->lock); } /** @@ -1661,14 +1660,14 @@ static void rebind_workers(struct worker_pool *pool) /* * wq doesn't really matter but let's keep @worker->pool - * and @cwq->pool consistent for sanity. + * and @pwq->pool consistent for sanity. */ if (std_worker_pool_pri(worker->pool)) wq = system_highpri_wq; else wq = system_wq; - insert_work(get_cwq(pool->cpu, wq), rebind_work, + insert_work(get_pwq(pool->cpu, wq), rebind_work, worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } @@ -1845,15 +1844,15 @@ static void idle_worker_timeout(unsigned long __pool) static bool send_mayday(struct work_struct *work) { - struct cpu_workqueue_struct *cwq = get_work_cwq(work); - struct workqueue_struct *wq = cwq->wq; + struct pool_workqueue *pwq = get_work_pwq(work); + struct workqueue_struct *wq = pwq->wq; unsigned int cpu; if (!(wq->flags & WQ_RESCUER)) return false; /* mayday mayday mayday */ - cpu = cwq->pool->cpu; + cpu = pwq->pool->cpu; /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ if (cpu == WORK_CPU_UNBOUND) cpu = 0; @@ -2082,9 +2081,9 @@ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) __acquires(&pool->lock) { - struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; - bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; + bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; int work_color; struct worker *collision; #ifdef CONFIG_LOCKDEP @@ -2125,7 +2124,7 @@ __acquires(&pool->lock) hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); worker->current_work = work; worker->current_func = work->func; - worker->current_cwq = cwq; + worker->current_pwq = pwq; work_color = get_work_color(work); list_del_init(&work->entry); @@ -2154,7 +2153,7 @@ __acquires(&pool->lock) spin_unlock_irq(&pool->lock); - lock_map_acquire_read(&cwq->wq->lockdep_map); + lock_map_acquire_read(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); worker->current_func(work); @@ -2164,7 +2163,7 @@ __acquires(&pool->lock) */ trace_workqueue_execute_end(work); lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); + lock_map_release(&pwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" @@ -2185,8 +2184,8 @@ __acquires(&pool->lock) hash_del(&worker->hentry); worker->current_work = NULL; worker->current_func = NULL; - worker->current_cwq = NULL; - cwq_dec_nr_in_flight(cwq, work_color); + worker->current_pwq = NULL; + pwq_dec_nr_in_flight(pwq, work_color); } /** @@ -2353,8 +2352,8 @@ repeat: */ for_each_mayday_cpu(cpu, wq->mayday_mask) { unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; - struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); - struct worker_pool *pool = cwq->pool; + struct pool_workqueue *pwq = get_pwq(tcpu, wq); + struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); @@ -2370,7 +2369,7 @@ repeat: */ BUG_ON(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) - if (get_work_cwq(work) == cwq) + if (get_work_pwq(work) == pwq) move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); @@ -2405,7 +2404,7 @@ static void wq_barrier_func(struct work_struct *work) /** * insert_wq_barrier - insert a barrier work - * @cwq: cwq to insert barrier into + * @pwq: pwq to insert barrier into * @barr: wq_barrier to insert * @target: target work to attach @barr to * @worker: worker currently executing @target, NULL if @target is not executing @@ -2422,12 +2421,12 @@ static void wq_barrier_func(struct work_struct *work) * after a work with LINKED flag set. * * Note that when @worker is non-NULL, @target may be modified - * underneath us, so we can't reliably determine cwq from @target. + * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: * spin_lock_irq(pool->lock). */ -static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, +static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { @@ -2460,23 +2459,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, } debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head, + insert_work(pwq, &barr->work, head, work_color_to_flags(WORK_NO_COLOR) | linked); } /** - * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing + * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing * @wq: workqueue being flushed * @flush_color: new flush color, < 0 for no-op * @work_color: new work color, < 0 for no-op * - * Prepare cwqs for workqueue flushing. + * Prepare pwqs for workqueue flushing. * - * If @flush_color is non-negative, flush_color on all cwqs should be - * -1. If no cwq has in-flight commands at the specified color, all - * cwq->flush_color's stay at -1 and %false is returned. If any cwq - * has in flight commands, its cwq->flush_color is set to - * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq + * If @flush_color is non-negative, flush_color on all pwqs should be + * -1. If no pwq has in-flight commands at the specified color, all + * pwq->flush_color's stay at -1 and %false is returned. If any pwq + * has in flight commands, its pwq->flush_color is set to + * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq * wakeup logic is armed and %true is returned. * * The caller should have initialized @wq->first_flusher prior to @@ -2484,7 +2483,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, * @flush_color is negative, no flush color update is done and %false * is returned. * - * If @work_color is non-negative, all cwqs should have the same + * If @work_color is non-negative, all pwqs should have the same * work_color which is previous to @work_color and all will be * advanced to @work_color. * @@ -2495,42 +2494,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, * %true if @flush_color >= 0 and there's something to flush. %false * otherwise. */ -static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, +static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) { bool wait = false; unsigned int cpu; if (flush_color >= 0) { - BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); - atomic_set(&wq->nr_cwqs_to_flush, 1); + BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); + atomic_set(&wq->nr_pwqs_to_flush, 1); } - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct worker_pool *pool = cwq->pool; + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); + struct worker_pool *pool = pwq->pool; spin_lock_irq(&pool->lock); if (flush_color >= 0) { - BUG_ON(cwq->flush_color != -1); + BUG_ON(pwq->flush_color != -1); - if (cwq->nr_in_flight[flush_color]) { - cwq->flush_color = flush_color; - atomic_inc(&wq->nr_cwqs_to_flush); + if (pwq->nr_in_flight[flush_color]) { + pwq->flush_color = flush_color; + atomic_inc(&wq->nr_pwqs_to_flush); wait = true; } } if (work_color >= 0) { - BUG_ON(work_color != work_next_color(cwq->work_color)); - cwq->work_color = work_color; + BUG_ON(work_color != work_next_color(pwq->work_color)); + pwq->work_color = work_color; } spin_unlock_irq(&pool->lock); } - if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); return wait; @@ -2581,7 +2580,7 @@ void flush_workqueue(struct workqueue_struct *wq) wq->first_flusher = &this_flusher; - if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, + if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, wq->work_color)) { /* nothing to flush, done */ wq->flush_color = next_color; @@ -2592,7 +2591,7 @@ void flush_workqueue(struct workqueue_struct *wq) /* wait in queue */ BUG_ON(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); - flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } } else { /* @@ -2659,7 +2658,7 @@ void flush_workqueue(struct workqueue_struct *wq) list_splice_tail_init(&wq->flusher_overflow, &wq->flusher_queue); - flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } if (list_empty(&wq->flusher_queue)) { @@ -2669,7 +2668,7 @@ void flush_workqueue(struct workqueue_struct *wq) /* * Need to flush more colors. Make the next flusher - * the new first flusher and arm cwqs. + * the new first flusher and arm pwqs. */ BUG_ON(wq->flush_color == wq->work_color); BUG_ON(wq->flush_color != next->flush_color); @@ -2677,7 +2676,7 @@ void flush_workqueue(struct workqueue_struct *wq) list_del_init(&next->list); wq->first_flusher = next; - if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) + if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) break; /* @@ -2720,13 +2719,13 @@ void drain_workqueue(struct workqueue_struct *wq) reflush: flush_workqueue(wq); - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); bool drained; - spin_lock_irq(&cwq->pool->lock); - drained = !cwq->nr_active && list_empty(&cwq->delayed_works); - spin_unlock_irq(&cwq->pool->lock); + spin_lock_irq(&pwq->pool->lock); + drained = !pwq->nr_active && list_empty(&pwq->delayed_works); + spin_unlock_irq(&pwq->pool->lock); if (drained) continue; @@ -2749,7 +2748,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) { struct worker *worker = NULL; struct worker_pool *pool; - struct cpu_workqueue_struct *cwq; + struct pool_workqueue *pwq; might_sleep(); pool = get_work_pool(work); @@ -2758,18 +2757,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ - cwq = get_work_cwq(work); - if (cwq) { - if (unlikely(cwq->pool != pool)) + pwq = get_work_pwq(work); + if (pwq) { + if (unlikely(pwq->pool != pool)) goto already_gone; } else { worker = find_worker_executing_work(pool, work); if (!worker) goto already_gone; - cwq = worker->current_cwq; + pwq = worker->current_pwq; } - insert_wq_barrier(cwq, barr, work, worker); + insert_wq_barrier(pwq, barr, work, worker); spin_unlock_irq(&pool->lock); /* @@ -2778,11 +2777,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * flusher is not running on the same workqueue by verifying write * access. */ - if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) - lock_map_acquire(&cwq->wq->lockdep_map); + if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) + lock_map_acquire(&pwq->wq->lockdep_map); else - lock_map_acquire_read(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); + lock_map_acquire_read(&pwq->wq->lockdep_map); + lock_map_release(&pwq->wq->lockdep_map); return true; already_gone: @@ -3092,46 +3091,46 @@ int keventd_up(void) return system_wq != NULL; } -static int alloc_cwqs(struct workqueue_struct *wq) +static int alloc_pwqs(struct workqueue_struct *wq) { /* - * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. + * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. * Make sure that the alignment isn't lower than that of * unsigned long long. */ - const size_t size = sizeof(struct cpu_workqueue_struct); + const size_t size = sizeof(struct pool_workqueue); const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)); if (!(wq->flags & WQ_UNBOUND)) - wq->cpu_wq.pcpu = __alloc_percpu(size, align); + wq->pool_wq.pcpu = __alloc_percpu(size, align); else { void *ptr; /* - * Allocate enough room to align cwq and put an extra + * Allocate enough room to align pwq and put an extra * pointer at the end pointing back to the originally * allocated pointer which will be used for free. */ ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); if (ptr) { - wq->cpu_wq.single = PTR_ALIGN(ptr, align); - *(void **)(wq->cpu_wq.single + 1) = ptr; + wq->pool_wq.single = PTR_ALIGN(ptr, align); + *(void **)(wq->pool_wq.single + 1) = ptr; } } /* just in case, make sure it's actually aligned */ - BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); - return wq->cpu_wq.v ? 0 : -ENOMEM; + BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); + return wq->pool_wq.v ? 0 : -ENOMEM; } -static void free_cwqs(struct workqueue_struct *wq) +static void free_pwqs(struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_wq.pcpu); - else if (wq->cpu_wq.single) { - /* the pointer to free is stored right after the cwq */ - kfree(*(void **)(wq->cpu_wq.single + 1)); + free_percpu(wq->pool_wq.pcpu); + else if (wq->pool_wq.single) { + /* the pointer to free is stored right after the pwq */ + kfree(*(void **)(wq->pool_wq.single + 1)); } } @@ -3185,25 +3184,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); - atomic_set(&wq->nr_cwqs_to_flush, 0); + atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); - if (alloc_cwqs(wq) < 0) + if (alloc_pwqs(wq) < 0) goto err; - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); - BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); - cwq->wq = wq; - cwq->flush_color = -1; - cwq->max_active = max_active; - INIT_LIST_HEAD(&cwq->delayed_works); + BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); + pwq->wq = wq; + pwq->flush_color = -1; + pwq->max_active = max_active; + INIT_LIST_HEAD(&pwq->delayed_works); } if (flags & WQ_RESCUER) { @@ -3234,8 +3233,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, spin_lock(&workqueue_lock); if (workqueue_freezing && wq->flags & WQ_FREEZABLE) - for_each_cwq_cpu(cpu, wq) - get_cwq(cpu, wq)->max_active = 0; + for_each_pwq_cpu(cpu, wq) + get_pwq(cpu, wq)->max_active = 0; list_add(&wq->list, &workqueues); @@ -3244,7 +3243,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, return wq; err: if (wq) { - free_cwqs(wq); + free_pwqs(wq); free_mayday_mask(wq->mayday_mask); kfree(wq->rescuer); kfree(wq); @@ -3275,14 +3274,14 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock(&workqueue_lock); /* sanity check */ - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); int i; for (i = 0; i < WORK_NR_COLORS; i++) - BUG_ON(cwq->nr_in_flight[i]); - BUG_ON(cwq->nr_active); - BUG_ON(!list_empty(&cwq->delayed_works)); + BUG_ON(pwq->nr_in_flight[i]); + BUG_ON(pwq->nr_active); + BUG_ON(!list_empty(&pwq->delayed_works)); } if (wq->flags & WQ_RESCUER) { @@ -3291,29 +3290,29 @@ void destroy_workqueue(struct workqueue_struct *wq) kfree(wq->rescuer); } - free_cwqs(wq); + free_pwqs(wq); kfree(wq); } EXPORT_SYMBOL_GPL(destroy_workqueue); /** - * cwq_set_max_active - adjust max_active of a cwq - * @cwq: target cpu_workqueue_struct + * pwq_set_max_active - adjust max_active of a pwq + * @pwq: target pool_workqueue * @max_active: new max_active value. * - * Set @cwq->max_active to @max_active and activate delayed works if + * Set @pwq->max_active to @max_active and activate delayed works if * increased. * * CONTEXT: * spin_lock_irq(pool->lock). */ -static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) +static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) { - cwq->max_active = max_active; + pwq->max_active = max_active; - while (!list_empty(&cwq->delayed_works) && - cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + while (!list_empty(&pwq->delayed_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); } /** @@ -3336,15 +3335,15 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->saved_max_active = max_active; - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct worker_pool *pool = cwq->pool; + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); + struct worker_pool *pool = pwq->pool; spin_lock_irq(&pool->lock); if (!(wq->flags & WQ_FREEZABLE) || !(pool->flags & POOL_FREEZING)) - cwq_set_max_active(cwq, max_active); + pwq_set_max_active(pwq, max_active); spin_unlock_irq(&pool->lock); } @@ -3367,9 +3366,9 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); */ bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct pool_workqueue *pwq = get_pwq(cpu, wq); - return !list_empty(&cwq->delayed_works); + return !list_empty(&pwq->delayed_works); } EXPORT_SYMBOL_GPL(workqueue_congested); @@ -3408,7 +3407,7 @@ EXPORT_SYMBOL_GPL(work_busy); * CPU hotplug. * * There are two challenges in supporting CPU hotplug. Firstly, there - * are a lot of assumptions on strong associations among work, cwq and + * are a lot of assumptions on strong associations among work, pwq and * pool which make migrating pending and scheduled works very * difficult to implement without impacting hot paths. Secondly, * worker pools serve mix of short, long and very long running works making @@ -3612,11 +3611,11 @@ void freeze_workqueues_begin(void) pool->flags |= POOL_FREEZING; list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct pool_workqueue *pwq = get_pwq(cpu, wq); - if (cwq && cwq->pool == pool && + if (pwq && pwq->pool == pool && (wq->flags & WQ_FREEZABLE)) - cwq->max_active = 0; + pwq->max_active = 0; } spin_unlock_irq(&pool->lock); @@ -3655,13 +3654,13 @@ bool freeze_workqueues_busy(void) * to peek without lock. */ list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct pool_workqueue *pwq = get_pwq(cpu, wq); - if (!cwq || !(wq->flags & WQ_FREEZABLE)) + if (!pwq || !(wq->flags & WQ_FREEZABLE)) continue; - BUG_ON(cwq->nr_active < 0); - if (cwq->nr_active) { + BUG_ON(pwq->nr_active < 0); + if (pwq->nr_active) { busy = true; goto out_unlock; } @@ -3701,14 +3700,14 @@ void thaw_workqueues(void) pool->flags &= ~POOL_FREEZING; list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct pool_workqueue *pwq = get_pwq(cpu, wq); - if (!cwq || cwq->pool != pool || + if (!pwq || pwq->pool != pool || !(wq->flags & WQ_FREEZABLE)) continue; /* restore max_active and repopulate worklist */ - cwq_set_max_active(cwq, wq->saved_max_active); + pwq_set_max_active(pwq, wq->saved_max_active); } wake_up_worker(pool); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 328be4a269aa..07650264ec15 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -28,7 +28,7 @@ struct worker { struct work_struct *current_work; /* L: work being processed */ work_func_t current_func; /* L: current_work's fn */ - struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ + struct pool_workqueue *current_pwq; /* L: current_work's pwq */ struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ -- cgit v1.2.3 From e9b04b5b67ec628a5e9a312e14b6864f8f73ba12 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 20 Nov 2012 11:14:10 -0500 Subject: make do_sigaltstack() static Signed-off-by: Al Viro --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 87c09e3061d2..6919050c8156 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3129,7 +3129,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) return 0; } -int +static int do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) { stack_t oss; -- cgit v1.2.3 From d64008a8f30e0b381b292788ec6f3ee509b3bb40 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Nov 2012 23:12:10 -0500 Subject: burying unused conditionals __ARCH_WANT_SYS_RT_SIGACTION, __ARCH_WANT_SYS_RT_SIGSUSPEND, __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND, __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL - not used anymore CONFIG_GENERIC_{SIGALTSTACK,COMPAT_RT_SIG{ACTION,QUEUEINFO,PENDING,PROCMASK}} - can be assumed always set. --- kernel/signal.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6919050c8156..2b8282bb487c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2623,7 +2623,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, } #ifdef CONFIG_COMPAT -#ifdef CONFIG_GENERIC_COMPAT_RT_SIGPROCMASK COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { @@ -2661,7 +2660,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, #endif } #endif -#endif static int do_sigpending(void *set, unsigned long sigsetsize) { @@ -2694,7 +2692,6 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) } #ifdef CONFIG_COMPAT -#ifdef CONFIG_GENERIC_COMPAT_RT_SIGPENDING COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { @@ -2714,7 +2711,6 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, #endif } #endif -#endif #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER @@ -3024,7 +3020,6 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, } #ifdef CONFIG_COMPAT -#ifdef CONFIG_GENERIC_COMPAT_RT_SIGQUEUEINFO COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, compat_pid_t, pid, int, sig, @@ -3037,7 +3032,6 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, return do_rt_sigqueueinfo(pid, sig, &info); } #endif -#endif static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) { @@ -3194,12 +3188,10 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s out: return error; } -#ifdef CONFIG_GENERIC_SIGALTSTACK SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) { return do_sigaltstack(uss, uoss, current_user_stack_pointer()); } -#endif int restore_altstack(const stack_t __user *uss) { @@ -3217,7 +3209,6 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) } #ifdef CONFIG_COMPAT -#ifdef CONFIG_GENERIC_SIGALTSTACK COMPAT_SYSCALL_DEFINE2(sigaltstack, const compat_stack_t __user *, uss_ptr, compat_stack_t __user *, uoss_ptr) @@ -3267,7 +3258,6 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) __put_user(t->sas_ss_size, &uss->ss_size); } #endif -#endif #ifdef __ARCH_WANT_SYS_SIGPENDING @@ -3368,7 +3358,6 @@ out: return ret; } #ifdef CONFIG_COMPAT -#ifdef CONFIG_GENERIC_COMPAT_RT_SIGACTION COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct compat_sigaction __user *, act, struct compat_sigaction __user *, oact, @@ -3415,7 +3404,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, return ret; } #endif -#endif #endif /* !CONFIG_ODD_RT_SIGACTION */ #ifdef CONFIG_OLD_SIGACTION -- cgit v1.2.3 From 7d7e499f7333f68b7e7f67d14b9c1480913b4afb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Jan 2013 12:11:12 +0000 Subject: smpboot: Allow selfparking per cpu threads The stop machine threads are still killed when a cpu goes offline. The reason is that the thread is used to bring the cpu down, so it can't be parked along with the other per cpu threads. Allow a per cpu thread to be excluded from automatic parking, so it can park itself once it's done Add a create callback function as well. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Rusty Russell Cc: Paul McKenney Cc: Srivatsa S. Bhat Cc: Arjan van de Veen Cc: Paul Turner Cc: Richard Weinberger Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130131120741.553993267@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/smpboot.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d6c5fc054242..d4abac261779 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -183,9 +183,10 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kfree(td); return PTR_ERR(tsk); } - get_task_struct(tsk); *per_cpu_ptr(ht->store, cpu) = tsk; + if (ht->create) + ht->create(cpu); return 0; } @@ -225,7 +226,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (tsk) + if (tsk && !ht->selfparking) kthread_park(tsk); } -- cgit v1.2.3 From 860a0ffaa3e1a9cf0ebb5f43d6a2a2ce67463e93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Jan 2013 12:11:13 +0000 Subject: stop_machine: Store task reference in a separate per cpu variable To allow the stopper thread being managed by the smpboot thread infrastructure separate out the task storage from the stopper data structure. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Rusty Russell Cc: Paul McKenney Cc: Srivatsa S. Bhat Cc: Arjan van de Veen Cc: Paul Turner Cc: Richard Weinberger Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130131120741.626690384@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/stop_machine.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2f194e965715..aaac68c5c3be 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -37,10 +37,10 @@ struct cpu_stopper { spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ - struct task_struct *thread; /* stopper thread */ }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); +static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); static bool stop_machine_initialized = false; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) @@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) } /* queue @work to @stopper. if offline, @work is completed immediately */ -static void cpu_stop_queue_work(struct cpu_stopper *stopper, - struct cpu_stop_work *work) +static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + struct task_struct *p = per_cpu(cpu_stopper_task, cpu); + unsigned long flags; spin_lock_irqsave(&stopper->lock, flags); if (stopper->enabled) { list_add_tail(&work->list, &stopper->works); - wake_up_process(stopper->thread); + wake_up_process(p); } else cpu_stop_signal_done(work->done, false); @@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; cpu_stop_init_done(&done, 1); - cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); + cpu_stop_queue_work(cpu, &work); wait_for_completion(&done.completion); return done.executed ? done.ret : -ENOENT; } @@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; - cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); + cpu_stop_queue_work(cpu, work_buf); } /* static data for stop_cpus */ @@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, */ preempt_disable(); for_each_cpu(cpu, cpumask) - cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), - &per_cpu(stop_cpus_work, cpu)); + cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); preempt_enable(); } @@ -304,12 +305,11 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, { unsigned int cpu = (unsigned long)hcpu; struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct task_struct *p; + struct task_struct *p = per_cpu(cpu_stopper_task, cpu); switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - BUG_ON(stopper->thread || stopper->enabled || - !list_empty(&stopper->works)); + BUG_ON(p || stopper->enabled || !list_empty(&stopper->works)); p = kthread_create_on_node(cpu_stopper_thread, stopper, cpu_to_node(cpu), @@ -319,12 +319,12 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, get_task_struct(p); kthread_bind(p, cpu); sched_set_stop_task(cpu, p); - stopper->thread = p; + per_cpu(cpu_stopper_task, cpu) = p; break; case CPU_ONLINE: /* strictly unnecessary, as first user will wake it */ - wake_up_process(stopper->thread); + wake_up_process(p); /* mark enabled */ spin_lock_irq(&stopper->lock); stopper->enabled = true; @@ -339,7 +339,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, sched_set_stop_task(cpu, NULL); /* kill the stopper */ - kthread_stop(stopper->thread); + kthread_stop(p); /* drain remaining works */ spin_lock_irq(&stopper->lock); list_for_each_entry(work, &stopper->works, list) @@ -347,8 +347,8 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, stopper->enabled = false; spin_unlock_irq(&stopper->lock); /* release the stopper */ - put_task_struct(stopper->thread); - stopper->thread = NULL; + put_task_struct(p); + per_cpu(cpu_stopper_task, cpu) = NULL; break; } #endif -- cgit v1.2.3 From 14e568e78f6f80ca1e27256641ddf524c7dbdc51 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Jan 2013 12:11:14 +0000 Subject: stop_machine: Use smpboot threads Use the smpboot thread infrastructure. Mark the stopper thread selfparking and park it after it has finished the take_cpu_down() work. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Rusty Russell Cc: Paul McKenney Cc: Srivatsa S. Bhat Cc: Arjan van de Veen Cc: Paul Turner Cc: Richard Weinberger Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130131120741.686315164@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 2 + kernel/stop_machine.c | 136 +++++++++++++++++++------------------------------- 2 files changed, 52 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 3046a503242c..c91e30d1ef05 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -254,6 +254,8 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); + /* Park the stopper thread */ + kthread_park(current); return 0; } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index aaac68c5c3be..95d178c62d5a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -18,7 +18,7 @@ #include #include #include - +#include #include /* @@ -245,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) return ret; } -static int cpu_stopper_thread(void *data) +static int cpu_stop_should_run(unsigned int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + unsigned long flags; + int run; + + spin_lock_irqsave(&stopper->lock, flags); + run = !list_empty(&stopper->works); + spin_unlock_irqrestore(&stopper->lock, flags); + return run; +} + +static void cpu_stopper_thread(unsigned int cpu) { - struct cpu_stopper *stopper = data; + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct cpu_stop_work *work; int ret; repeat: - set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - return 0; - } - work = NULL; spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { @@ -274,8 +279,6 @@ repeat: struct cpu_stop_done *done = work->done; char ksym_buf[KSYM_NAME_LEN] __maybe_unused; - __set_current_state(TASK_RUNNING); - /* cpu stop callbacks are not allowed to sleep */ preempt_disable(); @@ -291,87 +294,55 @@ repeat: ksym_buf), arg); cpu_stop_signal_done(done, true); - } else - schedule(); - - goto repeat; + goto repeat; + } } extern void sched_set_stop_task(int cpu, struct task_struct *stop); -/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ -static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static void cpu_stop_create(unsigned int cpu) +{ + sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); +} + +static void cpu_stop_park(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct task_struct *p = per_cpu(cpu_stopper_task, cpu); + struct cpu_stop_work *work; + unsigned long flags; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - BUG_ON(p || stopper->enabled || !list_empty(&stopper->works)); - p = kthread_create_on_node(cpu_stopper_thread, - stopper, - cpu_to_node(cpu), - "migration/%d", cpu); - if (IS_ERR(p)) - return notifier_from_errno(PTR_ERR(p)); - get_task_struct(p); - kthread_bind(p, cpu); - sched_set_stop_task(cpu, p); - per_cpu(cpu_stopper_task, cpu) = p; - break; - - case CPU_ONLINE: - /* strictly unnecessary, as first user will wake it */ - wake_up_process(p); - /* mark enabled */ - spin_lock_irq(&stopper->lock); - stopper->enabled = true; - spin_unlock_irq(&stopper->lock); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_POST_DEAD: - { - struct cpu_stop_work *work; - - sched_set_stop_task(cpu, NULL); - /* kill the stopper */ - kthread_stop(p); - /* drain remaining works */ - spin_lock_irq(&stopper->lock); - list_for_each_entry(work, &stopper->works, list) - cpu_stop_signal_done(work->done, false); - stopper->enabled = false; - spin_unlock_irq(&stopper->lock); - /* release the stopper */ - put_task_struct(p); - per_cpu(cpu_stopper_task, cpu) = NULL; - break; - } -#endif - } + /* drain remaining works */ + spin_lock_irqsave(&stopper->lock, flags); + list_for_each_entry(work, &stopper->works, list) + cpu_stop_signal_done(work->done, false); + stopper->enabled = false; + spin_unlock_irqrestore(&stopper->lock, flags); +} - return NOTIFY_OK; +static void cpu_stop_unpark(unsigned int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + + spin_lock_irq(&stopper->lock); + stopper->enabled = true; + spin_unlock_irq(&stopper->lock); } -/* - * Give it a higher priority so that cpu stopper is available to other - * cpu notifiers. It currently shares the same priority as sched - * migration_notifier. - */ -static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { - .notifier_call = cpu_stop_cpu_callback, - .priority = 10, +static struct smp_hotplug_thread cpu_stop_threads = { + .store = &cpu_stopper_task, + .thread_should_run = cpu_stop_should_run, + .thread_fn = cpu_stopper_thread, + .thread_comm = "migration/%u", + .create = cpu_stop_create, + .setup = cpu_stop_unpark, + .park = cpu_stop_park, + .unpark = cpu_stop_unpark, + .selfparking = true, }; static int __init cpu_stop_init(void) { - void *bcpu = (void *)(long)smp_processor_id(); unsigned int cpu; - int err; for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); @@ -380,15 +351,8 @@ static int __init cpu_stop_init(void) INIT_LIST_HEAD(&stopper->works); } - /* start one for the boot cpu */ - err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, - bcpu); - BUG_ON(err != NOTIFY_OK); - cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); - register_cpu_notifier(&cpu_stop_cpu_notifier); - + BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); stop_machine_initialized = true; - return 0; } early_initcall(cpu_stop_init); -- cgit v1.2.3 From 02e176af92f3e2e9ec3a48792036566af2dcd534 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Wed, 6 Feb 2013 23:29:20 +0200 Subject: perf/hwbp: Fix cleanup in case of kzalloc failure Obviously this is a typo and could result in memory leaks if kzalloc fails on a given cpu. Signed-off-by: Daniel Baluta Acked-by: Frederic Weisbecker Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1360186160-7566-1-git-send-email-dbaluta@ixiacom.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index fe8a916507ed..a64f8aeb5c1f 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) - kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); if (err_cpu == cpu) break; } -- cgit v1.2.3 From e6c42c295e071dd74a66b5a9fcf4f44049888ed8 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Fri, 15 Feb 2013 11:08:11 +0100 Subject: posix-cpu-timers: Fix nanosleep task_struct leak The trinity fuzzer triggered a task_struct reference leak via clock_nanosleep with CPU_TIMERs. do_cpu_nanosleep() calls posic_cpu_timer_create(), but misses a corresponding posix_cpu_timer_del() which leads to the task_struct reference leak. Reported-and-tested-by: Tommi Rantala Signed-off-by: Stanislaw Gruszka Cc: Dave Jones Cc: John Stultz Cc: Oleg Nesterov Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20130215100810.GF4392@redhat.com Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index a278cad1d5d6..942ca27a28b7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1401,8 +1401,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, while (!signal_pending(current)) { if (timer.it.cpu.expires.sched == 0) { /* - * Our timer fired and was reset. + * Our timer fired and was reset, below + * deletion can not fail. */ + posix_cpu_timer_del(&timer); spin_unlock_irq(&timer.it_lock); return 0; } @@ -1420,9 +1422,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * We were interrupted by a signal. */ sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); - posix_cpu_timer_set(&timer, 0, &zero_it, it); + error = posix_cpu_timer_set(&timer, 0, &zero_it, it); + if (!error) { + /* + * Timer is now unarmed, deletion can not fail. + */ + posix_cpu_timer_del(&timer); + } spin_unlock_irq(&timer.it_lock); + while (error == TIMER_RETRY) { + /* + * We need to handle case when timer was or is in the + * middle of firing. In other cases we already freed + * resources. + */ + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_del(&timer); + spin_unlock_irq(&timer.it_lock); + } + if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { /* * It actually did fire already. -- cgit v1.2.3 From 686855f5d833178e518d79e7912cdb3268a9fa69 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Feb 2013 18:19:58 +0400 Subject: sched: add wait_for_completion_io[_timeout] The only difference between wait_for_completion[_timeout]() and wait_for_completion_io[_timeout]() is that the latter calls io_schedule_timeout() instead of schedule_timeout() so that the caller is accounted as waiting for IO, not just sleeping. These functions can be used for correct iowait time accounting when the completion struct is actually used for waiting for IO (e.g. completion of a bio request in the block layer). Signed-off-by: Vladimir Davydov Acked-by: Ingo Molnar Signed-off-by: Jens Axboe --- kernel/sched/core.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb0..d6fdcdcbb9b1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3267,7 +3267,8 @@ void complete_all(struct completion *x) EXPORT_SYMBOL(complete_all); static inline long __sched -do_wait_for_common(struct completion *x, long timeout, int state) +do_wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) { if (!x->done) { DECLARE_WAITQUEUE(wait, current); @@ -3280,7 +3281,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) } __set_current_state(state); spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); + timeout = action(timeout); spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); @@ -3291,17 +3292,30 @@ do_wait_for_common(struct completion *x, long timeout, int state) return timeout ?: 1; } -static long __sched -wait_for_common(struct completion *x, long timeout, int state) +static inline long __sched +__wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) { might_sleep(); spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, timeout, state); + timeout = do_wait_for_common(x, action, timeout, state); spin_unlock_irq(&x->wait.lock); return timeout; } +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + /** * wait_for_completion: - waits for completion of a task * @x: holds the state of this particular completion @@ -3337,6 +3351,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_io: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ + wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. The caller is accounted as waiting for IO. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + /** * wait_for_completion_interruptible: - waits for completion of a task (w/intr) * @x: holds the state of this particular completion -- cgit v1.2.3 From bf14e3b979a01cd7298d631736f965fe83c6e2bc Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 18 Jan 2013 15:12:24 +0530 Subject: sysctl: Enable PARISC "unaligned-trap" to be used cross-arch PARISC defines /proc/sys/kernel/unaligned-trap to runtime toggle unaligned access emulation. The exact mechanics of enablig/disabling are still arch specific, we can make the sysctl usable by other arches. Signed-off-by: Vineet Gupta Acked-by: Helge Deller Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: "Eric W. Biederman" Cc: Serge Hallyn --- kernel/sysctl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491e..878b4c41cfb7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -157,6 +157,9 @@ extern int sysctl_tsb_ratio; #ifdef __hppa__ extern int pwrsw_enabled; +#endif + +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW extern int unaligned_enabled; #endif @@ -545,6 +548,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW { .procname = "unaligned-trap", .data = &unaligned_enabled, -- cgit v1.2.3 From 70730bca1331fc50c3caacaea00439de1325bd6e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 14 Feb 2013 15:13:55 -0800 Subject: kernel: Replace timeconst.pl with a bc script bc is the standard tool for multi-precision arithmetic. We switched to Perl because akpm reported a hard-to-reproduce build hang, which was very odd because affected and unaffected machines were all running the same version of GNU bc. Unfortunately switching to Perl required a really ugly "canning" mechanism to support Perl < 5.8 installations lacking the Math::BigInt module. It was recently pointed out to me that some very old versions of GNU make had problems with pipes in subshells, which was indeed the construct used in the Makefile rules in that version of the patch; Perl didn't need it so switching to Perl fixed the problem for unrelated reasons. With the problem (hopefully) root-caused, we can switch back to bc and do the arbitrary-precision arithmetic naturally. Signed-off-by: H. Peter Anvin Cc: Andrew Morton Acked-by: Sam Ravnborg Signed-off-by: Michal Marek --- kernel/Makefile | 16 ++- kernel/timeconst.bc | 108 +++++++++++++++ kernel/timeconst.pl | 378 ---------------------------------------------------- 3 files changed, 120 insertions(+), 382 deletions(-) create mode 100644 kernel/timeconst.bc delete mode 100644 kernel/timeconst.pl (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 6c072b6da239..ab1e0386bb2d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -127,11 +127,19 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(obj)/time.o: $(obj)/timeconst.h -quiet_cmd_timeconst = TIMEC $@ - cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@ +quiet_cmd_hzfile = HZFILE $@ + cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ + +targets += hz.bc +$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE + $(call if_changed,hzfile) + +quiet_cmd_bc = BC $@ + cmd_bc = bc -q $(filter-out FORCE,$^) > $@ + targets += timeconst.h -$(obj)/timeconst.h: $(src)/timeconst.pl FORCE - $(call if_changed,timeconst) +$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE + $(call if_changed,bc) ifeq ($(CONFIG_MODULE_SIG),y) # diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc new file mode 100644 index 000000000000..511bdf2cafda --- /dev/null +++ b/kernel/timeconst.bc @@ -0,0 +1,108 @@ +scale=0 + +define gcd(a,b) { + auto t; + while (b) { + t = b; + b = a % b; + a = t; + } + return a; +} + +/* Division by reciprocal multiplication. */ +define fmul(b,n,d) { + return (2^b*n+d-1)/d; +} + +/* Adjustment factor when a ceiling value is used. Use as: + (imul * n) + (fmulxx * n + fadjxx) >> xx) */ +define fadj(b,n,d) { + auto v; + d = d/gcd(n,d); + v = 2^b*(d-1)/d; + return v; +} + +/* Compute the appropriate mul/adj values as well as a shift count, + which brings the mul value into the range 2^b-1 <= x < 2^b. Such + a shift value will be correct in the signed integer range and off + by at most one in the upper half of the unsigned range. */ +define fmuls(b,n,d) { + auto s, m; + for (s = 0; 1; s++) { + m = fmul(s,n,d); + if (m >= 2^(b-1)) + return s; + } + return 0; +} + +define timeconst(hz) { + print "/* Automatically generated by kernel/timeconst.bc */\n" + print "/* Time conversion constants for HZ == ", hz, " */\n" + print "\n" + + print "#ifndef KERNEL_TIMECONST_H\n" + print "#define KERNEL_TIMECONST_H\n\n" + + print "#include \n" + print "#include \n\n" + + print "#if HZ != ", hz, "\n" + print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" + print "#endif\n\n" + + if (hz < 2) { + print "#error Totally bogus HZ value!\n" + } else { + s=fmuls(32,1000,hz) + obase=16 + print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n" + print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n" + obase=10 + print "#define HZ_TO_MSEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000) + obase=16 + print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n" + print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n" + obase=10 + print "#define MSEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000) + print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n" + print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n" + print "\n" + + s=fmuls(32,1000000,hz) + obase=16 + print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n" + print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n" + obase=10 + print "#define HZ_TO_USEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000000) + obase=16 + print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n" + print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n" + obase=10 + print "#define USEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000000) + print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n" + print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" + print "\n" + + print "#endif /* KERNEL_TIMECONST_H */\n" + } + halt +} + +timeconst(hz) diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl deleted file mode 100644 index eb51d76e058a..000000000000 --- a/kernel/timeconst.pl +++ /dev/null @@ -1,378 +0,0 @@ -#!/usr/bin/perl -# ----------------------------------------------------------------------- -# -# Copyright 2007-2008 rPath, Inc. - All Rights Reserved -# -# This file is part of the Linux kernel, and is made available under -# the terms of the GNU General Public License version 2 or (at your -# option) any later version; incorporated herein by reference. -# -# ----------------------------------------------------------------------- -# - -# -# Usage: timeconst.pl HZ > timeconst.h -# - -# Precomputed values for systems without Math::BigInt -# Generated by: -# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 -%canned_values = ( - 24 => [ - '0xa6aaaaab','0x2aaaaaa',26, - 125,3, - '0xc49ba5e4','0x1fbe76c8b4',37, - 3,125, - '0xa2c2aaab','0xaaaa',16, - 125000,3, - '0xc9539b89','0x7fffbce4217d',47, - 3,125000, - ], 32 => [ - '0xfa000000','0x6000000',27, - 125,4, - '0x83126e98','0xfdf3b645a',36, - 4,125, - '0xf4240000','0x0',17, - 31250,1, - '0x8637bd06','0x3fff79c842fa',46, - 1,31250, - ], 48 => [ - '0xa6aaaaab','0x6aaaaaa',27, - 125,6, - '0xc49ba5e4','0xfdf3b645a',36, - 6,125, - '0xa2c2aaab','0x15555',17, - 62500,3, - '0xc9539b89','0x3fffbce4217d',46, - 3,62500, - ], 64 => [ - '0xfa000000','0xe000000',28, - 125,8, - '0x83126e98','0x7ef9db22d',35, - 8,125, - '0xf4240000','0x0',18, - 15625,1, - '0x8637bd06','0x1fff79c842fa',45, - 1,15625, - ], 100 => [ - '0xa0000000','0x0',28, - 10,1, - '0xcccccccd','0x733333333',35, - 1,10, - '0x9c400000','0x0',18, - 10000,1, - '0xd1b71759','0x1fff2e48e8a7',45, - 1,10000, - ], 122 => [ - '0x8325c53f','0xfbcda3a',28, - 500,61, - '0xf9db22d1','0x7fbe76c8b',35, - 61,500, - '0x8012e2a0','0x3ef36',18, - 500000,61, - '0xffda4053','0x1ffffbce4217',45, - 61,500000, - ], 128 => [ - '0xfa000000','0x1e000000',29, - 125,16, - '0x83126e98','0x3f7ced916',34, - 16,125, - '0xf4240000','0x40000',19, - 15625,2, - '0x8637bd06','0xfffbce4217d',44, - 2,15625, - ], 200 => [ - '0xa0000000','0x0',29, - 5,1, - '0xcccccccd','0x333333333',34, - 1,5, - '0x9c400000','0x0',19, - 5000,1, - '0xd1b71759','0xfff2e48e8a7',44, - 1,5000, - ], 250 => [ - '0x80000000','0x0',29, - 4,1, - '0x80000000','0x180000000',33, - 1,4, - '0xfa000000','0x0',20, - 4000,1, - '0x83126e98','0x7ff7ced9168',43, - 1,4000, - ], 256 => [ - '0xfa000000','0x3e000000',30, - 125,32, - '0x83126e98','0x1fbe76c8b',33, - 32,125, - '0xf4240000','0xc0000',20, - 15625,4, - '0x8637bd06','0x7ffde7210be',43, - 4,15625, - ], 300 => [ - '0xd5555556','0x2aaaaaaa',30, - 10,3, - '0x9999999a','0x1cccccccc',33, - 3,10, - '0xd0555556','0xaaaaa',20, - 10000,3, - '0x9d495183','0x7ffcb923a29',43, - 3,10000, - ], 512 => [ - '0xfa000000','0x7e000000',31, - 125,64, - '0x83126e98','0xfdf3b645',32, - 64,125, - '0xf4240000','0x1c0000',21, - 15625,8, - '0x8637bd06','0x3ffef39085f',42, - 8,15625, - ], 1000 => [ - '0x80000000','0x0',31, - 1,1, - '0x80000000','0x0',31, - 1,1, - '0xfa000000','0x0',22, - 1000,1, - '0x83126e98','0x1ff7ced9168',41, - 1,1000, - ], 1024 => [ - '0xfa000000','0xfe000000',32, - 125,128, - '0x83126e98','0x7ef9db22',31, - 128,125, - '0xf4240000','0x3c0000',22, - 15625,16, - '0x8637bd06','0x1fff79c842f',41, - 16,15625, - ], 1200 => [ - '0xd5555556','0xd5555555',32, - 5,6, - '0x9999999a','0x66666666',31, - 6,5, - '0xd0555556','0x2aaaaa',22, - 2500,3, - '0x9d495183','0x1ffcb923a29',41, - 3,2500, - ] -); - -$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; - -sub bint($) -{ - my($x) = @_; - return Math::BigInt->new($x); -} - -# -# Constants for division by reciprocal multiplication. -# (bits, numerator, denominator) -# -sub fmul($$$) -{ - my ($b,$n,$d) = @_; - - $n = bint($n); - $d = bint($d); - - return scalar (($n << $b)+$d-bint(1))/$d; -} - -sub fadj($$$) -{ - my($b,$n,$d) = @_; - - $n = bint($n); - $d = bint($d); - - $d = $d/bgcd($n, $d); - return scalar (($d-bint(1)) << $b)/$d; -} - -sub fmuls($$$) { - my($b,$n,$d) = @_; - my($s,$m); - my($thres) = bint(1) << ($b-1); - - $n = bint($n); - $d = bint($d); - - for ($s = 0; 1; $s++) { - $m = fmul($s,$n,$d); - return $s if ($m >= $thres); - } - return 0; -} - -# Generate a hex value if the result fits in 64 bits; -# otherwise skip. -sub bignum_hex($) { - my($x) = @_; - my $s = $x->as_hex(); - - return (length($s) > 18) ? undef : $s; -} - -# Provides mul, adj, and shr factors for a specific -# (bit, time, hz) combination -sub muladj($$$) { - my($b, $t, $hz) = @_; - my $s = fmuls($b, $t, $hz); - my $m = fmul($s, $t, $hz); - my $a = fadj($s, $t, $hz); - return (bignum_hex($m), bignum_hex($a), $s); -} - -# Provides numerator, denominator values -sub numden($$) { - my($n, $d) = @_; - my $g = bgcd($n, $d); - return ($n/$g, $d/$g); -} - -# All values for a specific (time, hz) combo -sub conversions($$) { - my ($t, $hz) = @_; - my @val = (); - - # HZ_TO_xx - push(@val, muladj(32, $t, $hz)); - push(@val, numden($t, $hz)); - - # xx_TO_HZ - push(@val, muladj(32, $hz, $t)); - push(@val, numden($hz, $t)); - - return @val; -} - -sub compute_values($) { - my($hz) = @_; - my @val = (); - my $s, $m, $a, $g; - - if (!$has_bigint) { - die "$0: HZ == $hz not canned and ". - "Math::BigInt not available\n"; - } - - # MSEC conversions - push(@val, conversions(1000, $hz)); - - # USEC conversions - push(@val, conversions(1000000, $hz)); - - return @val; -} - -sub outputval($$) -{ - my($name, $val) = @_; - my $csuf; - - if (defined($val)) { - if ($name !~ /SHR/) { - $val = "U64_C($val)"; - } - printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; - } -} - -sub output($@) -{ - my($hz, @val) = @_; - my $pfx, $bit, $suf, $s, $m, $a; - - print "/* Automatically generated by kernel/timeconst.pl */\n"; - print "/* Conversion constants for HZ == $hz */\n"; - print "\n"; - print "#ifndef KERNEL_TIMECONST_H\n"; - print "#define KERNEL_TIMECONST_H\n"; - print "\n"; - - print "#include \n"; - print "#include \n"; - - print "\n"; - print "#if HZ != $hz\n"; - print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; - print "#endif\n"; - print "\n"; - - foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', - 'HZ_TO_USEC','USEC_TO_HZ') { - foreach $bit (32) { - foreach $suf ('MUL', 'ADJ', 'SHR') { - outputval("${pfx}_$suf$bit", shift(@val)); - } - } - foreach $suf ('NUM', 'DEN') { - outputval("${pfx}_$suf", shift(@val)); - } - } - - print "\n"; - print "#endif /* KERNEL_TIMECONST_H */\n"; -} - -# Pretty-print Perl values -sub perlvals(@) { - my $v; - my @l = (); - - foreach $v (@_) { - if (!defined($v)) { - push(@l, 'undef'); - } elsif ($v =~ /^0x/) { - push(@l, "\'".$v."\'"); - } else { - push(@l, $v.''); - } - } - return join(',', @l); -} - -($hz) = @ARGV; - -# Use this to generate the %canned_values structure -if ($hz eq '--can') { - shift(@ARGV); - @hzlist = sort {$a <=> $b} (@ARGV); - - print "# Precomputed values for systems without Math::BigInt\n"; - print "# Generated by:\n"; - print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; - print "\%canned_values = (\n"; - my $pf = "\t"; - foreach $hz (@hzlist) { - my @values = compute_values($hz); - print "$pf$hz => [\n"; - while (scalar(@values)) { - my $bit; - foreach $bit (32) { - my $m = shift(@values); - my $a = shift(@values); - my $s = shift(@values); - print "\t\t", perlvals($m,$a,$s), ",\n"; - } - my $n = shift(@values); - my $d = shift(@values); - print "\t\t", perlvals($n,$d), ",\n"; - } - print "\t]"; - $pf = ', '; - } - print "\n);\n"; -} else { - $hz += 0; # Force to number - if ($hz < 1) { - die "Usage: $0 HZ\n"; - } - - @val = @{$canned_values{$hz}}; - if (!defined(@val)) { - @val = compute_values($hz); - } - output($hz, @val); -} -exit 0; -- cgit v1.2.3 From 71b5707e119653039e6e95213f00479668c79b75 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 24 Jan 2013 14:43:28 +0800 Subject: cgroup: fix exit() vs rmdir() race In cgroup_exit() put_css_set_taskexit() is called without any lock, which might lead to accessing a freed cgroup: thread1 thread2 --------------------------------------------- exit() cgroup_exit() put_css_set_taskexit() atomic_dec(cgrp->count); rmdir(); /* not safe !! */ check_for_release(cgrp); rcu_read_lock() can be used to make sure the cgroup is alive. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 800852282c21..5d4c92ead691 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -422,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit) struct cgroup *cgrp = link->cgrp; list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); + + /* + * We may not be holding cgroup_mutex, and if cgrp->count is + * dropped to 0 the cgroup can be destroyed at any time, hence + * rcu_read_lock is used to keep it alive. + */ + rcu_read_lock(); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + rcu_read_unlock(); kfree(link); } -- cgit v1.2.3 From 63f43f55c9bbc14f76b582644019b8a07dc8219a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 25 Jan 2013 16:08:01 +0800 Subject: cpuset: fix cpuset_print_task_mems_allowed() vs rename() race rename() will change dentry->d_name. The result of this race can be worse than seeing partially rewritten name, but we might access a stale pointer because rename() will re-allocate memory to hold a longer name. It's safe in the protection of dentry->d_lock. v2: check NULL dentry before acquiring dentry lock. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cpuset.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7bb63eea6eb8..5bb9bf18438c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2511,8 +2511,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) dentry = task_cs(tsk)->css.cgroup->dentry; spin_lock(&cpuset_buffer_lock); - snprintf(cpuset_name, CPUSET_NAME_LEN, - dentry ? (const char *)dentry->d_name.name : "/"); + + if (!dentry) { + strcpy(cpuset_name, "/"); + } else { + spin_lock(&dentry->d_lock); + strlcpy(cpuset_name, (const char *)dentry->d_name.name, + CPUSET_NAME_LEN); + spin_unlock(&dentry->d_lock); + } + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", -- cgit v1.2.3 From 810cbee4fad570ff167132d4ecf247d99c48f71d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 18 Feb 2013 18:56:14 +0800 Subject: cgroup: fix cgroup_rmdir() vs close(eventfd) race commit 205a872bd6f9a9a09ef035ef1e90185a8245cc58 ("cgroup: fix lockdep warning for event_control") solved a deadlock by introducing a new bug. Move cgrp->event_list to a temporary list doesn't mean you can traverse this list locklessly, because at the same time cgroup_event_wake() can be called and remove the event from the list. The result of this race is disastrous. We adopt the way how kvm irqfd code implements race-free event removal, which is now described in the comments in cgroup_event_wake(). v3: - call eventfd_signal() no matter it's eventfd close or cgroup removal that removes the cgroup event. Acked-by: Kirill A. Shutemov Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5d4c92ead691..feda81487be4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3786,8 +3786,13 @@ static void cgroup_event_remove(struct work_struct *work) remove); struct cgroup *cgrp = event->cgrp; + remove_wait_queue(event->wqh, &event->wait); + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd, 1); + eventfd_ctx_put(event->eventfd); kfree(event); dput(cgrp->dentry); @@ -3807,15 +3812,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { - __remove_wait_queue(event->wqh, &event->wait); - spin_lock(&cgrp->event_list_lock); - list_del_init(&event->list); - spin_unlock(&cgrp->event_list_lock); /* - * We are in atomic context, but cgroup_event_remove() may - * sleep, so we have to call it in workqueue. + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. */ - schedule_work(&event->remove); + spin_lock(&cgrp->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); } return 0; @@ -4375,20 +4390,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. Use - * a temporary list to avoid a deadlock with cgroup_event_wake(). Since - * cgroup_event_wake() is called with the wait queue head locked, - * remove_wait_queue() cannot be called while holding event_list_lock. + * directory to avoid race between userspace and kernelspace. */ spin_lock(&cgrp->event_list_lock); - list_splice_init(&cgrp->event_list, &tmp_list); - spin_unlock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &tmp_list, list) { + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { list_del_init(&event->list); - remove_wait_queue(event->wqh, &event->wait); - eventfd_signal(event->eventfd, 1); schedule_work(&event->remove); } + spin_unlock(&cgrp->event_list_lock); return 0; } -- cgit v1.2.3 From f169007b2773f285e098cb84c74aac0154d65ff7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 18 Feb 2013 14:13:35 +0800 Subject: cgroup: fail if monitored file and event_control are in different cgroup If we pass fd of memory.usage_in_bytes of cgroup A to cgroup.event_control of cgroup B, then we won't get memory usage notification from A but B! What's worse, if A and B are in different mount hierarchy, we'll end up accessing NULL pointer! Disallow this kind of invalid usage. Signed-off-by: Li Zefan Acked-by: Kirill A. Shutemov Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index feda81487be4..b5c64327e712 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3856,6 +3856,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { struct cgroup_event *event = NULL; + struct cgroup *cgrp_cfile; unsigned int efd, cfd; struct file *efile = NULL; struct file *cfile = NULL; @@ -3911,6 +3912,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, goto fail; } + /* + * The file to be monitored must be in the same cgroup as + * cgroup.event_control is. + */ + cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); + if (cgrp_cfile != cgrp) { + ret = -EINVAL; + goto fail; + } + if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; goto fail; -- cgit v1.2.3 From 36a5df85e9a3c218b73f6cf80098016ca3f0410d Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 1 Feb 2013 15:04:26 -0500 Subject: genirq: Export enable/disable_percpu_irq() These functions are used by the tilegx onchip network driver, and it's useful to be able to load that driver as a module. Signed-off-by: Chris Metcalf Link: http://lkml.kernel.org/r/201302012043.r11KhNZF024371@farm-0021.internal.tilera.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e49a288fa479..88e7bed62711 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1524,6 +1524,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) out: irq_put_desc_unlock(desc, flags); } +EXPORT_SYMBOL_GPL(enable_percpu_irq); void disable_percpu_irq(unsigned int irq) { @@ -1537,6 +1538,7 @@ void disable_percpu_irq(unsigned int irq) irq_percpu_disable(desc, cpu); irq_put_desc_unlock(desc, flags); } +EXPORT_SYMBOL_GPL(disable_percpu_irq); /* * Internal function to unregister a percpu irqaction. -- cgit v1.2.3 From 8c189ea64eea01ca20d102ddb74d6936dd16c579 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 Feb 2013 15:18:38 -0500 Subject: ftrace: Call ftrace cleanup module notifier after all other notifiers Commit: c1bf08ac "ftrace: Be first to run code modification on modules" changed ftrace module notifier's priority to INT_MAX in order to process the ftrace nops before anything else could touch them (namely kprobes). This was the correct thing to do. Unfortunately, the ftrace module notifier also contains the ftrace clean up code. As opposed to the set up code, this code should be run *after* all the module notifiers have run in case a module is doing correct clean-up and unregisters its ftrace hooks. Basically, ftrace needs to do clean up on module removal, as it needs to know about code being removed so that it doesn't try to modify that code. But after it removes the module from its records, if a ftrace user tries to remove a probe, that removal will fail due as the record of that code segment no longer exists. Nothing really bad happens if the probe removal is called after ftrace did the clean up, but the ftrace removal function will return an error. Correct code (such as kprobes) will produce a WARN_ON() if it fails to remove the probe. As people get annoyed by frivolous warnings, it's best to do the ftrace clean up after everything else. By splitting the ftrace_module_notifier into two notifiers, one that does the module load setup that is run at high priority, and the other that is called for module clean up that is run at low priority, the problem is solved. Cc: stable@vger.kernel.org Reported-by: Frank Ch. Eigler Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ce8c3d68292f..98ca94a41819 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3996,37 +3996,51 @@ static void ftrace_init_module(struct module *mod, ftrace_process_locs(mod, start, end); } -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify_enter(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; - switch (val) { - case MODULE_STATE_COMING: + if (val == MODULE_STATE_COMING) ftrace_init_module(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); - break; - case MODULE_STATE_GOING: + return 0; +} + +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + if (val == MODULE_STATE_GOING) ftrace_release_mod(mod); - break; - } return 0; } #else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify_enter(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) { return 0; } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, +struct notifier_block ftrace_module_enter_nb = { + .notifier_call = ftrace_module_notify_enter, .priority = INT_MAX, /* Run before anything that can use kprobes */ }; +struct notifier_block ftrace_module_exit_nb = { + .notifier_call = ftrace_module_notify_exit, + .priority = INT_MIN, /* Run after anything that can remove kprobes */ +}; + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; @@ -4058,9 +4072,13 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); + ret = register_module_notifier(&ftrace_module_enter_nb); + if (ret) + pr_warning("Failed to register trace ftrace module enter notifier\n"); + + ret = register_module_notifier(&ftrace_module_exit_nb); if (ret) - pr_warning("Failed to register trace ftrace module notifier\n"); + pr_warning("Failed to register trace ftrace module exit notifier\n"); set_ftrace_early_filters(); -- cgit v1.2.3 From cdc4e86b58a95005ef500916b4a8e91a0037a822 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Feb 2013 23:47:07 +0100 Subject: cputime: Remove irqsave from seqlock readers The reader side code has no requirement to disable interrupts while sampling data. The sequence counter is enough to ensure consistency. Signed-off-by: Thomas Gleixner Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ccff2752725a..9857329ed280 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -729,18 +729,17 @@ void vtime_init_idle(struct task_struct *t) cputime_t task_gtime(struct task_struct *t) { - unsigned long flags; unsigned int seq; cputime_t gtime; do { - seq = read_seqbegin_irqsave(&t->vtime_seqlock, flags); + seq = read_seqbegin(&t->vtime_seqlock); gtime = t->gtime; if (t->flags & PF_VCPU) gtime += vtime_delta(t); - } while (read_seqretry_irqrestore(&t->vtime_seqlock, seq, flags)); + } while (read_seqretry(&t->vtime_seqlock, seq)); return gtime; } @@ -756,7 +755,6 @@ fetch_task_cputime(struct task_struct *t, cputime_t *u_src, cputime_t *s_src, cputime_t *udelta, cputime_t *sdelta) { - unsigned long flags; unsigned int seq; unsigned long long delta; @@ -764,7 +762,7 @@ fetch_task_cputime(struct task_struct *t, *udelta = 0; *sdelta = 0; - seq = read_seqbegin_irqsave(&t->vtime_seqlock, flags); + seq = read_seqbegin(&t->vtime_seqlock); if (u_dst) *u_dst = *u_src; @@ -788,7 +786,7 @@ fetch_task_cputime(struct task_struct *t, if (t->vtime_snap_whence == VTIME_SYS) *sdelta = delta; } - } while (read_seqretry_irqrestore(&t->vtime_seqlock, seq, flags)); + } while (read_seqretry(&t->vtime_seqlock, seq)); } -- cgit v1.2.3 From f86f75548233a2be7379ac446e91729710d4a5f7 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 8 Jan 2013 18:35:58 +0530 Subject: lockdep: Rename print_unlock_inbalance_bug() to print_unlock_imbalance_bug() Fix the typo in the function name (s/inbalance/imbalance) Signed-off-by: Srivatsa S. Bhat Cc: rostedt@goodmis.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/20130108130547.32733.79507.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7981e5b2350d..5cf12e791603 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3203,7 +3203,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } static int -print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, +print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, unsigned long ip) { if (!debug_locks_off()) @@ -3246,7 +3246,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 0; if (curr->lockdep_depth <= 0) - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); return 1; } @@ -3317,7 +3317,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, goto found_it; prev_hlock = hlock; } - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); found_it: lockdep_init_map(lock, name, key, 0); @@ -3384,7 +3384,7 @@ lock_release_non_nested(struct task_struct *curr, goto found_it; prev_hlock = hlock; } - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); found_it: if (hlock->instance == lock) -- cgit v1.2.3 From c06b4f1947213cd0902610fafcabac02d49ad728 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 27 Dec 2012 11:49:44 +0900 Subject: watchdog: Use local_clock for get_timestamp() The get_timestamp() function is always called with current cpu, thus using local_clock() would be more appropriate and it makes the code shorter and cleaner IMHO. Signed-off-by: Namhyung Kim Acked-by: Don Zickus Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1356576585-28782-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 75a2ab3d0b02..082ca6878a3f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -112,9 +112,9 @@ static int get_softlockup_thresh(void) * resolution, and we don't need to waste time with a big divide when * 2^30ns == 1.074s. */ -static unsigned long get_timestamp(int this_cpu) +static unsigned long get_timestamp(void) { - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ + return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ } static void set_sample_period(void) @@ -132,9 +132,7 @@ static void set_sample_period(void) /* Commands for resetting the watchdog */ static void __touch_watchdog(void) { - int this_cpu = smp_processor_id(); - - __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); + __this_cpu_write(watchdog_touch_ts, get_timestamp()); } void touch_softlockup_watchdog(void) @@ -195,7 +193,7 @@ static int is_hardlockup(void) static int is_softlockup(unsigned long touch_ts) { - unsigned long now = get_timestamp(smp_processor_id()); + unsigned long now = get_timestamp(); /* Warn about unreasonable delays: */ if (time_after(now, touch_ts + get_softlockup_thresh())) -- cgit v1.2.3 From c0540606837af79b2ae101e5e7b2206e3844d150 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Wed, 6 Feb 2013 10:56:19 -0800 Subject: lockdep: Print more info when MAX_LOCK_DEPTH is exceeded This helps debug cases where a lock is acquired over and over without being released. Suggested-by: Steven Rostedt Signed-off-by: Ben Greear Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1360176979-4421-1-git-send-email-greearb@candelatech.com [ Changed the printout ordering. ] Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5cf12e791603..8a0efac4f99d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); - printk("BUG: MAX_LOCK_DEPTH too low!\n"); + printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", + curr->lockdep_depth, MAX_LOCK_DEPTH); printk("turning off the locking correctness validator.\n"); + + lockdep_print_held_locks(current); + debug_show_all_locks(); dump_stack(); + return 0; } -- cgit v1.2.3 From a6c0c943a15d0b3d6ac33760cb8f95c75f395895 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Apr 2012 11:14:55 +0200 Subject: ntp: Make ntp_lock raw seconds_overflow() is called from hard interrupt context even on Preempt-RT. This requires the lock to be a raw_spinlock. Signed-off-by: Thomas Gleixner Cc: John Stultz Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 24174b4d669b..bb1edfaafe3d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,7 +22,7 @@ * NTP timekeeping variables: */ -DEFINE_SPINLOCK(ntp_lock); +DEFINE_RAW_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ @@ -347,7 +347,7 @@ void ntp_clear(void) { unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; @@ -361,7 +361,7 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } @@ -371,9 +371,9 @@ u64 ntp_tick_length(void) unsigned long flags; s64 ret; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); ret = tick_length; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return ret; } @@ -394,7 +394,7 @@ int second_overflow(unsigned long secs) int leap = 0; unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. If in leap-insert state at the end of the @@ -478,7 +478,7 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return leap; } @@ -660,7 +660,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - spin_lock_irq(&ntp_lock); + raw_spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -702,7 +702,7 @@ int do_adjtimex(struct timex *txc) /* fill PPS status fields */ pps_fill_timex(txc); - spin_unlock_irq(&ntp_lock); + raw_spin_unlock_irq(&ntp_lock); txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; @@ -900,7 +900,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) pts_norm = pps_normalize_ts(*phase_ts); - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -913,7 +913,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -928,7 +928,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -945,7 +945,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } EXPORT_SYMBOL(hardpps); -- cgit v1.2.3 From fe2b05f7ca9f906be61dced5489f63b8b4d7c770 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2013 09:52:08 +0100 Subject: futex: Revert "futex: Mark get_robust_list as deprecated" This reverts commit ec0c4274e33c0373e476b73e01995c53128f1257. get_robust_list() is in use and a removal would break existing user space. With the permission checks in place it's not longer a security hole. Remove the deprecation warnings. Signed-off-by: Thomas Gleixner Cc: Cyrill Gorcunov Cc: Richard Weinberger Cc: akpm@linux-foundation.org Cc: paul.gortmaker@windriver.com Cc: davej@redhat.com Cc: keescook@chromium.org Cc: stable@vger.kernel.org Cc: ebiederm@xmission.com --- kernel/futex.c | 2 -- kernel/futex_compat.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 19eb089ca003..887943044d46 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2471,8 +2471,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, if (!futex_cmpxchg_enabled) return -ENOSYS; - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); - rcu_read_lock(); ret = -ESRCH; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83e368b005fc..a9642d528630 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -142,8 +142,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!futex_cmpxchg_enabled) return -ENOSYS; - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); - rcu_read_lock(); ret = -ESRCH; -- cgit v1.2.3 From 1438ade5670b56d5386c220e1ad4b5a824a1e585 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 24 Jan 2013 16:36:31 +0400 Subject: workqueue: un-GPL function delayed_work_timer_fn() commit d8e794dfd51c368ed3f686b7f4172830b60ae47b ("workqueue: set delayed_work->timer function on initialization") exports function delayed_work_timer_fn() only for GPL modules. This makes delayed-works unusable for non-GPL modules, because initialization macro now requires GPL symbol. For example schedule_delayed_work() available for non-GPL. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # 3.7 --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0f1a264d0f22..f4feacad3812 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1310,7 +1310,7 @@ void delayed_work_timer_fn(unsigned long __data) /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); } -EXPORT_SYMBOL_GPL(delayed_work_timer_fn); +EXPORT_SYMBOL(delayed_work_timer_fn); static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) -- cgit v1.2.3 From 1c3e826482ab698e418c7a894440e62c76aac893 Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Wed, 20 Feb 2013 17:14:38 +0800 Subject: sched/core: Remove the obsolete and unused nr_uninterruptible() function Signed-off-by: Sha Zhengju Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1361351678-8065-1-git-send-email-handai.szj@taobao.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 03d7784b7bd2..b7b03cd2d4cd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1969,11 +1969,10 @@ context_switch(struct rq *rq, struct task_struct *prev, } /* - * nr_running, nr_uninterruptible and nr_context_switches: + * nr_running and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. + * threads, total number of context switches performed since bootup. */ unsigned long nr_running(void) { @@ -1985,23 +1984,6 @@ unsigned long nr_running(void) return sum; } -unsigned long nr_uninterruptible(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_uninterruptible; - - /* - * Since we read the counters lockless, it might be slightly - * inaccurate. Do not allow it to go below zero though: - */ - if (unlikely((long)sum < 0)) - sum = 0; - - return sum; -} - unsigned long long nr_context_switches(void) { int i; -- cgit v1.2.3 From e182bb38d7db7494fa5dcd82da17fe0dedf60ecf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 20 Feb 2013 15:24:12 -0800 Subject: posix-timer: Don't call idr_find() with out-of-range ID When idr_find() was fed a negative ID, it used to look up the ID ignoring the sign bit before recent ("idr: remove MAX_IDR_MASK and move left MAX_IDR_* into idr.c") patch. Now a negative ID triggers a WARN_ON_ONCE(). __lock_timer() feeds timer_id from userland directly to idr_find() without sanitizing it which can trigger the above malfunctions. Add a range check on @timer_id before invoking idr_find() in __lock_timer(). While timer_t is defined as int by all archs at the moment, Andrew worries that it may be defined as a larger type later on. Make the test cover larger integers too so that it at least is guaranteed to not return the wrong timer. Note that WARN_ON_ONCE() in idr_find() on id < 0 is transitional precaution while moving away from ignoring MSB. Once it's gone we can remove the guard as long as timer_t isn't larger than int. Signed-off-by: Tejun Heo nnn Reported-by: Sasha Levin Cc: Andrew Morton Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20130220232412.GL3570@htj.dyndns.org Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 10349d5f2ec3..7edfe4b901e7 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -639,6 +639,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; + /* + * timer_t could be any type >= int and we want to make sure any + * @timer_id outside positive int range fails lookup. + */ + if ((unsigned long long)timer_id > INT_MAX) + return NULL; + rcu_read_lock(); timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { -- cgit v1.2.3 From bffea77c08c361d174af7ad94887f6aecc3f340b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 21 Feb 2013 16:41:57 -0800 Subject: compat: return -EFAULT on error in waitid() The copy_to_user() call returns the number of bytes remaining but we want to return -EFAULT on error. Fixes "x32: fix waitid()" Signed-off-by: Dan Carpenter Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 36700e9e2be9..f4bddb900186 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -593,7 +593,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, else ret = put_compat_rusage(&ru, uru); if (ret) - return ret; + return -EFAULT; } BUG_ON(info.si_code & __SI_MASK); -- cgit v1.2.3 From af3b56289be1f65d5c9f28bb1775e01056a5a2de Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 21 Feb 2013 16:42:40 -0800 Subject: time: don't inline EXPORT_SYMBOL functions How is the compiler even handling exported functions that are marked inline? Anyway, these shouldn't be inline because of that, so remove that marking. Based on a larger patch by Mark Charlebois to get LLVM to build the kernel. Cc: Thomas Gleixner Cc: Mark Charlebois Cc: Paul Gortmaker Cc: hank Cc: John Stultz Signed-off-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index c2a27dd93142..f8342a41efa6 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -240,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time); * Avoid unnecessary multiplications/divisions in the * two most common HZ cases: */ -inline unsigned int jiffies_to_msecs(const unsigned long j) +unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; @@ -256,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_msecs); -inline unsigned int jiffies_to_usecs(const unsigned long j) +unsigned int jiffies_to_usecs(const unsigned long j) { #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; -- cgit v1.2.3 From 9a46ad6d6df3b547d057c39db13f69d7170a99e9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Feb 2013 16:43:03 -0800 Subject: smp: make smp_call_function_many() use logic similar to smp_call_function_single() I'm testing swapout workload in a two-socket Xeon machine. The workload has 10 threads, each thread sequentially accesses separate memory region. TLB flush overhead is very big in the workload. For each page, page reclaim need move it from active lru list and then unmap it. Both need a TLB flush. And this is a multthread workload, TLB flush happens in 10 CPUs. In X86, TLB flush uses generic smp_call)function. So this workload stress smp_call_function_many heavily. Without patch, perf shows: + 24.49% [k] generic_smp_call_function_interrupt - 21.72% [k] _raw_spin_lock - _raw_spin_lock + 79.80% __page_check_address + 6.42% generic_smp_call_function_interrupt + 3.31% get_swap_page + 2.37% free_pcppages_bulk + 1.75% handle_pte_fault + 1.54% put_super + 1.41% grab_super_passive + 1.36% __swap_duplicate + 0.68% blk_flush_plug_list + 0.62% swap_info_get + 6.55% [k] flush_tlb_func + 6.46% [k] smp_call_function_many + 5.09% [k] call_function_interrupt + 4.75% [k] default_send_IPI_mask_sequence_phys + 2.18% [k] find_next_bit swapout throughput is around 1300M/s. With the patch, perf shows: - 27.23% [k] _raw_spin_lock - _raw_spin_lock + 80.53% __page_check_address + 8.39% generic_smp_call_function_single_interrupt + 2.44% get_swap_page + 1.76% free_pcppages_bulk + 1.40% handle_pte_fault + 1.15% __swap_duplicate + 1.05% put_super + 0.98% grab_super_passive + 0.86% blk_flush_plug_list + 0.57% swap_info_get + 8.25% [k] default_send_IPI_mask_sequence_phys + 7.55% [k] call_function_interrupt + 7.47% [k] smp_call_function_many + 7.25% [k] flush_tlb_func + 3.81% [k] _raw_spin_lock_irqsave + 3.78% [k] generic_smp_call_function_single_interrupt swapout throughput is around 1400M/s. So there is around a 7% improvement, and total cpu utilization doesn't change. Without the patch, cfd_data is shared by all CPUs. generic_smp_call_function_interrupt does read/write cfd_data several times which will create a lot of cache ping-pong. With the patch, the data becomes per-cpu. The ping-pong is avoided. And from the perf data, this doesn't make call_single_queue lock contend. Next step is to remove generic_smp_call_function_interrupt() from arch code. Signed-off-by: Shaohua Li Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Steven Rostedt Cc: Jens Axboe Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 183 ++++++++++------------------------------------------------- 1 file changed, 30 insertions(+), 153 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 69f38bd98b42..8e451f3ff51b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -16,22 +16,12 @@ #include "smpboot.h" #ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static struct { - struct list_head queue; - raw_spinlock_t lock; -} call_function __cacheline_aligned_in_smp = - { - .queue = LIST_HEAD_INIT(call_function.queue), - .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), - }; - enum { CSD_FLAG_LOCK = 0x01, }; struct call_function_data { - struct call_single_data csd; - atomic_t refs; + struct call_single_data __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; @@ -60,6 +50,11 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); + cfd->csd = alloc_percpu(struct call_single_data); + if (!cfd->csd) { + free_cpumask_var(cfd->cpumask); + return notifier_from_errno(-ENOMEM); + } break; #ifdef CONFIG_HOTPLUG_CPU @@ -70,6 +65,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); + free_percpu(cfd->csd); break; #endif }; @@ -170,85 +166,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) csd_lock_wait(data); } -/* - * Invoked by arch to handle an IPI for call function. Must be called with - * interrupts disabled. - */ -void generic_smp_call_function_interrupt(void) -{ - struct call_function_data *data; - int cpu = smp_processor_id(); - - /* - * Shouldn't receive this interrupt on a cpu that is not yet online. - */ - WARN_ON_ONCE(!cpu_online(cpu)); - - /* - * Ensure entry is visible on call_function_queue after we have - * entered the IPI. See comment in smp_call_function_many. - * If we don't have this, then we may miss an entry on the list - * and never get another IPI to process it. - */ - smp_mb(); - - /* - * It's ok to use list_for_each_rcu() here even though we may - * delete 'pos', since list_del_rcu() doesn't clear ->next - */ - list_for_each_entry_rcu(data, &call_function.queue, csd.list) { - int refs; - smp_call_func_t func; - - /* - * Since we walk the list without any locks, we might - * see an entry that was completed, removed from the - * list and is in the process of being reused. - * - * We must check that the cpu is in the cpumask before - * checking the refs, and both must be set before - * executing the callback on this cpu. - */ - - if (!cpumask_test_cpu(cpu, data->cpumask)) - continue; - - smp_rmb(); - - if (atomic_read(&data->refs) == 0) - continue; - - func = data->csd.func; /* save for later warn */ - func(data->csd.info); - - /* - * If the cpu mask is not still set then func enabled - * interrupts (BUG), and this cpu took another smp call - * function interrupt and executed func(info) twice - * on this cpu. That nested execution decremented refs. - */ - if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { - WARN(1, "%pf enabled interrupts and double executed\n", func); - continue; - } - - refs = atomic_dec_return(&data->refs); - WARN_ON(refs < 0); - - if (refs) - continue; - - WARN_ON(!cpumask_empty(data->cpumask)); - - raw_spin_lock(&call_function.lock); - list_del_rcu(&data->csd.list); - raw_spin_unlock(&call_function.lock); - - csd_unlock(&data->csd); - } - -} - /* * Invoked by arch to handle an IPI for call function single. Must be * called from the arch with interrupts disabled. @@ -453,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { struct call_function_data *data; - unsigned long flags; - int refs, cpu, next_cpu, this_cpu = smp_processor_id(); + int cpu, next_cpu, this_cpu = smp_processor_id(); /* * Can deadlock when called with interrupts disabled. @@ -486,50 +402,13 @@ void smp_call_function_many(const struct cpumask *mask, } data = &__get_cpu_var(cfd_data); - csd_lock(&data->csd); - - /* This BUG_ON verifies our reuse assertions and can be removed */ - BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); - - /* - * The global call function queue list add and delete are protected - * by a lock, but the list is traversed without any lock, relying - * on the rcu list add and delete to allow safe concurrent traversal. - * We reuse the call function data without waiting for any grace - * period after some other cpu removes it from the global queue. - * This means a cpu might find our data block as it is being - * filled out. - * - * We hold off the interrupt handler on the other cpu by - * ordering our writes to the cpu mask vs our setting of the - * refs counter. We assert only the cpu owning the data block - * will set a bit in cpumask, and each bit will only be cleared - * by the subject cpu. Each cpu must first find its bit is - * set and then check that refs is set indicating the element is - * ready to be processed, otherwise it must skip the entry. - * - * On the previous iteration refs was set to 0 by another cpu. - * To avoid the use of transitivity, set the counter to 0 here - * so the wmb will pair with the rmb in the interrupt handler. - */ - atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ - - data->csd.func = func; - data->csd.info = info; - /* Ensure 0 refs is visible before mask. Also orders func and info */ - smp_wmb(); - - /* We rely on the "and" being processed before the store */ cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); - refs = cpumask_weight(data->cpumask); /* Some callers race with other cpus changing the passed mask */ - if (unlikely(!refs)) { - csd_unlock(&data->csd); + if (unlikely(!cpumask_weight(data->cpumask))) return; - } /* * After we put an entry into the list, data->cpumask @@ -537,34 +416,32 @@ void smp_call_function_many(const struct cpumask *mask, * a SMP function call, so data->cpumask will be zero. */ cpumask_copy(data->cpumask_ipi, data->cpumask); - raw_spin_lock_irqsave(&call_function.lock, flags); - /* - * Place entry at the _HEAD_ of the list, so that any cpu still - * observing the entry in generic_smp_call_function_interrupt() - * will not miss any other list entries: - */ - list_add_rcu(&data->csd.list, &call_function.queue); - /* - * We rely on the wmb() in list_add_rcu to complete our writes - * to the cpumask before this write to refs, which indicates - * data is on the list and is ready to be processed. - */ - atomic_set(&data->refs, refs); - raw_spin_unlock_irqrestore(&call_function.lock, flags); - /* - * Make the list addition visible before sending the ipi. - * (IPIs must obey or appear to obey normal Linux cache - * coherency rules -- see comment in generic_exec_single). - */ - smp_mb(); + for_each_cpu(cpu, data->cpumask) { + struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); + struct call_single_queue *dst = + &per_cpu(call_single_queue, cpu); + unsigned long flags; + + csd_lock(csd); + csd->func = func; + csd->info = info; + + raw_spin_lock_irqsave(&dst->lock, flags); + list_add_tail(&csd->list, &dst->list); + raw_spin_unlock_irqrestore(&dst->lock, flags); + } /* Send a message to all CPUs in the map */ arch_send_call_function_ipi_mask(data->cpumask_ipi); - /* Optionally wait for the CPUs to complete */ - if (wait) - csd_lock_wait(&data->csd); + if (wait) { + for_each_cpu(cpu, data->cpumask) { + struct call_single_data *csd = + per_cpu_ptr(data->csd, cpu); + csd_lock_wait(csd); + } + } } EXPORT_SYMBOL(smp_call_function_many); -- cgit v1.2.3 From 7fe5e04292e71af34ae171b88caa2a139e0b6125 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 21 Feb 2013 16:43:06 -0800 Subject: sys_prctl(): arg2 is unsigned long which is never < 0 arg2 will never < 0, for its type is 'unsigned long' Also, use the provided macros. Signed-off-by: Chen Gang Reported-by: Cyrill Gorcunov Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 265b37690421..83261059676c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -47,6 +47,7 @@ #include #include #include +#include #include /* Move somewhere else to avoid recompiling? */ @@ -2026,7 +2027,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = get_dumpable(me->mm); break; case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 1) { + if (arg2 != SUID_DUMP_DISABLE && + arg2 != SUID_DUMP_USER) { error = -EINVAL; break; } -- cgit v1.2.3 From f3cbd435b02fb45efc2c8a39c2ea19816669c412 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 21 Feb 2013 16:43:07 -0800 Subject: sys_prctl(): coding-style cleanup Remove a tabstop from the switch statement, in the usual fashion. A few instances of weirdwrapping were removed as a result. Cc: Chen Gang Cc: Cyrill Gorcunov Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 288 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 143 insertions(+), 145 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 83261059676c..840cfdad7bfc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2013,161 +2013,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = 0; switch (option) { - case PR_SET_PDEATHSIG: - if (!valid_signal(arg2)) { - error = -EINVAL; - break; - } - me->pdeath_signal = arg2; - break; - case PR_GET_PDEATHSIG: - error = put_user(me->pdeath_signal, (int __user *)arg2); - break; - case PR_GET_DUMPABLE: - error = get_dumpable(me->mm); + case PR_SET_PDEATHSIG: + if (!valid_signal(arg2)) { + error = -EINVAL; break; - case PR_SET_DUMPABLE: - if (arg2 != SUID_DUMP_DISABLE && - arg2 != SUID_DUMP_USER) { - error = -EINVAL; - break; - } - set_dumpable(me->mm, arg2); + } + me->pdeath_signal = arg2; + break; + case PR_GET_PDEATHSIG: + error = put_user(me->pdeath_signal, (int __user *)arg2); + break; + case PR_GET_DUMPABLE: + error = get_dumpable(me->mm); + break; + case PR_SET_DUMPABLE: + if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { + error = -EINVAL; break; + } + set_dumpable(me->mm, arg2); + break; - case PR_SET_UNALIGN: - error = SET_UNALIGN_CTL(me, arg2); - break; - case PR_GET_UNALIGN: - error = GET_UNALIGN_CTL(me, arg2); - break; - case PR_SET_FPEMU: - error = SET_FPEMU_CTL(me, arg2); - break; - case PR_GET_FPEMU: - error = GET_FPEMU_CTL(me, arg2); - break; - case PR_SET_FPEXC: - error = SET_FPEXC_CTL(me, arg2); - break; - case PR_GET_FPEXC: - error = GET_FPEXC_CTL(me, arg2); - break; - case PR_GET_TIMING: - error = PR_TIMING_STATISTICAL; - break; - case PR_SET_TIMING: - if (arg2 != PR_TIMING_STATISTICAL) - error = -EINVAL; - break; - case PR_SET_NAME: - comm[sizeof(me->comm)-1] = 0; - if (strncpy_from_user(comm, (char __user *)arg2, - sizeof(me->comm) - 1) < 0) - return -EFAULT; - set_task_comm(me, comm); - proc_comm_connector(me); - break; - case PR_GET_NAME: - get_task_comm(comm, me); - if (copy_to_user((char __user *)arg2, comm, - sizeof(comm))) - return -EFAULT; - break; - case PR_GET_ENDIAN: - error = GET_ENDIAN(me, arg2); - break; - case PR_SET_ENDIAN: - error = SET_ENDIAN(me, arg2); - break; - case PR_GET_SECCOMP: - error = prctl_get_seccomp(); - break; - case PR_SET_SECCOMP: - error = prctl_set_seccomp(arg2, (char __user *)arg3); - break; - case PR_GET_TSC: - error = GET_TSC_CTL(arg2); - break; - case PR_SET_TSC: - error = SET_TSC_CTL(arg2); - break; - case PR_TASK_PERF_EVENTS_DISABLE: - error = perf_event_task_disable(); - break; - case PR_TASK_PERF_EVENTS_ENABLE: - error = perf_event_task_enable(); - break; - case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; - break; - case PR_SET_TIMERSLACK: - if (arg2 <= 0) - current->timer_slack_ns = + case PR_SET_UNALIGN: + error = SET_UNALIGN_CTL(me, arg2); + break; + case PR_GET_UNALIGN: + error = GET_UNALIGN_CTL(me, arg2); + break; + case PR_SET_FPEMU: + error = SET_FPEMU_CTL(me, arg2); + break; + case PR_GET_FPEMU: + error = GET_FPEMU_CTL(me, arg2); + break; + case PR_SET_FPEXC: + error = SET_FPEXC_CTL(me, arg2); + break; + case PR_GET_FPEXC: + error = GET_FPEXC_CTL(me, arg2); + break; + case PR_GET_TIMING: + error = PR_TIMING_STATISTICAL; + break; + case PR_SET_TIMING: + if (arg2 != PR_TIMING_STATISTICAL) + error = -EINVAL; + break; + case PR_SET_NAME: + comm[sizeof(me->comm) - 1] = 0; + if (strncpy_from_user(comm, (char __user *)arg2, + sizeof(me->comm) - 1) < 0) + return -EFAULT; + set_task_comm(me, comm); + proc_comm_connector(me); + break; + case PR_GET_NAME: + get_task_comm(comm, me); + if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) + return -EFAULT; + break; + case PR_GET_ENDIAN: + error = GET_ENDIAN(me, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(me, arg2); + break; + case PR_GET_SECCOMP: + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: + error = prctl_set_seccomp(arg2, (char __user *)arg3); + break; + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); + break; + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); + break; + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); + break; + case PR_GET_TIMERSLACK: + error = current->timer_slack_ns; + break; + case PR_SET_TIMERSLACK: + if (arg2 <= 0) + current->timer_slack_ns = current->default_timer_slack_ns; - else - current->timer_slack_ns = arg2; - break; - case PR_MCE_KILL: - if (arg4 | arg5) - return -EINVAL; - switch (arg2) { - case PR_MCE_KILL_CLEAR: - if (arg3 != 0) - return -EINVAL; - current->flags &= ~PF_MCE_PROCESS; - break; - case PR_MCE_KILL_SET: - current->flags |= PF_MCE_PROCESS; - if (arg3 == PR_MCE_KILL_EARLY) - current->flags |= PF_MCE_EARLY; - else if (arg3 == PR_MCE_KILL_LATE) - current->flags &= ~PF_MCE_EARLY; - else if (arg3 == PR_MCE_KILL_DEFAULT) - current->flags &= - ~(PF_MCE_EARLY|PF_MCE_PROCESS); - else - return -EINVAL; - break; - default: + else + current->timer_slack_ns = arg2; + break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case PR_MCE_KILL_CLEAR: + if (arg3 != 0) return -EINVAL; - } + current->flags &= ~PF_MCE_PROCESS; break; - case PR_MCE_KILL_GET: - if (arg2 | arg3 | arg4 | arg5) - return -EINVAL; - if (current->flags & PF_MCE_PROCESS) - error = (current->flags & PF_MCE_EARLY) ? - PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + case PR_MCE_KILL_SET: + current->flags |= PF_MCE_PROCESS; + if (arg3 == PR_MCE_KILL_EARLY) + current->flags |= PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_LATE) + current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); else - error = PR_MCE_KILL_DEFAULT; - break; - case PR_SET_MM: - error = prctl_set_mm(arg2, arg3, arg4, arg5); - break; - case PR_GET_TID_ADDRESS: - error = prctl_get_tid_address(me, (int __user **)arg2); - break; - case PR_SET_CHILD_SUBREAPER: - me->signal->is_child_subreaper = !!arg2; - break; - case PR_GET_CHILD_SUBREAPER: - error = put_user(me->signal->is_child_subreaper, - (int __user *) arg2); - break; - case PR_SET_NO_NEW_PRIVS: - if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; - - current->no_new_privs = 1; break; - case PR_GET_NO_NEW_PRIVS: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - return current->no_new_privs ? 1 : 0; default: - error = -EINVAL; - break; + return -EINVAL; + } + break; + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? + PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; + case PR_SET_CHILD_SUBREAPER: + me->signal->is_child_subreaper = !!arg2; + break; + case PR_GET_CHILD_SUBREAPER: + error = put_user(me->signal->is_child_subreaper, + (int __user *)arg2); + break; + case PR_SET_NO_NEW_PRIVS: + if (arg2 != 1 || arg3 || arg4 || arg5) + return -EINVAL; + + current->no_new_privs = 1; + break; + case PR_GET_NO_NEW_PRIVS: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + return current->no_new_privs ? 1 : 0; + default: + error = -EINVAL; + break; } return error; } -- cgit v1.2.3 From d7d48f6216686602e6e3d8470563326605b01c95 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Thu, 21 Feb 2013 16:44:21 -0800 Subject: kernel/nsproxy.c: remove duplicate task_cred_xxx for user_ns We can use user_ns, which is also assigned from task_cred_xxx(tsk, user_ns), at the beginning of copy_namespaces(). Signed-off-by: Yuanhan Liu Acked-by: Serge Hallyn Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 78e2ecb20165..b781e66a8f2c 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - new_ns = create_new_namespaces(flags, tsk, - task_cred_xxx(tsk, user_ns), tsk->fs); + new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); goto out; -- cgit v1.2.3 From cb152ff26717961b10d0888cd983ba284cb99cd1 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 21 Feb 2013 15:15:08 -0800 Subject: sched: Fix /proc/sched_stat failure on very very large systems On systems with 4096 cores doing a cat /proc/sched_stat fails, because we are trying to push all the data into a single kmalloc buffer. The issue is on these very large machines all the data will not fit in 4mb. A better solution is to not use the single_open() mechanism but to provide our own seq_operations. The output should be identical to previous version and thus not need the version number. Reported-by: Dave Jones Signed-off-by: Nathan Zimmer Cc: Peter Zijlstra Cc: Wu Fengguang [ Fix memleak] [ Fix spello in comment] [ Fix warnings] Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched/stats.c | 79 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 903ffa9e8872..e036eda1a9c9 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v) if (mask_str == NULL) return -ENOMEM; - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + if (v == (void *)1) { + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + } else { + struct rq *rq; #ifdef CONFIG_SMP struct sched_domain *sd; int dcount = 0; #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); /* runqueue-specific stats */ seq_printf(seq, @@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v) return 0; } -static int schedstat_open(struct inode *inode, struct file *file) +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *schedstat_start(struct seq_file *file, loff_t *offset) { - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; + unsigned long n = *offset; - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return schedstat_start(file, offset); +} + +static void schedstat_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations schedstat_sops = { + .start = schedstat_start, + .next = schedstat_next, + .stop = schedstat_stop, + .show = show_schedstat, +}; + +static int schedstat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &schedstat_sops); } +static int schedstat_release(struct inode *inode, struct file *file) +{ + return 0; +}; + static const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = schedstat_release, }; static int __init proc_schedstat_init(void) -- cgit v1.2.3 From bbbfeac92beff40eb86c7f682a7f1395f9f0ae52 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 21 Feb 2013 15:15:09 -0800 Subject: sched: Fix /proc/sched_debug failure on very very large systems On systems with 4096 cores attemping to read /proc/sched_debug fails because we are trying to push all the data into a single kmalloc buffer. The issue is on these very large machines all the data will not fit in 4mb. A better solution is to not us the single_open mechanism but to provide our own seq_operations and treat each cpu as an individual record. The output should be identical to the previous version. Reported-by: Dave Jones Signed-off-by: Nathan Zimmer Cc: Peter Zijlstra ) [ Whitespace fixlet] [ Fix spello in comment] Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 90 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 79 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7ae4c4c5420e..c496eb3c6459 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -269,11 +269,11 @@ static void print_cpu(struct seq_file *m, int cpu) { unsigned int freq = cpu_khz ? : 1; - SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", + SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } #else - SEQ_printf(m, "\ncpu#%d\n", cpu); + SEQ_printf(m, "cpu#%d\n", cpu); #endif #define P(x) \ @@ -330,6 +330,7 @@ do { \ print_rq(m, rq, cpu); rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); + SEQ_printf(m, "\n"); } static const char *sched_tunable_scaling_names[] = { @@ -338,11 +339,10 @@ static const char *sched_tunable_scaling_names[] = { "linear" }; -static int sched_debug_show(struct seq_file *m, void *v) +static void sched_debug_header(struct seq_file *m) { u64 ktime, sched_clk, cpu_clk; unsigned long flags; - int cpu; local_irq_save(flags); ktime = ktime_to_ns(ktime_get()); @@ -384,33 +384,101 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P - SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + SEQ_printf(m, " .%-40s: %d (%s)\n", + "sysctl_sched_tunable_scaling", sysctl_sched_tunable_scaling, sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + SEQ_printf(m, "\n"); +} - for_each_online_cpu(cpu) - print_cpu(m, cpu); +static int sched_debug_show(struct seq_file *m, void *v) +{ + int cpu = (unsigned long)(v - 2); - SEQ_printf(m, "\n"); + if (cpu != -1) + print_cpu(m, cpu); + else + sched_debug_header(m); return 0; } void sysrq_sched_debug_show(void) { - sched_debug_show(NULL, NULL); + int cpu; + + sched_debug_header(NULL); + for_each_online_cpu(cpu) + print_cpu(NULL, cpu); + +} + +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *sched_debug_start(struct seq_file *file, loff_t *offset) +{ + unsigned long n = *offset; + + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return sched_debug_start(file, offset); +} + +static void sched_debug_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations sched_debug_sops = { + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, +}; + +static int sched_debug_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + + return 0; } static int sched_debug_open(struct inode *inode, struct file *filp) { - return single_open(filp, sched_debug_show, NULL); + int ret = 0; + + ret = seq_open(filp, &sched_debug_sops); + + return ret; } static const struct file_operations sched_debug_fops = { .open = sched_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = sched_debug_release, }; static int __init init_sched_debug_procfs(void) -- cgit v1.2.3 From 496ad9aa8ef448058e36ca7a787c61f2e63f0f54 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Jan 2013 17:07:38 -0500 Subject: new helper: file_inode(file) Signed-off-by: Al Viro --- kernel/acct.c | 2 +- kernel/cgroup.c | 6 +++--- kernel/events/core.c | 2 +- kernel/fork.c | 2 +- kernel/irq/proc.c | 2 +- kernel/nsproxy.c | 2 +- kernel/relay.c | 4 ++-- kernel/sys.c | 8 ++++---- 8 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 051e071a06e7..0d2981358e08 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname) if (IS_ERR(file)) return PTR_ERR(file); - if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { + if (!S_ISREG(file_inode(file)->i_mode)) { filp_close(file, NULL); return -EACCES; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4855892798fd..4fe52b3b6ef6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2637,7 +2637,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un */ static inline struct cftype *__file_cft(struct file *file) { - if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) + if (file_inode(file)->i_fop != &cgroup_file_operations) return ERR_PTR(-EINVAL); return __d_cft(file->f_dentry); } @@ -3852,7 +3852,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); + ret = inode_permission(file_inode(cfile), MAY_READ); if (ret < 0) goto fail; @@ -5441,7 +5441,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) struct inode *inode; struct cgroup_subsys_state *css; - inode = f->f_dentry->d_inode; + inode = file_inode(f); /* check in cgroup filesystem dir */ if (inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); diff --git a/kernel/events/core.c b/kernel/events/core.c index 301079d06f24..3b106554b42e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3682,7 +3682,7 @@ unlock: static int perf_fasync(int fd, struct file *filp, int on) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct perf_event *event = filp->private_data; int retval; diff --git a/kernel/fork.c b/kernel/fork.c index c535f33bbb9c..4ff724f81f25 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; get_file(file); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4bd4faa6323a..397db02209ed 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; + unsigned int irq = (int)(long)PDE(file_inode(file))->data; cpumask_var_t new_value; int err; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 78e2ecb20165..c057104bf05c 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -251,7 +251,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) return PTR_ERR(file); err = -EINVAL; - ei = PROC_I(file->f_dentry->d_inode); + ei = PROC_I(file_inode(file)); ops = ei->ns_ops; if (nstype && (ops->type != nstype)) goto out; diff --git a/kernel/relay.c b/kernel/relay.c index e8cd2027abbd..01ab081ac53a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_lock(&file_inode(filp)->i_mutex); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&file_inode(filp)->i_mutex); return desc->written; } diff --git a/kernel/sys.c b/kernel/sys.c index 265b37690421..e3932ea50ec8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1792,14 +1792,14 @@ SYSCALL_DEFINE1(umask, int, mask) static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; - struct dentry *dentry; + struct inode *inode; int err; exe = fdget(fd); if (!exe.file) return -EBADF; - dentry = exe.file->f_path.dentry; + inode = file_inode(exe.file); /* * Because the original mm->exe_file points to executable file, make @@ -1807,11 +1807,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) * overall picture. */ err = -EACCES; - if (!S_ISREG(dentry->d_inode->i_mode) || + if (!S_ISREG(inode->i_mode) || exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; - err = inode_permission(dentry->d_inode, MAY_EXEC); + err = inode_permission(inode, MAY_EXEC); if (err) goto exit; -- cgit v1.2.3 From aa00d89c2780d72d082a015e8cbb751e65fb30ee Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:33 -0800 Subject: sched: do not use cpu_to_node() to find an offlined cpu's node. If a cpu is offline, its nid will be set to -1, and cpu_to_node(cpu) will return -1. As a result, cpumask_of_node(nid) will return NULL. In this case, find_next_bit() in for_each_cpu will get a NULL pointer and cause panic. Here is a call trace: Call Trace: select_fallback_rq+0x71/0x190 try_to_wake_up+0x2cb/0x2f0 wake_up_process+0x15/0x20 hrtimer_wakeup+0x22/0x30 __run_hrtimer+0x83/0x320 hrtimer_interrupt+0x106/0x280 smp_apic_timer_interrupt+0x69/0x99 apic_timer_interrupt+0x6f/0x80 There is a hrtimer process sleeping, whose cpu has already been offlined. When it is waken up, it tries to find another cpu to run, and get a -1 nid. As a result, cpumask_of_node(-1) returns NULL, and causes ernel panic. This patch fixes this problem by judging if the nid is -1. If nid is not -1, a cpu on the same node will be picked. Else, a online cpu on another node will be picked. Signed-off-by: Tang Chen Signed-off-by: Wen Congyang Cc: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a673a3b0c6b..053dfd7692d1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int select_fallback_rq(int cpu, struct task_struct *p) { - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + int nid = cpu_to_node(cpu); + const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; int dest_cpu; - /* Look for allowed, online CPU in same node. */ - for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - return dest_cpu; + /* + * If the node that the cpu is on has been offlined, cpu_to_node() + * will return -1. There is no cpu on the node, and we should + * select the cpu on the other node. + */ + if (nid != -1) { + nodemask = cpumask_of_node(nid); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu(dest_cpu, nodemask) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return dest_cpu; + } } for (;;) { -- cgit v1.2.3 From 75f7ad8e043d9383337d917584297f7737154bbf Mon Sep 17 00:00:00 2001 From: Paul Szabo Date: Fri, 22 Feb 2013 16:34:42 -0800 Subject: page-writeback.c: subtract min_free_kbytes from dirtyable memory When calculating amount of dirtyable memory, min_free_kbytes should be subtracted because it is not intended for dirty pages. Addresses http://bugs.debian.org/695182 [akpm@linux-foundation.org: fix up min_free_kbytes extern declarations] [akpm@linux-foundation.org: fix min() warning] Signed-off-by: Paul Szabo Acked-by: Rik van Riel Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 467d8b923fcd..95e9e55602a8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -105,7 +105,6 @@ extern char core_pattern[]; extern unsigned int core_pipe_limit; #endif extern int pid_max; -extern int min_free_kbytes; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; -- cgit v1.2.3 From 7f6575f1fb963d5231afbceecd3feadb6ab58cd3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 23 Feb 2013 17:28:45 +0100 Subject: cputime: Use local_clock() for full dynticks cputime accounting Running the full dynticks cputime accounting with preemptible kernel debugging trigger the following warning: [ 4.488303] BUG: using smp_processor_id() in preemptible [00000000] code: init/1 [ 4.490971] caller is native_sched_clock+0x22/0x80 [ 4.493663] Pid: 1, comm: init Not tainted 3.8.0+ #13 [ 4.496376] Call Trace: [ 4.498996] [] debug_smp_processor_id+0xdb/0xf0 [ 4.501716] [] native_sched_clock+0x22/0x80 [ 4.504434] [] sched_clock+0x9/0x10 [ 4.507185] [] fetch_task_cputime+0xad/0x120 [ 4.509916] [] task_cputime+0x35/0x60 [ 4.512622] [] acct_update_integrals+0x1e/0x40 [ 4.515372] [] do_execve_common+0x4ff/0x5c0 [ 4.518117] [] ? do_execve_common+0x144/0x5c0 [ 4.520844] [] ? rest_init+0x160/0x160 [ 4.523554] [] do_execve+0x37/0x40 [ 4.526276] [] run_init_process+0x23/0x30 [ 4.528953] [] kernel_init+0x9c/0xf0 [ 4.531608] [] ret_from_fork+0x7c/0xb0 We use sched_clock() to perform and fixup the cputime accounting. However we are calling it with preemption enabled from the read side, which trigger the bug above. To fix this up, use local_clock() instead. It takes care of preemption and also provide a more reliable clock source. This is welcome for this kind of statistic that is widely relied on in userspace. Reported-by: Thomas Gleixner Reported-by: Ingo Molnar Suggested-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Kevin Hilman Link: http://lkml.kernel.org/r/1361636925-22288-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9857329ed280..ed12cbb135f4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -604,7 +604,7 @@ static unsigned long long vtime_delta(struct task_struct *tsk) { unsigned long long clock; - clock = sched_clock(); + clock = local_clock(); if (clock < tsk->vtime_snap) return 0; -- cgit v1.2.3 From 3dadecce20603aa380023c65e6f55f108fd5e952 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Jan 2013 02:18:08 -0500 Subject: switch vfs_getattr() to struct path Signed-off-by: Al Viro --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b10b048367e1..950076eb3273 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2519,7 +2519,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) if (err) goto out; - err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); + err = vfs_getattr(&file->f_path, &stat); if (err) goto out; -- cgit v1.2.3 From 7bb307e894d51308aa0582a8c4cc5875bbc645b9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Feb 2013 14:51:48 -0500 Subject: export kernel_write(), convert open-coded instances Signed-off-by: Al Viro --- kernel/sysctl_binary.c | 39 +++++++-------------------------------- 1 file changed, 7 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 5a6384450501..37f240fec37a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -971,7 +971,6 @@ out: static ssize_t bin_intvec(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t copied = 0; char *buffer; ssize_t result; @@ -984,13 +983,10 @@ static ssize_t bin_intvec(struct file *file, if (oldval && oldlen) { unsigned __user *vec = oldval; size_t length = oldlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; - set_fs(KERNEL_DS); - result = vfs_read(file, buffer, BUFSZ - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buffer, BUFSZ - 1); if (result < 0) goto out_kfree; @@ -1017,7 +1013,6 @@ static ssize_t bin_intvec(struct file *file, if (newval && newlen) { unsigned __user *vec = newval; size_t length = newlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; @@ -1033,9 +1028,7 @@ static ssize_t bin_intvec(struct file *file, str += snprintf(str, end - str, "%lu\t", value); } - set_fs(KERNEL_DS); - result = vfs_write(file, buffer, str - buffer, &pos); - set_fs(old_fs); + result = kernel_write(file, buffer, str - buffer, 0); if (result < 0) goto out_kfree; } @@ -1049,7 +1042,6 @@ out: static ssize_t bin_ulongvec(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t copied = 0; char *buffer; ssize_t result; @@ -1062,13 +1054,10 @@ static ssize_t bin_ulongvec(struct file *file, if (oldval && oldlen) { unsigned long __user *vec = oldval; size_t length = oldlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; - set_fs(KERNEL_DS); - result = vfs_read(file, buffer, BUFSZ - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buffer, BUFSZ - 1); if (result < 0) goto out_kfree; @@ -1095,7 +1084,6 @@ static ssize_t bin_ulongvec(struct file *file, if (newval && newlen) { unsigned long __user *vec = newval; size_t length = newlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; @@ -1111,9 +1099,7 @@ static ssize_t bin_ulongvec(struct file *file, str += snprintf(str, end - str, "%lu\t", value); } - set_fs(KERNEL_DS); - result = vfs_write(file, buffer, str - buffer, &pos); - set_fs(old_fs); + result = kernel_write(file, buffer, str - buffer, 0); if (result < 0) goto out_kfree; } @@ -1127,19 +1113,15 @@ out: static ssize_t bin_uuid(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t result, copied = 0; /* Only supports reads */ if (oldval && oldlen) { - loff_t pos = 0; char buf[40], *str = buf; unsigned char uuid[16]; int i; - set_fs(KERNEL_DS); - result = vfs_read(file, buf, sizeof(buf) - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) goto out; @@ -1175,18 +1157,14 @@ out: static ssize_t bin_dn_node_address(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t result, copied = 0; if (oldval && oldlen) { - loff_t pos = 0; char buf[15], *nodep; unsigned long area, node; __le16 dnaddr; - set_fs(KERNEL_DS); - result = vfs_read(file, buf, sizeof(buf) - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) goto out; @@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file, } if (newval && newlen) { - loff_t pos = 0; __le16 dnaddr; char buf[15]; int len; @@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file, le16_to_cpu(dnaddr) >> 10, le16_to_cpu(dnaddr) & 0x3ff); - set_fs(KERNEL_DS); - result = vfs_write(file, buf, len, &pos); - set_fs(old_fs); + result = kernel_write(file, buf, len, 0); if (result < 0) goto out; } -- cgit v1.2.3 From 6131ffaa1f091415b7a24abb01f033d9c0a727f4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Feb 2013 16:59:05 -0500 Subject: more file_inode() open-coded instances Signed-off-by: Al Viro --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index fbc07a29ec53..f0090a993dab 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -226,7 +226,7 @@ static void drop_futex_key_refs(union futex_key *key) * Returns a negative error code or 0 * The key words are stored in *key on success. * - * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, + * For shared mappings, it's (page->index, file_inode(vma->vm_file), * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * -- cgit v1.2.3 From 7ff6764061ecd4a4ef91db7b8b30aacc6a8573c9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Feb 2013 17:02:52 -0800 Subject: usermodehelper: cleanup/fix __orderly_poweroff() && argv_free() __orderly_poweroff() does argv_free() if call_usermodehelper_fns() returns -ENOMEM. As Lucas pointed out, this can be wrong if -ENOMEM was not triggered by the failing call_usermodehelper_setup(), in this case both __orderly_poweroff() and argv_cleanup() can do kfree(). Kill argv_cleanup() and change __orderly_poweroff() to call argv_free() unconditionally like do_coredump() does. This info->cleanup() is not needed (and wrong) since 6c0c0d4d "fix bug in orderly_poweroff() which did the UMH_NO_WAIT => UMH_WAIT_EXEC change, we can rely on the fact that CLONE_VFORK can't return until do_execve() succeeds/fails. Signed-off-by: Oleg Nesterov Reported-by: Lucas De Marchi Cc: David Howells Cc: James Morris Cc: hongfeng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index e10566bee399..81f56445fba9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2185,11 +2185,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static void argv_cleanup(struct subprocess_info *info) -{ - argv_free(info->argv); -} - static int __orderly_poweroff(void) { int argc; @@ -2209,9 +2204,8 @@ static int __orderly_poweroff(void) } ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, - NULL, argv_cleanup, NULL); - if (ret == -ENOMEM) - argv_free(argv); + NULL, NULL, NULL); + argv_free(argv); return ret; } -- cgit v1.2.3 From 66dd34ad31e5963d72a700ec3f2449291d322921 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 27 Feb 2013 17:03:12 -0800 Subject: signal: allow to send any siginfo to itself The idea is simple. We need to get the siginfo for each signal on checkpointing dump, and then return it back on restore. The first problem is that the kernel doesn't report complete siginfos to userspace. In a signal handler the kernel strips SI_CODE from siginfo. When a siginfo is received from signalfd, it has a different format with fixed sizes of fields. The interface of signalfd was extended. If a signalfd is created with the flag SFD_RAW, it returns siginfo in a raw format. rt_sigqueueinfo looks suitable for restoring signals, but it can't send siginfo with a positive si_code, because these codes are reserved for the kernel. In the real world each person has right to do anything with himself, so I think a process should able to send any siginfo to itself. This patch: The kernel prevents sending of siginfo with positive si_code, because these codes are reserved for kernel. I think we can allow a task to send such a siginfo to itself. This operation should not be dangerous. This functionality is required for restoring signals in checkpoint/restart. Signed-off-by: Andrey Vagin Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Al Viro Cc: Michael Kerrisk Cc: Pavel Emelyanov Cc: Cyrill Gorcunov Cc: Michael Kerrisk Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2a7ae2963185..c6209135cca4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2996,7 +2996,8 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ - if (info->si_code >= 0 || info->si_code == SI_TKILL) { + if ((info->si_code >= 0 || info->si_code == SI_TKILL) && + (task_pid_vnr(current) != pid)) { /* We used to allow any < 0 si_code */ WARN_ON_ONCE(info->si_code < 0); return -EPERM; @@ -3045,7 +3046,8 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ - if (info->si_code >= 0 || info->si_code == SI_TKILL) { + if (((info->si_code >= 0 || info->si_code == SI_TKILL)) && + (task_pid_vnr(current) != pid)) { /* We used to allow any < 0 si_code */ WARN_ON_ONCE(info->si_code < 0); return -EPERM; -- cgit v1.2.3 From 5d1fadc1472396d602f0eeb10d37519e2a14e8bc Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Wed, 27 Feb 2013 17:03:13 -0800 Subject: kernel/signal.c: fix suboptimal printk usage Several printk's were missing KERN_INFO and KERN_CONT flags. In addition, a printk that was outside a #if/#endif should have been inside, which would result in stray blank line on non-x86 boxes. Signed-off-by: Valdis Kletnieks Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c6209135cca4..2676aac4103d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1157,11 +1157,11 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, static void print_fatal_signal(int signr) { struct pt_regs *regs = signal_pt_regs(); - printk("%s/%d: potentially unexpected fatal signal %d.\n", + printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", current->comm, task_pid_nr(current), signr); #if defined(__i386__) && !defined(__arch_um__) - printk("code at %08lx: ", regs->ip); + printk(KERN_INFO "code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { @@ -1169,11 +1169,11 @@ static void print_fatal_signal(int signr) if (get_user(insn, (unsigned char *)(regs->ip + i))) break; - printk("%02x ", insn); + printk(KERN_CONT "%02x ", insn); } } + printk(KERN_CONT "\n"); #endif - printk("\n"); preempt_disable(); show_regs(regs); preempt_enable(); -- cgit v1.2.3 From e579d2c259be42b6f29458327e5153b22414b031 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 27 Feb 2013 17:03:15 -0800 Subject: coredump: remove redundant defines for dumpable states The existing SUID_DUMP_* defines duplicate the newer SUID_DUMPABLE_* defines introduced in 54b501992dd2 ("coredump: warn about unsafe suid_dumpable / core_pattern combo"). Remove the new ones, and use the prior values instead. Signed-off-by: Kees Cook Reported-by: Chen Gang Cc: Alexander Viro Cc: Alan Cox Cc: "Eric W. Biederman" Cc: Doug Ledford Cc: Serge Hallyn Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d8df00e69c14..d1b4ee67d2df 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2095,7 +2095,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMPABLE_SAFE && + if (suid_dumpable == SUID_DUMP_ROOT && core_pattern[0] != '/' && core_pattern[0] != '|') { printk(KERN_WARNING "Unsafe core_pattern used with "\ "suid_dumpable=2. Pipe handler or fully qualified "\ -- cgit v1.2.3 From 6aa9707099c4b25700940eb3d016f16c4434360d Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 27 Feb 2013 17:03:18 -0800 Subject: lockdep: check that no locks held at freeze time We shouldn't try_to_freeze if locks are held. Holding a lock can cause a deadlock if the lock is later acquired in the suspend or hibernate path (e.g. by dpm). Holding a lock can also cause a deadlock in the case of cgroup_freezer if a lock is held inside a frozen cgroup that is later acquired by a process outside that group. [akpm@linux-foundation.org: export debug_check_no_locks_held] Signed-off-by: Mandeep Singh Baines Cc: Ben Chan Cc: Oleg Nesterov Cc: Tejun Heo Cc: Rafael J. Wysocki Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/lockdep.c | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 7dd20408707c..aaf97c122e8a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(tsk); + debug_check_no_locks_held(); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8a0efac4f99d..259db207b5d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4088,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(struct task_struct *curr) +static void print_held_locks_bug(void) { if (!debug_locks_off()) return; @@ -4097,22 +4097,21 @@ static void print_held_locks_bug(struct task_struct *curr) printk("\n"); printk("=====================================\n"); - printk("[ BUG: lock held at task exit time! ]\n"); + printk("[ BUG: %s/%d still has locks held! ]\n", + current->comm, task_pid_nr(current)); print_kernel_ident(); printk("-------------------------------------\n"); - printk("%s/%d is exiting with locks still held!\n", - curr->comm, task_pid_nr(curr)); - lockdep_print_held_locks(curr); - + lockdep_print_held_locks(current); printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(struct task_struct *task) +void debug_check_no_locks_held(void) { - if (unlikely(task->lockdep_depth > 0)) - print_held_locks_bug(task); + if (unlikely(current->lockdep_depth > 0)) + print_held_locks_bug(); } +EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { -- cgit v1.2.3 From 80d26af89a7249aa5475467000322163c60cdd72 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 27 Feb 2013 17:03:20 -0800 Subject: coredump: use a freezable_schedule for the coredump_finish wait Prevents hung_task detector from panicing the machine. This is also needed to prevent this wait from blocking suspend. (It doesnt' currently block suspend but it would once the next patch in this series is applied.) [yongjun_wei@trendmicro.com.cn: kernel/exit.c: remove duplicated include] Signed-off-by: Mandeep Singh Baines Cc: Ben Chan Cc: Oleg Nesterov Cc: Tejun Heo Cc: Rafael J. Wysocki Cc: Ingo Molnar Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index aaf97c122e8a..51e485ca9935 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -31,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -485,7 +485,7 @@ static void exit_mm(struct task_struct * tsk) set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; - schedule(); + freezable_schedule(); } __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); -- cgit v1.2.3 From 6f977e6b2f75fdaccfd9be82bbf72fe5c60e8807 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 27 Feb 2013 17:03:23 -0800 Subject: fork: unshare: remove dead code If new_nsproxy is set we will always call switch_task_namespaces and then set new_nsproxy back to NULL so the reassignment and fall through check are redundant Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8f62b2a0f120..8d932b1c9056 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1861,10 +1861,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) exit_sem(current); } - if (new_nsproxy) { + if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); - new_nsproxy = NULL; - } task_lock(current); @@ -1894,9 +1892,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } } - if (new_nsproxy) - put_nsproxy(new_nsproxy); - bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); -- cgit v1.2.3 From 8d67091ec6ae98ca67f77990ef9e9ec21337f077 Mon Sep 17 00:00:00 2001 From: Atsushi Kumagai Date: Wed, 27 Feb 2013 17:03:25 -0800 Subject: kexec: add the values related to buddy system for filtering free pages. tAdd adds the values related to buddy system to vmcoreinfo data so that makedumpfile (dump filtering command) can filter out all free pages with the new logic. It's faster than the current logic because it can distinguish free page by analyzing page structure at the same time as filtering for other unnecessary pages (e.g. anonymous page). OTOH, the current logic has to trace free_list to distinguish free pages while analyzing page structure to filter out other unnecessary pages. The new logic uses the fact that buddy page is marked by _mapcount == PAGE_BUDDY_MAPCOUNT_VALUE. But, _mapcount shares its memory with other fields for SLAB/SLUB when PG_slab is set, so we need to check if PG_slab is set or not before looking up _mapcount value. And we can get the order of buddy system from private field. To sum it up, the values below are required for this logic. Required values: - OFFSET(page._mapcount) - OFFSET(page.private) - NUMBER(PG_slab) - NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE) Changelog from v1 to v2: 1. remove SIZE(pageflags) The new logic was changed after I sent v1 patch. Accordingly, SIZE(pageflags) has been unnecessary for makedumpfile. What's makedumpfile: makedumpfile creates a small dumpfile by excluding unnecessary pages for the analysis. To distinguish unnecessary pages, makedumpfile gets the vmcoreinfo data which has the minimum debugging information only for dump filtering. Signed-off-by: Atsushi Kumagai Cc: "Eric W. Biederman" Acked-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 2436ffcec91f..7d44a9f94145 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1514,6 +1514,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, _count); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -1536,6 +1538,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); -- cgit v1.2.3 From 8a525f5e7a9f1e15e93c63fe179a5a4463dde4e1 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Wed, 27 Feb 2013 17:03:26 -0800 Subject: kexec: get rid of duplicate check for hole_end hole_end has been checked to make sure it is <= crash_res.end in the while condition check, so the if condition check is duplicate. Signed-off-by: Zhang Yanfei Reviewed-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 7d44a9f94145..ea097ad7cc37 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -503,8 +503,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; - if (hole_end > crashk_res.end) - break; /* See if I overlap any of the segments */ for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; -- cgit v1.2.3 From 0d0bf6674136eb861b37213160b16388cfc1926d Mon Sep 17 00:00:00 2001 From: Mitsuhiro Tanino Date: Wed, 27 Feb 2013 17:03:27 -0800 Subject: kexec: export PG_hwpoison flag into vmcoreinfo This patch exports a PG_hwpoison into vmcoreinfo when CONFIG_MEMORY_FAILURE is defined. "makedumpfile" needs to read information of memory, such as 'mem_section', 'zone', 'pageflags' from vmcore. We introduce a function into "makedumpfile" to exclude hwpoison page from vmcore dump. In order to introduce this function, PG_hwpoison flag have to export into vmcoreinfo. Signed-off-by: Mitsuhiro Tanino Acked-by: "Eric W. Biederman" Cc: Mitsuhiro Tanino Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ea097ad7cc37..2348bd6ef2af 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1537,6 +1537,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); arch_crash_save_vmcoreinfo(); -- cgit v1.2.3 From fe88f2ee33731f0934e8fb26f762b6715e43ff6f Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 27 Feb 2013 17:03:28 -0800 Subject: kexec: prevent double free on image allocation failure If kimage_normal_alloc() fails to initialize an allocated kimage, it will free the image but would still set 'rimage', as a result kexec_load will try to free it again. This would explode as part of the freeing process is accessing internal members which point to uninitialized memory. Signed-off-by: Sasha Levin Cc: "Eric W. Biederman" Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 2348bd6ef2af..855bfbbf4048 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -242,8 +242,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, if (result) goto out; - *rimage = image; - /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be -- cgit v1.2.3 From b92e7e0daed31389ff5ad9f558ef1284c846f6ee Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Wed, 27 Feb 2013 17:03:29 -0800 Subject: kexec: fix memory leak in function kimage_normal_alloc If kimage_normal_alloc() fails to alloc pages for image->swap_page, it should call kimage_free_page_list() to free allocated pages in image->control_pages list before it frees image. Signed-off-by: Zhang Yanfei Cc: "Eric W. Biederman" Cc: Sasha Levin Reviewed-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 855bfbbf4048..6b7455e3c96b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -229,6 +229,8 @@ out: } +static void kimage_free_page_list(struct list_head *list); + static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments) @@ -252,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; + goto out_free; } image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { printk(KERN_ERR "Could not allocate swap buffer\n"); - goto out; + goto out_free; } - result = 0; - out: - if (result == 0) - *rimage = image; - else - kfree(image); + *rimage = image; + return 0; +out_free: + kimage_free_page_list(&image->control_pages); + kfree(image); +out: return result; } -- cgit v1.2.3 From 8c333ac2e4946a673b54f974d75397c947569c29 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Wed, 27 Feb 2013 17:03:31 -0800 Subject: kexec: avoid freeing NULL pointer in image_crash_alloc() Though there is no error if we free a NULL pointer, I think we could avoid this behaviour. Change the code a little in kimage_crash_alloc() could avoid this kind of unnecessary free. Signed-off-by: Zhang Yanfei Cc: "Eric W. Biederman" Cc: Sasha Levin Reviewed-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 6b7455e3c96b..bddd3d7a74b6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -316,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out; + goto out_free; } /* @@ -329,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; + goto out_free; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + *rimage = image; + return 0; +out_free: + kfree(image); +out: return result; } -- cgit v1.2.3 From c897ff68beec97768c31f9de97de93f77ff2d87e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 Feb 2013 17:03:49 -0800 Subject: cgroup: don't use idr_remove_all() idr_destroy() can destroy idr by itself and idr_remove_all() is being deprecated. Drop its usage. Signed-off-by: Tejun Heo Acked-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fb2fb11fbb25..888fba457bb3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4618,10 +4618,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) offline_css(ss, dummytop); ss->active = 0; - if (ss->use_id) { - idr_remove_all(&ss->idr); + if (ss->use_id) idr_destroy(&ss->idr); - } /* deassign the subsys_id */ subsys[ss->subsys_id] = NULL; -- cgit v1.2.3 From d228d9ec2c9a119ce15c6446ebeec05786ab3287 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 Feb 2013 17:04:54 -0800 Subject: cgroup: convert to idr_alloc() Convert to the much saner new idr interface. Signed-off-by: Tejun Heo Acked-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 888fba457bb3..40e0df6c2a2f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5320,7 +5320,7 @@ EXPORT_SYMBOL_GPL(free_css_id); static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) { struct css_id *newid; - int myid, error, size; + int ret, size; BUG_ON(!ss->use_id); @@ -5328,35 +5328,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) newid = kzalloc(size, GFP_KERNEL); if (!newid) return ERR_PTR(-ENOMEM); - /* get id */ - if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { - error = -ENOMEM; - goto err_out; - } + + idr_preload(GFP_KERNEL); spin_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ - error = idr_get_new_above(&ss->idr, newid, 1, &myid); + ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); spin_unlock(&ss->id_lock); + idr_preload_end(); /* Returns error when there are no free spaces for new ID.*/ - if (error) { - error = -ENOSPC; + if (ret < 0) goto err_out; - } - if (myid > CSS_ID_MAX) - goto remove_idr; - newid->id = myid; + newid->id = ret; newid->depth = depth; return newid; -remove_idr: - error = -ENOSPC; - spin_lock(&ss->id_lock); - idr_remove(&ss->idr, myid); - spin_unlock(&ss->id_lock); err_out: kfree(newid); - return ERR_PTR(error); + return ERR_PTR(ret); } -- cgit v1.2.3 From 0e9c3be20d88aa5ed13fde4ece50f45eb96824ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 Feb 2013 17:04:55 -0800 Subject: events: convert to idr_alloc() Convert to the much saner new idr interface. Signed-off-by: Tejun Heo Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/core.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ccc457e36354..5a92cf6beff0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5965,13 +5965,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type) pmu->name = name; if (type < 0) { - int err = idr_pre_get(&pmu_idr, GFP_KERNEL); - if (!err) - goto free_pdc; - - err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); - if (err) { - ret = err; + type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); + if (type < 0) { + ret = type; goto free_pdc; } } -- cgit v1.2.3 From ee94d523bf92d3b8b2de166943d48ac7fd695e10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 Feb 2013 17:04:56 -0800 Subject: posix-timers: convert to idr_alloc() Convert to the much saner new idr interface. Signed-off-by: Tejun Heo Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 7edfe4b901e7..6edbb2c55c22 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, return -EAGAIN; spin_lock_init(&new_timer->it_lock); - retry: - if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { - error = -EAGAIN; - goto out; - } + + idr_preload(GFP_KERNEL); spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); + error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT); spin_unlock_irq(&idr_lock); - if (error) { - if (error == -EAGAIN) - goto retry; + idr_preload_end(); + if (error < 0) { /* * Weird looking, but we return EAGAIN if the IDR is * full (proper POSIX return value for this) */ - error = -EAGAIN; + if (error == -ENOSPC) + error = -EAGAIN; goto out; } + new_timer_id = error; it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; -- cgit v1.2.3 From df1778be1a33edffa51d094eeda87c858ded6560 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Wed, 27 Feb 2013 17:05:21 -0800 Subject: sysctl: fix null checking in bin_dn_node_address() The null check of `strchr() + 1' is broken, which is always non-null, leading to OOB read. Instead, check the result of strchr(). Signed-off-by: Xi Wang Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b25115e8c7f3..ebf72358e86a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1171,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file, /* Convert the decnet address to binary */ result = -EIO; - nodep = strchr(buf, '.') + 1; + nodep = strchr(buf, '.'); if (!nodep) goto out; + ++nodep; area = simple_strtoul(buf, NULL, 10); node = simple_strtoul(nodep, NULL, 10); -- cgit v1.2.3 From cd89f46b52cd2354d3d322ea7eab193b86ba03c6 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Wed, 27 Feb 2013 17:05:22 -0800 Subject: kernel/utsname_sysctl.c: put get/get_uts() into CONFIG_PROC_SYSCTL code block Put get/get_uts() into CONFIG_PROC_SYSCTL code block as they are used only when CONFIG_PROC_SYSCTL is enabled. Signed-off-by: Yuanhan Liu Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/utsname_sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 63da38c2d820..4f69f9a5e221 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -15,6 +15,8 @@ #include #include +#ifdef CONFIG_PROC_SYSCTL + static void *get_uts(ctl_table *table, int write) { char *which = table->data; @@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which) up_write(&uts_sem); } -#ifdef CONFIG_PROC_SYSCTL /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? -- cgit v1.2.3 From bf5315366b1b708a0dd322bb389e970598f18891 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Wed, 27 Feb 2013 17:05:30 -0800 Subject: kernel/utsname.c: fix wrong comment about clone_uts_ns() Fix the wrong comment about the return value of clone_uts_ns() Signed-off-by: Yuanhan Liu Acked-by: Serge Hallyn Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/utsname.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/utsname.c b/kernel/utsname.c index 08b197e8c485..a47fc5de3113 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -30,7 +30,7 @@ static struct uts_namespace *create_uts_ns(void) /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone - * Return NULL on error (failure to kmalloc), new ns otherwise + * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise */ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) -- cgit v1.2.3 From c759b35e6469fe7519e9fe45d5285d49f12cb657 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Wed, 27 Feb 2013 17:05:50 -0800 Subject: kfifo: move kfifo.c from kernel/ to lib/ Move kfifo.c from kernel/ to lib/ Signed-off-by: Stefani Seibold Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/kfifo.c | 609 -------------------------------------------------------- 2 files changed, 1 insertion(+), 610 deletions(-) delete mode 100644 kernel/kfifo.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index eceac38f3c65..cdee648d2ad0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ async.o range.o groups.o lglock.o smpboot.o diff --git a/kernel/kfifo.c b/kernel/kfifo.c deleted file mode 100644 index 59dcf5b81d24..000000000000 --- a/kernel/kfifo.c +++ /dev/null @@ -1,609 +0,0 @@ -/* - * A generic kernel FIFO implementation - * - * Copyright (C) 2009/2010 Stefani Seibold - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * internal helper to calculate the unused elements in a fifo - */ -static inline unsigned int kfifo_unused(struct __kfifo *fifo) -{ - return (fifo->mask + 1) - (fifo->in - fifo->out); -} - -int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, - size_t esize, gfp_t gfp_mask) -{ - /* - * round down to the next power of 2, since our 'let the indices - * wrap' technique works only in this case. - */ - if (!is_power_of_2(size)) - size = rounddown_pow_of_two(size); - - fifo->in = 0; - fifo->out = 0; - fifo->esize = esize; - - if (size < 2) { - fifo->data = NULL; - fifo->mask = 0; - return -EINVAL; - } - - fifo->data = kmalloc(size * esize, gfp_mask); - - if (!fifo->data) { - fifo->mask = 0; - return -ENOMEM; - } - fifo->mask = size - 1; - - return 0; -} -EXPORT_SYMBOL(__kfifo_alloc); - -void __kfifo_free(struct __kfifo *fifo) -{ - kfree(fifo->data); - fifo->in = 0; - fifo->out = 0; - fifo->esize = 0; - fifo->data = NULL; - fifo->mask = 0; -} -EXPORT_SYMBOL(__kfifo_free); - -int __kfifo_init(struct __kfifo *fifo, void *buffer, - unsigned int size, size_t esize) -{ - size /= esize; - - if (!is_power_of_2(size)) - size = rounddown_pow_of_two(size); - - fifo->in = 0; - fifo->out = 0; - fifo->esize = esize; - fifo->data = buffer; - - if (size < 2) { - fifo->mask = 0; - return -EINVAL; - } - fifo->mask = size - 1; - - return 0; -} -EXPORT_SYMBOL(__kfifo_init); - -static void kfifo_copy_in(struct __kfifo *fifo, const void *src, - unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - memcpy(fifo->data + off, src, l); - memcpy(fifo->data, src + l, len - l); - /* - * make sure that the data in the fifo is up to date before - * incrementing the fifo->in index counter - */ - smp_wmb(); -} - -unsigned int __kfifo_in(struct __kfifo *fifo, - const void *buf, unsigned int len) -{ - unsigned int l; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - kfifo_copy_in(fifo, buf, len, fifo->in); - fifo->in += len; - return len; -} -EXPORT_SYMBOL(__kfifo_in); - -static void kfifo_copy_out(struct __kfifo *fifo, void *dst, - unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - memcpy(dst, fifo->data + off, l); - memcpy(dst + l, fifo->data, len - l); - /* - * make sure that the data is copied before - * incrementing the fifo->out index counter - */ - smp_wmb(); -} - -unsigned int __kfifo_out_peek(struct __kfifo *fifo, - void *buf, unsigned int len) -{ - unsigned int l; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - - kfifo_copy_out(fifo, buf, len, fifo->out); - return len; -} -EXPORT_SYMBOL(__kfifo_out_peek); - -unsigned int __kfifo_out(struct __kfifo *fifo, - void *buf, unsigned int len) -{ - len = __kfifo_out_peek(fifo, buf, len); - fifo->out += len; - return len; -} -EXPORT_SYMBOL(__kfifo_out); - -static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, - const void __user *from, unsigned int len, unsigned int off, - unsigned int *copied) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - unsigned long ret; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - ret = copy_from_user(fifo->data + off, from, l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret + len - l, esize); - else { - ret = copy_from_user(fifo->data, from + l, len - l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret, esize); - } - /* - * make sure that the data in the fifo is up to date before - * incrementing the fifo->in index counter - */ - smp_wmb(); - *copied = len - ret; - /* return the number of elements which are not copied */ - return ret; -} - -int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, - unsigned long len, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int esize = fifo->esize; - int err; - - if (esize != 1) - len /= esize; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); - if (unlikely(ret)) { - len -= ret; - err = -EFAULT; - } else - err = 0; - fifo->in += len; - return err; -} -EXPORT_SYMBOL(__kfifo_from_user); - -static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, - unsigned int len, unsigned int off, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - ret = copy_to_user(to, fifo->data + off, l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret + len - l, esize); - else { - ret = copy_to_user(to + l, fifo->data, len - l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret, esize); - } - /* - * make sure that the data is copied before - * incrementing the fifo->out index counter - */ - smp_wmb(); - *copied = len - ret; - /* return the number of elements which are not copied */ - return ret; -} - -int __kfifo_to_user(struct __kfifo *fifo, void __user *to, - unsigned long len, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int esize = fifo->esize; - int err; - - if (esize != 1) - len /= esize; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); - if (unlikely(ret)) { - len -= ret; - err = -EFAULT; - } else - err = 0; - fifo->out += len; - return err; -} -EXPORT_SYMBOL(__kfifo_to_user); - -static int setup_sgl_buf(struct scatterlist *sgl, void *buf, - int nents, unsigned int len) -{ - int n; - unsigned int l; - unsigned int off; - struct page *page; - - if (!nents) - return 0; - - if (!len) - return 0; - - n = 0; - page = virt_to_page(buf); - off = offset_in_page(buf); - l = 0; - - while (len >= l + PAGE_SIZE - off) { - struct page *npage; - - l += PAGE_SIZE; - buf += PAGE_SIZE; - npage = virt_to_page(buf); - if (page_to_phys(page) != page_to_phys(npage) - l) { - sg_set_page(sgl, page, l - off, off); - sgl = sg_next(sgl); - if (++n == nents || sgl == NULL) - return n; - page = npage; - len -= l - off; - l = off = 0; - } - } - sg_set_page(sgl, page, len, off); - return n + 1; -} - -static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, - int nents, unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - unsigned int n; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - n = setup_sgl_buf(sgl, fifo->data + off, nents, l); - n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); - - return n; -} - -unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len) -{ - unsigned int l; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - return setup_sgl(fifo, sgl, nents, len, fifo->in); -} -EXPORT_SYMBOL(__kfifo_dma_in_prepare); - -unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len) -{ - unsigned int l; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - - return setup_sgl(fifo, sgl, nents, len, fifo->out); -} -EXPORT_SYMBOL(__kfifo_dma_out_prepare); - -unsigned int __kfifo_max_r(unsigned int len, size_t recsize) -{ - unsigned int max = (1 << (recsize << 3)) - 1; - - if (len > max) - return max; - return len; -} -EXPORT_SYMBOL(__kfifo_max_r); - -#define __KFIFO_PEEK(data, out, mask) \ - ((data)[(out) & (mask)]) -/* - * __kfifo_peek_n internal helper function for determinate the length of - * the next record in the fifo - */ -static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) -{ - unsigned int l; - unsigned int mask = fifo->mask; - unsigned char *data = fifo->data; - - l = __KFIFO_PEEK(data, fifo->out, mask); - - if (--recsize) - l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; - - return l; -} - -#define __KFIFO_POKE(data, in, mask, val) \ - ( \ - (data)[(in) & (mask)] = (unsigned char)(val) \ - ) - -/* - * __kfifo_poke_n internal helper function for storeing the length of - * the record into the fifo - */ -static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) -{ - unsigned int mask = fifo->mask; - unsigned char *data = fifo->data; - - __KFIFO_POKE(data, fifo->in, mask, n); - - if (recsize > 1) - __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); -} - -unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) -{ - return __kfifo_peek_n(fifo, recsize); -} -EXPORT_SYMBOL(__kfifo_len_r); - -unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, - unsigned int len, size_t recsize) -{ - if (len + recsize > kfifo_unused(fifo)) - return 0; - - __kfifo_poke_n(fifo, len, recsize); - - kfifo_copy_in(fifo, buf, len, fifo->in + recsize); - fifo->in += len + recsize; - return len; -} -EXPORT_SYMBOL(__kfifo_in_r); - -static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, - void *buf, unsigned int len, size_t recsize, unsigned int *n) -{ - *n = __kfifo_peek_n(fifo, recsize); - - if (len > *n) - len = *n; - - kfifo_copy_out(fifo, buf, len, fifo->out + recsize); - return len; -} - -unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, - unsigned int len, size_t recsize) -{ - unsigned int n; - - if (fifo->in == fifo->out) - return 0; - - return kfifo_out_copy_r(fifo, buf, len, recsize, &n); -} -EXPORT_SYMBOL(__kfifo_out_peek_r); - -unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, - unsigned int len, size_t recsize) -{ - unsigned int n; - - if (fifo->in == fifo->out) - return 0; - - len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); - fifo->out += n + recsize; - return len; -} -EXPORT_SYMBOL(__kfifo_out_r); - -void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) -{ - unsigned int n; - - n = __kfifo_peek_n(fifo, recsize); - fifo->out += n + recsize; -} -EXPORT_SYMBOL(__kfifo_skip_r); - -int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, - unsigned long len, unsigned int *copied, size_t recsize) -{ - unsigned long ret; - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > kfifo_unused(fifo)) { - *copied = 0; - return 0; - } - - __kfifo_poke_n(fifo, len, recsize); - - ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); - if (unlikely(ret)) { - *copied = 0; - return -EFAULT; - } - fifo->in += len + recsize; - return 0; -} -EXPORT_SYMBOL(__kfifo_from_user_r); - -int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, - unsigned long len, unsigned int *copied, size_t recsize) -{ - unsigned long ret; - unsigned int n; - - if (fifo->in == fifo->out) { - *copied = 0; - return 0; - } - - n = __kfifo_peek_n(fifo, recsize); - if (len > n) - len = n; - - ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); - if (unlikely(ret)) { - *copied = 0; - return -EFAULT; - } - fifo->out += n + recsize; - return 0; -} -EXPORT_SYMBOL(__kfifo_to_user_r); - -unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) -{ - if (!nents) - BUG(); - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > kfifo_unused(fifo)) - return 0; - - return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); -} -EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); - -void __kfifo_dma_in_finish_r(struct __kfifo *fifo, - unsigned int len, size_t recsize) -{ - len = __kfifo_max_r(len, recsize); - __kfifo_poke_n(fifo, len, recsize); - fifo->in += len + recsize; -} -EXPORT_SYMBOL(__kfifo_dma_in_finish_r); - -unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) -{ - if (!nents) - BUG(); - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > fifo->in - fifo->out) - return 0; - - return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); -} -EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); - -void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) -{ - unsigned int len; - - len = __kfifo_peek_n(fifo, recsize); - fifo->out += len + recsize; -} -EXPORT_SYMBOL(__kfifo_dma_out_finish_r); -- cgit v1.2.3 From 1e142b29e210b5dfb2deeb6ce2210b60af16d2a6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 27 Feb 2013 17:05:58 -0800 Subject: kcmp: make it depend on CHECKPOINT_RESTORE Since kcmp syscall has been implemented (initially on x86 architecture) a number of other archs wire it up as well: xtensa, sparc, sh, s390, mips, microblaze, m68k (not taking into account those who uses for syscall numbers definitions). But the Makefile, which turns kcmp.o generation on still depends on former config-x86. Thus get rid of this limitation and make kcmp.o depend on CHECKPOINT_RESTORE option. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Andrey Vagin Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index cdee648d2ad0..953a20ea0fa3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -25,9 +25,7 @@ endif obj-y += sched/ obj-y += power/ -ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) -obj-$(CONFIG_X86) += kcmp.o -endif +obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o -- cgit v1.2.3 From b67bfe0d42cac56c512dd5da4b1b347a23f4b70a Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 27 Feb 2013 17:06:00 -0800 Subject: hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin Acked-by: Paul E. McKenney Signed-off-by: Sasha Levin Cc: Wu Fengguang Cc: Marcelo Tosatti Cc: Gleb Natapov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 12 +++++------- kernel/events/core.c | 6 ++---- kernel/kprobes.c | 35 ++++++++++++++--------------------- kernel/pid.c | 3 +-- kernel/sched/core.c | 6 ++---- kernel/smpboot.c | 2 +- kernel/trace/ftrace.c | 24 ++++++++++-------------- kernel/trace/trace_output.c | 3 +-- kernel/tracepoint.c | 6 ++---- kernel/user-return-notifier.c | 4 ++-- kernel/user.c | 3 +-- kernel/workqueue.c | 13 +++++-------- 12 files changed, 46 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 40e0df6c2a2f..a32f9432666c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -554,7 +554,6 @@ static struct css_set *find_existing_css_set( { int i; struct cgroupfs_root *root = cgrp->root; - struct hlist_node *node; struct css_set *cg; unsigned long key; @@ -577,7 +576,7 @@ static struct css_set *find_existing_css_set( } key = css_set_hash(template); - hash_for_each_possible(css_set_table, cg, node, hlist, key) { + hash_for_each_possible(css_set_table, cg, hlist, key) { if (!compare_css_sets(cg, oldcg, cgrp, template)) continue; @@ -1611,7 +1610,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroupfs_root *existing_root; const struct cred *cred; int i; - struct hlist_node *node; struct css_set *cg; BUG_ON(sb->s_root != NULL); @@ -1666,7 +1664,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); - hash_for_each(css_set_table, i, node, cg, hlist) + hash_for_each(css_set_table, i, cg, hlist) link_css_set(&tmp_cg_links, cg, root_cgrp); write_unlock(&css_set_lock); @@ -4493,7 +4491,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; int i, ret; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; struct css_set *cg; unsigned long key; @@ -4561,7 +4559,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * this is all done under the css_set_lock. */ write_lock(&css_set_lock); - hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) { + hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { /* skip entries that we already rehashed */ if (cg->subsys[ss->subsys_id]) continue; @@ -4571,7 +4569,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) cg->subsys[ss->subsys_id] = css; /* recompute hash and restore entry */ key = css_set_hash(cg->subsys); - hash_add(css_set_table, node, key); + hash_add(css_set_table, &cg->hlist, key); } write_unlock(&css_set_lock); diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a92cf6beff0..b0cd86501c30 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5126,7 +5126,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, { struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct perf_event *event; - struct hlist_node *node; struct hlist_head *head; rcu_read_lock(); @@ -5134,7 +5133,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, if (!head) goto end; - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_event(event, nr, data, regs); } @@ -5419,7 +5418,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, { struct perf_sample_data data; struct perf_event *event; - struct hlist_node *node; struct perf_raw_record raw = { .size = entry_size, @@ -5429,7 +5427,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_sample_data_init(&data, addr, 0); data.raw = &raw; - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 550294d58a02..e35be53f6613 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void) struct kprobe __kprobes *get_kprobe(void *addr) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { if (p->addr == addr) return p; } @@ -799,7 +798,6 @@ out: static void __kprobes optimize_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; @@ -810,7 +808,7 @@ static void __kprobes optimize_all_kprobes(void) kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) + hlist_for_each_entry_rcu(p, head, hlist) if (!kprobe_disabled(p)) optimize_kprobe(p); } @@ -821,7 +819,6 @@ static void __kprobes optimize_all_kprobes(void) static void __kprobes unoptimize_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; @@ -832,7 +829,7 @@ static void __kprobes unoptimize_all_kprobes(void) kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { if (!kprobe_disabled(p)) unoptimize_kprobe(p, false); } @@ -1148,7 +1145,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; unsigned long hash, flags = 0; if (unlikely(!kprobes_initialized)) @@ -1159,12 +1156,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task == tk) recycle_rp_inst(ri, &empty_rp); } kretprobe_table_unlock(hash, &flags); - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); } @@ -1173,9 +1170,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; - struct hlist_node *pos, *next; + struct hlist_node *next; - hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { + hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { hlist_del(&ri->hlist); kfree(ri); } @@ -1185,14 +1182,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) { unsigned long flags, hash; struct kretprobe_instance *ri; - struct hlist_node *pos, *next; + struct hlist_node *next; struct hlist_head *head; /* No race here */ for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { kretprobe_table_lock(hash, &flags); head = &kretprobe_inst_table[hash]; - hlist_for_each_entry_safe(ri, pos, next, head, hlist) { + hlist_for_each_entry_safe(ri, next, head, hlist) { if (ri->rp == rp) ri->rp = NULL; } @@ -2028,7 +2025,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, { struct module *mod = data; struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; int checkcore = (val == MODULE_STATE_GOING); @@ -2045,7 +2041,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) + hlist_for_each_entry_rcu(p, head, hlist) if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { @@ -2192,7 +2188,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p, *kp; const char *sym = NULL; unsigned int i = *(loff_t *) v; @@ -2201,7 +2196,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) head = &kprobe_table[i]; preempt_disable(); - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); if (kprobe_aggrprobe(p)) { @@ -2236,7 +2231,6 @@ static const struct file_operations debugfs_kprobes_operations = { static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; @@ -2249,7 +2243,7 @@ static void __kprobes arm_all_kprobes(void) /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) + hlist_for_each_entry_rcu(p, head, hlist) if (!kprobe_disabled(p)) arm_kprobe(p); } @@ -2265,7 +2259,6 @@ already_enabled: static void __kprobes disarm_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; @@ -2282,7 +2275,7 @@ static void __kprobes disarm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) disarm_kprobe(p, false); } diff --git a/kernel/pid.c b/kernel/pid.c index f2c6a6825098..047dc6264638 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -350,10 +350,9 @@ void disable_pid_allocation(struct pid_namespace *ns) struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { - struct hlist_node *elem; struct upid *pnr; - hlist_for_each_entry_rcu(pnr, elem, + hlist_for_each_entry_rcu(pnr, &pid_hash[pid_hashfn(nr, ns)], pid_chain) if (pnr->nr == nr && pnr->ns == ns) return container_of(pnr, struct pid, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b5243176aba..12af4270c9c1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1752,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister); static void fire_sched_in_preempt_notifiers(struct task_struct *curr) { struct preempt_notifier *notifier; - struct hlist_node *node; - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) notifier->ops->sched_in(notifier, raw_smp_processor_id()); } @@ -1763,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) { struct preempt_notifier *notifier; - struct hlist_node *node; - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) notifier->ops->sched_out(notifier, next); } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d4abac261779..b9bde5727829 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data) continue; } - BUG_ON(td->cpu != smp_processor_id()); + //BUG_ON(td->cpu != smp_processor_id()); /* Check for state change setup */ switch (td->status) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 98ca94a41819..ab25b88aae56 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -762,7 +762,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec; struct hlist_head *hhd; - struct hlist_node *n; unsigned long key; key = hash_long(ip, ftrace_profile_bits); @@ -771,7 +770,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) if (hlist_empty(hhd)) return NULL; - hlist_for_each_entry_rcu(rec, n, hhd, node) { + hlist_for_each_entry_rcu(rec, hhd, node) { if (rec->ip == ip) return rec; } @@ -1133,7 +1132,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) unsigned long key; struct ftrace_func_entry *entry; struct hlist_head *hhd; - struct hlist_node *n; if (ftrace_hash_empty(hash)) return NULL; @@ -1145,7 +1143,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) hhd = &hash->buckets[key]; - hlist_for_each_entry_rcu(entry, n, hhd, hlist) { + hlist_for_each_entry_rcu(entry, hhd, hlist) { if (entry->ip == ip) return entry; } @@ -1202,7 +1200,7 @@ remove_hash_entry(struct ftrace_hash *hash, static void ftrace_hash_clear(struct ftrace_hash *hash) { struct hlist_head *hhd; - struct hlist_node *tp, *tn; + struct hlist_node *tn; struct ftrace_func_entry *entry; int size = 1 << hash->size_bits; int i; @@ -1212,7 +1210,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) for (i = 0; i < size; i++) { hhd = &hash->buckets[i]; - hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) + hlist_for_each_entry_safe(entry, tn, hhd, hlist) free_hash_entry(hash, entry); } FTRACE_WARN_ON(hash->count); @@ -1275,7 +1273,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) { struct ftrace_func_entry *entry; struct ftrace_hash *new_hash; - struct hlist_node *tp; int size; int ret; int i; @@ -1290,7 +1287,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { - hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { ret = add_hash_entry(new_hash, entry->ip); if (ret < 0) goto free_hash; @@ -1316,7 +1313,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) { struct ftrace_func_entry *entry; - struct hlist_node *tp, *tn; + struct hlist_node *tn; struct hlist_head *hhd; struct ftrace_hash *old_hash; struct ftrace_hash *new_hash; @@ -1362,7 +1359,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, size = 1 << src->size_bits; for (i = 0; i < size; i++) { hhd = &src->buckets[i]; - hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { + hlist_for_each_entry_safe(entry, tn, hhd, hlist) { if (bits > 0) key = hash_long(entry->ip, bits); else @@ -2901,7 +2898,6 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, { struct ftrace_func_probe *entry; struct hlist_head *hhd; - struct hlist_node *n; unsigned long key; key = hash_long(ip, FTRACE_HASH_BITS); @@ -2917,7 +2913,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu(entry, n, hhd, node) { + hlist_for_each_entry_rcu(entry, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } @@ -3068,7 +3064,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { struct ftrace_func_probe *entry; - struct hlist_node *n, *tmp; + struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; int type = MATCH_FULL; int i, len = 0; @@ -3091,7 +3087,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; - hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { + hlist_for_each_entry_safe(entry, tmp, hhd, node) { /* break up if statements for readability */ if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 194d79602dc7..697e88d13907 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -739,12 +739,11 @@ static int task_state_char(unsigned long state) struct trace_event *ftrace_find_event(int type) { struct trace_event *event; - struct hlist_node *n; unsigned key; key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry(event, n, &event_hash[key], node) { + hlist_for_each_entry(event, &event_hash[key], node) { if (event->type == type) return event; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index d96ba22dabfa..0c05a4592047 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -192,12 +192,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, static struct tracepoint_entry *get_tracepoint(const char *name) { struct hlist_head *head; - struct hlist_node *node; struct tracepoint_entry *e; u32 hash = jhash(name, strlen(name), 0); head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { + hlist_for_each_entry(e, head, hlist) { if (!strcmp(name, e->name)) return e; } @@ -211,13 +210,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name) static struct tracepoint_entry *add_tracepoint(const char *name) { struct hlist_head *head; - struct hlist_node *node; struct tracepoint_entry *e; size_t name_len = strlen(name) + 1; u32 hash = jhash(name, name_len-1, 0); head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { + hlist_for_each_entry(e, head, hlist) { if (!strcmp(name, e->name)) { printk(KERN_NOTICE "tracepoint %s busy\n", name); diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 1744bb80f1fb..394f70b17162 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister); void fire_user_return_notifiers(void) { struct user_return_notifier *urn; - struct hlist_node *tmp1, *tmp2; + struct hlist_node *tmp2; struct hlist_head *head; head = &get_cpu_var(return_notifier_list); - hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) + hlist_for_each_entry_safe(urn, tmp2, head, link) urn->on_user_return(urn); put_cpu_var(return_notifier_list); } diff --git a/kernel/user.c b/kernel/user.c index 57ebfd42023c..e81978e8c03b 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -105,9 +105,8 @@ static void uid_hash_remove(struct user_struct *up) static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; - struct hlist_node *h; - hlist_for_each_entry(user, h, hashent, uidhash_node) { + hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { atomic_inc(&user->__count); return user; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f4feacad3812..81f2457811eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -251,8 +251,8 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); for ((pool) = &std_worker_pools(cpu)[0]; \ (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) -#define for_each_busy_worker(worker, i, pos, pool) \ - hash_for_each(pool->busy_hash, i, pos, worker, hentry) +#define for_each_busy_worker(worker, i, pool) \ + hash_for_each(pool->busy_hash, i, worker, hentry) static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, unsigned int sw) @@ -909,9 +909,8 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, struct work_struct *work) { struct worker *worker; - struct hlist_node *tmp; - hash_for_each_possible(pool->busy_hash, worker, tmp, hentry, + hash_for_each_possible(pool->busy_hash, worker, hentry, (unsigned long)work) if (worker->current_work == work && worker->current_func == work->func) @@ -1626,7 +1625,6 @@ static void busy_worker_rebind_fn(struct work_struct *work) static void rebind_workers(struct worker_pool *pool) { struct worker *worker, *n; - struct hlist_node *pos; int i; lockdep_assert_held(&pool->assoc_mutex); @@ -1648,7 +1646,7 @@ static void rebind_workers(struct worker_pool *pool) } /* rebind busy workers */ - for_each_busy_worker(worker, i, pos, pool) { + for_each_busy_worker(worker, i, pool) { struct work_struct *rebind_work = &worker->rebind_work; struct workqueue_struct *wq; @@ -3423,7 +3421,6 @@ static void wq_unbind_fn(struct work_struct *work) int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; - struct hlist_node *pos; int i; for_each_std_worker_pool(pool, cpu) { @@ -3442,7 +3439,7 @@ static void wq_unbind_fn(struct work_struct *work) list_for_each_entry(worker, &pool->idle_list, entry) worker->flags |= WORKER_UNBOUND; - for_each_busy_worker(worker, i, pos, pool) + for_each_busy_worker(worker, i, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; -- cgit v1.2.3 From f7c82d5a3c537a4b4d9d0395db4606bf4d3c7a5f Mon Sep 17 00:00:00 2001 From: John Blackwood Date: Mon, 10 Dec 2012 15:37:22 -0600 Subject: kdb: A fix for kdb command table expansion When locally adding in some additional kdb commands, I stumbled across an issue with the dynamic expansion of the kdb command table. When the number of kdb commands exceeds the size of the statically allocated kdb_base_commands[] array, additional space is allocated in the kdb_register_repeat() routine. The unused portion of the newly allocated array was not being initialized to zero properly and this would result in segfaults when help '?' was executed or when a search for a non-existing command would traverse the command table beyond the end of valid command entries and then attempt to use the non-zeroed area as actual command entries. Signed-off-by: John Blackwood Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 8875254120b6..a52493a66cf2 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2739,7 +2739,7 @@ int kdb_register_repeat(char *cmd, (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); kfree(kdb_commands); } - memset(new + kdb_max_commands, 0, + memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0, kdb_command_extend * sizeof(*new)); kdb_commands = new; kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; -- cgit v1.2.3 From 5f784f798c1a6367d314b3ea5d742a5dcc8dc7ca Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 20 Dec 2012 14:11:27 -0500 Subject: kdb: use ARRAY_SIZE where possible Signed-off-by: Sasha Levin Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index a52493a66cf2..437b74ddca81 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = { }; #undef KDBMSG -static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); +static const int __nkdb_err = ARRAY_SIZE(kdbmsgs); /* @@ -175,7 +175,7 @@ static char *__env[] = { (char *)0, }; -static const int __nenv = (sizeof(__env) / sizeof(char *)); +static const int __nenv = ARRAY_SIZE(__env); struct task_struct *kdb_curr_task(int cpu) { -- cgit v1.2.3 From 00370b8f8dd6e3171b8202f9c5187a5f73e99497 Mon Sep 17 00:00:00 2001 From: Matt Klein Date: Wed, 2 Jan 2013 13:20:49 -0800 Subject: kdb: Setup basic kdb state before invoking commands via kgdb Although invasive kdb commands are not supported via kgdb, some useful non-invasive commands like bt* require basic kdb state to be setup before calling into the kdb code. Factor out some of this code and call it before and after executing kdb commands via kgdb. Signed-off-by: Matt Klein Signed-off-by: Jason Wessel --- kernel/debug/debug_core.h | 2 ++ kernel/debug/gdbstub.c | 3 +++ kernel/debug/kdb/kdb_debugger.c | 24 ++++++++++++++++++------ 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 3494c28a7e7a..2235967e78b0 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -72,6 +72,8 @@ extern int dbg_kdb_mode; #ifdef CONFIG_KGDB_KDB extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); +extern int kdb_common_init_state(struct kgdb_state *ks); +extern int kdb_common_deinit_state(void); #else /* ! CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index ce615e064482..ea5e3edb6915 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -782,7 +782,10 @@ static void gdb_cmd_query(struct kgdb_state *ks) len = len / 2; remcom_out_buffer[len++] = 0; + kdb_common_init_state(ks); kdb_parse(remcom_out_buffer); + kdb_common_deinit_state(); + strcpy(remcom_out_buffer, "OK"); } break; diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index be7b33b73d30..d04a6ce2d3b7 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx); static struct kgdb_state *kdb_ks; +int kdb_common_init_state(struct kgdb_state *ks) +{ + kdb_initial_cpu = atomic_read(&kgdb_active); + kdb_current_task = kgdb_info[ks->cpu].task; + kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; + return 0; +} + +int kdb_common_deinit_state(void) +{ + kdb_initial_cpu = -1; + kdb_current_task = NULL; + kdb_current_regs = NULL; + return 0; +} + int kdb_stub(struct kgdb_state *ks) { int error = 0; @@ -94,9 +110,7 @@ int kdb_stub(struct kgdb_state *ks) } /* Set initial kdb state variables */ KDB_STATE_CLEAR(KGDB_TRANS); - kdb_initial_cpu = atomic_read(&kgdb_active); - kdb_current_task = kgdb_info[ks->cpu].task; - kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; + kdb_common_init_state(ks); /* Remove any breakpoints as needed by kdb and clear single step */ kdb_bp_remove(); KDB_STATE_CLEAR(DOING_SS); @@ -125,9 +139,7 @@ int kdb_stub(struct kgdb_state *ks) * Upon exit from the kdb main loop setup break points and restart * the system based on the requested continue state */ - kdb_initial_cpu = -1; - kdb_current_task = NULL; - kdb_current_regs = NULL; + kdb_common_deinit_state(); KDB_STATE_CLEAR(PAGER); kdbnearsym_cleanup(); if (error == KDB_CMD_KGDB) { -- cgit v1.2.3 From 4eb7a66d9410927fb8fbafad8b8298b627cdd128 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Sun, 3 Feb 2013 09:32:28 -0600 Subject: kdb: Fix overlap in buffers with strcpy Maxime reported that strcpy(s->usage, s->usage+1) has no definitive guarantee that it will work on all archs the same way when you have overlapping memory. The fix is simple for the kdb code because we still have the original string memory in the function scope, so we just have to use that as the argument instead. Reported-by: Maxime Villard Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 437b74ddca81..de22c8cc6c30 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -683,32 +683,44 @@ static int kdb_defcmd(int argc, const char **argv) return KDB_ARGCOUNT; defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), GFP_KDB); - if (!defcmd_set) { - kdb_printf("Could not allocate new defcmd_set entry for %s\n", - argv[1]); - defcmd_set = save_defcmd_set; - return KDB_NOTIMP; - } + if (!defcmd_set) + goto fail_defcmd; memcpy(defcmd_set, save_defcmd_set, defcmd_set_count * sizeof(*defcmd_set)); - kfree(save_defcmd_set); s = defcmd_set + defcmd_set_count; memset(s, 0, sizeof(*s)); s->usable = 1; s->name = kdb_strdup(argv[1], GFP_KDB); + if (!s->name) + goto fail_name; s->usage = kdb_strdup(argv[2], GFP_KDB); + if (!s->usage) + goto fail_usage; s->help = kdb_strdup(argv[3], GFP_KDB); + if (!s->help) + goto fail_help; if (s->usage[0] == '"') { - strcpy(s->usage, s->usage+1); + strcpy(s->usage, argv[2]+1); s->usage[strlen(s->usage)-1] = '\0'; } if (s->help[0] == '"') { - strcpy(s->help, s->help+1); + strcpy(s->help, argv[3]+1); s->help[strlen(s->help)-1] = '\0'; } ++defcmd_set_count; defcmd_in_progress = 1; + kfree(save_defcmd_set); return 0; +fail_help: + kfree(s->usage); +fail_usage: + kfree(s->name); +fail_name: + kfree(defcmd_set); +fail_defcmd: + kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]); + defcmd_set = save_defcmd_set; + return KDB_NOTIMP; } /* -- cgit v1.2.3 From 074604af21c971cf2fcfaa0f6012b4b0c9ca891a Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 4 Feb 2013 09:52:14 -0600 Subject: kdb_main: fix help print The help command was chopping all the usage instructions such that they were not readable. Example: bta [D|R|S|T|C|Z|E|U|I| Backtrace all processes matching state flag per_cpu [] [ [] [] Display per_cpu variables All that is needed is to check the how long the cmd_usage is and jump to the next line when appropriate. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index de22c8cc6c30..25908cf2f5d7 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2442,11 +2442,15 @@ static int kdb_help(int argc, const char **argv) kdb_printf("-----------------------------" "-----------------------------\n"); for_each_kdbcmd(kt, i) { - if (kt->cmd_name) - kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name, - kt->cmd_usage, kt->cmd_help); + char *space = ""; if (KDB_FLAG(CMD_INTERRUPT)) return 0; + if (!kt->cmd_name) + continue; + if (strlen(kt->cmd_usage) > 20) + space = "\n "; + kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, + kt->cmd_usage, space, kt->cmd_help); } return 0; } -- cgit v1.2.3 From 1b2caa2dcb8f18d2be9c5c3c992cb6da03f1a70a Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 4 Feb 2013 10:35:33 -0600 Subject: kdb: Remove the ll command Recently some code inspection was done after fixing a problem with kmalloc used while in the kernel debugger context (which is not legal), and it turned up the fact that kdb ll command will oops the kernel. Given that there have been zero bug reports on the command combined with the fact it will oops the kernel it is clearly not being used. Instead of fixing it, it will be removed. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 65 --------------------------------------------- 1 file changed, 65 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 25908cf2f5d7..cdfc0a7e583e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2362,69 +2362,6 @@ static int kdb_pid(int argc, const char **argv) return 0; } -/* - * kdb_ll - This function implements the 'll' command which follows a - * linked list and executes an arbitrary command for each - * element. - */ -static int kdb_ll(int argc, const char **argv) -{ - int diag = 0; - unsigned long addr; - long offset = 0; - unsigned long va; - unsigned long linkoffset; - int nextarg; - const char *command; - - if (argc != 3) - return KDB_ARGCOUNT; - - nextarg = 1; - diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); - if (diag) - return diag; - - diag = kdbgetularg(argv[2], &linkoffset); - if (diag) - return diag; - - /* - * Using the starting address as - * the first element in the list, and assuming that - * the list ends with a null pointer. - */ - - va = addr; - command = kdb_strdup(argv[3], GFP_KDB); - if (!command) { - kdb_printf("%s: cannot duplicate command\n", __func__); - return 0; - } - /* Recursive use of kdb_parse, do not use argv after this point */ - argv = NULL; - - while (va) { - char buf[80]; - - if (KDB_FLAG(CMD_INTERRUPT)) - goto out; - - sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); - diag = kdb_parse(buf); - if (diag) - goto out; - - addr = va + linkoffset; - if (kdb_getword(&va, addr, sizeof(va))) - goto out; - } - -out: - kfree(command); - return diag; -} - static int kdb_kgdb(int argc, const char **argv) { return KDB_CMD_KGDB; @@ -2866,8 +2803,6 @@ static void __init kdb_inittab(void) kdb_register_repeat("btt", kdb_bt, "", "Backtrace process given its struct task address", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ll", kdb_ll, " ", - "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE); kdb_register_repeat("env", kdb_env, "", "Show environment variables", 0, KDB_REPEAT_NONE); kdb_register_repeat("set", kdb_set, "", -- cgit v1.2.3 From a37372f6c3c03dc7613eaae8bb3458c8068f5fff Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 4 Feb 2013 10:35:33 -0600 Subject: kdb: Prevent kernel oops with kdb_defcmd The kdb_defcmd can only be used to display the available command aliases while using the kernel debug shell. If you try to define a new macro while the kernel debugger is active it will oops. The debug shell macros must use pre-allocated memory set aside at the time kdb_init() is run, and the kdb_defcmd is restricted to only working at the time that the kdb_init sequence is being run, which only occurs if you actually activate the kernel debugger. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index cdfc0a7e583e..496f596aa807 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -681,6 +681,10 @@ static int kdb_defcmd(int argc, const char **argv) } if (argc != 3) return KDB_ARGCOUNT; + if (in_dbg_master()) { + kdb_printf("Command only available during kdb_init()\n"); + return KDB_NOTIMP; + } defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), GFP_KDB); if (!defcmd_set) @@ -2796,8 +2800,8 @@ static void __init kdb_inittab(void) "Stack traceback", 1, KDB_REPEAT_NONE); kdb_register_repeat("btp", kdb_bt, "", "Display stack for process ", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", - "Display stack all processes", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", + "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); kdb_register_repeat("btc", kdb_bt, "", "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); kdb_register_repeat("btt", kdb_bt, "", -- cgit v1.2.3 From 36dfea42cc35509b481377980338cc3b89d79256 Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 12 Feb 2013 11:34:15 +0100 Subject: kdb: Remove unhandled ssb command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'ssb' command can only be handled when we have a disassembler, to check for branches, so remove the 'ssb' command for now. Signed-off-by: Vincent Stehlé Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_bp.c | 20 ++------------------ kernel/debug/kdb/kdb_debugger.c | 1 - kernel/debug/kdb/kdb_main.c | 16 ---------------- kernel/debug/kdb/kdb_private.h | 4 ---- 4 files changed, 2 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 8418c2f8ec5d..70a504601dc3 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv) /* * kdb_ss * - * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) - * commands. + * Process the 'ss' (Single Step) command. * * ss - * ssb * * Parameters: * argc Argument count @@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv) * Outputs: * None. * Returns: - * KDB_CMD_SS[B] for success, a kdb error if failure. + * KDB_CMD_SS for success, a kdb error if failure. * Locking: * None. * Remarks: * * Set the arch specific option to trigger a debug trap after the next * instruction. - * - * For 'ssb', set the trace flag in the debug trap handler - * after printing the current insn and return directly without - * invoking the kdb command processor, until a branch instruction - * is encountered. */ static int kdb_ss(int argc, const char **argv) { - int ssb = 0; - - ssb = (strcmp(argv[0], "ssb") == 0); if (argc != 0) return KDB_ARGCOUNT; /* * Set trace flag and go. */ KDB_STATE_SET(DOING_SS); - if (ssb) { - KDB_STATE_SET(DOING_SSB); - return KDB_CMD_SSB; - } return KDB_CMD_SS; } @@ -561,8 +547,6 @@ void __init kdb_initbptab(void) kdb_register_repeat("ss", kdb_ss, "", "Single Step", 1, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("ssb", kdb_ss, "", - "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS); /* * Architecture dependent initialization. */ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index d04a6ce2d3b7..328d18ef31e4 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -114,7 +114,6 @@ int kdb_stub(struct kgdb_state *ks) /* Remove any breakpoints as needed by kdb and clear single step */ kdb_bp_remove(); KDB_STATE_CLEAR(DOING_SS); - KDB_STATE_CLEAR(DOING_SSB); KDB_STATE_SET(PAGER); /* zero out any offline cpu data */ for_each_present_cpu(i) { diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 496f596aa807..00eb8f7fbf41 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1128,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p) * KDB_CMD_GO User typed 'go'. * KDB_CMD_CPU User switched to another cpu. * KDB_CMD_SS Single step. - * KDB_CMD_SSB Single step until branch. */ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_dbtrap_t db_result) @@ -1167,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", instruction_pointer(regs)); break; - case KDB_DB_SSB: - /* - * In the midst of ssb command. Just return. - */ - KDB_DEBUG_STATE("kdb_local 3", reason); - return KDB_CMD_SSB; /* Continue with SSB command */ - - break; case KDB_DB_SS: break; case KDB_DB_SSBPT: @@ -1297,7 +1288,6 @@ do_full_getstr: if (diag == KDB_CMD_GO || diag == KDB_CMD_CPU || diag == KDB_CMD_SS - || diag == KDB_CMD_SSB || diag == KDB_CMD_KGDB) break; @@ -1384,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, break; } - if (result == KDB_CMD_SSB) { - KDB_STATE_SET(DOING_SS); - KDB_STATE_SET(DOING_SSB); - break; - } - if (result == KDB_CMD_KGDB) { if (!KDB_STATE(DOING_KGDB)) kdb_printf("Entering please attach debugger " diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 392ec6a25844..7afd3c8c41d5 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -19,7 +19,6 @@ #define KDB_CMD_GO (-1001) #define KDB_CMD_CPU (-1002) #define KDB_CMD_SS (-1003) -#define KDB_CMD_SSB (-1004) #define KDB_CMD_KGDB (-1005) /* Internal debug flags */ @@ -125,8 +124,6 @@ extern int kdb_state; * kdb control */ #define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ #define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ -#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command, - * DOING_SS is also set */ #define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint * after one ss, independent of * DOING_SS */ @@ -191,7 +188,6 @@ extern void kdb_bp_remove(void); typedef enum { KDB_DB_BPT, /* Breakpoint */ KDB_DB_SS, /* Single-step trap */ - KDB_DB_SSB, /* Single step to branch */ KDB_DB_SSBPT, /* Single step over breakpoint */ KDB_DB_NOBPT /* Spurious breakpoint */ } kdb_dbtrap_t; -- cgit v1.2.3 From 649508f684751122aa302ab10f0f06cb4a88415b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 30 May 2012 12:11:19 +0100 Subject: trace/ring_buffer: handle 64bit aligned structs Some 32 bit architectures require 64 bit values to be aligned (for example Meta which has 64 bit read/write instructions). These require 8 byte alignment of event data too, so use !CONFIG_HAVE_64BIT_ALIGNED_ACCESS instead of !CONFIG_64BIT || CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS to decide alignment, and align buffer_data_page::data accordingly. Signed-off-by: James Hogan Cc: Frederic Weisbecker Cc: Ingo Molnar Acked-by: Steven Rostedt (previous version subtly different) --- kernel/trace/ring_buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ce8514feedcd..cb4524fd0cb3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -177,7 +177,7 @@ void tracing_off_permanent(void) #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS # define RB_FORCE_8BYTE_ALIGNMENT 0 # define RB_ARCH_ALIGNMENT RB_ALIGNMENT #else @@ -185,6 +185,8 @@ void tracing_off_permanent(void) # define RB_ARCH_ALIGNMENT 8U #endif +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) + /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -333,7 +335,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ - unsigned char data[]; /* data of buffer page */ + unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; /* -- cgit v1.2.3 From db61ec29fd56e089007cb7d9a646ea9ddf518c4d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 Mar 2013 20:39:15 -0500 Subject: fix compat_sys_rt_sigprocmask() Converting bitmask to 32bit granularity is fine, but we'd better _do_ something with the result. Such as "copy it to userland"... Signed-off-by: Al Viro --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2a7ae2963185..8d1b785f0dc9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2653,7 +2653,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, if (oset) { compat_sigset_t old32; sigset_to_compat(&old32, &old_set); - if (copy_to_user(oset, &old_set, sizeof(sigset_t))) + if (copy_to_user(oset, &old32, sizeof(compat_sigset_t))) return -EFAULT; } return 0; -- cgit v1.2.3