From 27c379f7f89a4d558c685b5d89b5ba2fe79ae701 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 10 Sep 2010 13:47:29 +0200 Subject: generic-ipi: Fix deadlock in __smp_call_function_single Just got my 6 way machine to a state where cpu 0 is in an endless loop within __smp_call_function_single. All other cpus are idle. The call trace on cpu 0 looks like this: __smp_call_function_single scheduler_tick update_process_times tick_sched_timer __run_hrtimer hrtimer_interrupt clock_comparator_work do_extint ext_int_handler ----> timer irq cpu_idle __smp_call_function_single() got called from nohz_balancer_kick() (inlined) with the remote cpu being 1, wait being 0 and the per cpu variable remote_sched_softirq_cb (call_single_data) of the current cpu (0). Then it loops forever when it tries to grab the lock of the call_single_data, since it is already locked and enqueued on cpu 0. My theory how this could have happened: for some reason the scheduler decided to call __smp_call_function_single() on it's own cpu, and sends an IPI to itself. The interrupt stays pending since IRQs are disabled. If then the hypervisor schedules the cpu away it might happen that upon rescheduling both the IPI and the timer IRQ are pending. If then interrupts are enabled again it depends which one gets scheduled first. If the timer interrupt gets delivered first we end up with the local deadlock as seen in the calltrace above. Let's make __smp_call_function_single() check if the target cpu is the current cpu and execute the function immediately just like smp_call_function_single does. That should prevent at least the scenario described here. It might also be that the scheduler is not supposed to call __smp_call_function_single with the remote cpu being the current cpu, but that is a different issue. Signed-off-by: Heiko Carstens Acked-by: Peter Zijlstra Acked-by: Jens Axboe Cc: Venkatesh Pallipadi Cc: Suresh Siddha LKML-Reference: <20100910114729.GB2827@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/smp.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 75c970c715d3..ed6aacfcb7ef 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -365,9 +365,10 @@ call: EXPORT_SYMBOL_GPL(smp_call_function_any); /** - * __smp_call_function_single(): Run a function on another CPU + * __smp_call_function_single(): Run a function on a specific CPU * @cpu: The CPU to run on. * @data: Pre-allocated and setup data structure + * @wait: If true, wait until function has completed on specified CPU. * * Like smp_call_function_single(), but allow caller to pass in a * pre-allocated data structure. Useful for embedding @data inside @@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); void __smp_call_function_single(int cpu, struct call_single_data *data, int wait) { - csd_lock(data); + unsigned int this_cpu; + unsigned long flags; + this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can @@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() && !oops_in_progress); - generic_exec_single(cpu, data, wait); + if (cpu == this_cpu) { + local_irq_save(flags); + data->func(data->info); + local_irq_restore(flags); + } else { + csd_lock(data); + generic_exec_single(cpu, data, wait); + } + put_cpu(); } /** -- cgit v1.2.3 From c54fce6eff197d9c57c97afbf6c9722ce434fc8f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Sep 2010 16:51:36 +0200 Subject: workqueue: add documentation Update copyright notice and add Documentation/workqueue.txt. Randy Dunlap, Dave Chinner: misc fixes. Signed-off-by: Tejun Heo Reviewed-By: Florian Mickler Cc: Ingo Molnar Cc: Christoph Lameter Cc: Randy Dunlap Cc: Dave Chinner --- kernel/workqueue.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 727f24e563ae..f77afd939229 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1,19 +1,26 @@ /* - * linux/kernel/workqueue.c + * kernel/workqueue.c - generic async execution with shared worker pool * - * Generic mechanism for defining kernel helper threads for running - * arbitrary tasks in process context. + * Copyright (C) 2002 Ingo Molnar * - * Started by Ingo Molnar, Copyright (C) 2002 + * Derived from the taskqueue/keventd code by: + * David Woodhouse + * Andrew Morton + * Kai Petzke + * Theodore Ts'o * - * Derived from the taskqueue/keventd code by: + * Made to use alloc_percpu by Christoph Lameter. * - * David Woodhouse - * Andrew Morton - * Kai Petzke - * Theodore Ts'o + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo * - * Made to use alloc_percpu by Christoph Lameter. + * This is the generic async execution mechanism. Work items as are + * executed in process context. The worker pool is shared and + * automatically managed. There is one worker pool for each CPU and + * one extra for works which are better served by workers which are + * not bound to any specific CPU. + * + * Please read Documentation/workqueue.txt for details. */ #include -- cgit v1.2.3 From 0bf377bbb0bea6130f35613491887cc622e42a8b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Sep 2010 08:14:52 +0200 Subject: sched: Improve latencies under load by decreasing minimum scheduling granularity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mathieu reported bad latencies with make -j10 kind of kbuild workloads - which is mostly caused by us scheduling with a too coarse granularity. Reduce the minimum granularity some more, to make sure we can meet the latency target. I got the following results (make -j10 kbuild load, average of 3 runs): vanilla: maximum latency: 38278.9 µs average latency: 7730.1 µs patched: maximum latency: 22702.1 µs average latency: 6684.8 µs Mathieu also measured it: | | * wakeup-latency.c (SIGEV_THREAD) with make -j10 | | - Mainline 2.6.35.2 kernel | | maximum latency: 45762.1 µs | average latency: 7348.6 µs | | - With only Peter's smaller min_gran (shown below): | | maximum latency: 29100.6 µs | average latency: 6684.1 µs | Reported-by: Mathieu Desnoyers Reported-by: Linus Torvalds Acked-by: Mathieu Desnoyers Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9b5b4f86b742..a171138a9402 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -54,13 +54,13 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 2000000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -static unsigned int sched_nr_latency = 3; +static unsigned int sched_nr_latency = 8; /* * After fork, child runs first. If set to 0 (default) then -- cgit v1.2.3 From c41d68a513c71e35a14f66d71782d27a79a81ea6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 7 Sep 2010 16:16:18 -0700 Subject: compat: Make compat_alloc_user_space() incorporate the access_ok() compat_alloc_user_space() expects the caller to independently call access_ok() to verify the returned area. A missing call could introduce problems on some architectures. This patch incorporates the access_ok() check into compat_alloc_user_space() and also adds a sanity check on the length. The existing compat_alloc_user_space() implementations are renamed arch_compat_alloc_user_space() and are used as part of the implementation of the new global function. This patch assumes NULL will cause __get_user()/__put_user() to either fail or access userspace on all architectures. This should be followed by checking the return value of compat_access_user_space() for NULL in the callers, at which time the access_ok() in the callers can also be removed. Reported-by: Ben Hawkes Signed-off-by: H. Peter Anvin Acked-by: Benjamin Herrenschmidt Acked-by: Chris Metcalf Acked-by: David S. Miller Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Acked-by: Tony Luck Cc: Andrew Morton Cc: Arnd Bergmann Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Heiko Carstens Cc: Helge Deller Cc: James Bottomley Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: Paul Mackerras Cc: Ralf Baechle Cc: --- kernel/compat.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index e167efce8423..c9e2ec0b34a8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1126,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } + +/* + * Allocate user-space memory for the duration of a single system call, + * in order to marshall parameters inside a compat thunk. + */ +void __user *compat_alloc_user_space(unsigned long len) +{ + void __user *ptr; + + /* If len would occupy more than half of the entire compat space... */ + if (unlikely(len > (((compat_uptr_t)~0) >> 1))) + return NULL; + + ptr = arch_compat_alloc_user_space(len); + + if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) + return NULL; + + return ptr; +} +EXPORT_SYMBOL_GPL(compat_alloc_user_space); -- cgit v1.2.3 From e75e863dd5c7d96b91ebbd241da5328fc38a78cc Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 14 Sep 2010 16:35:14 +0200 Subject: sched: Fix user time incorrectly accounted as system time on 32-bit We have 32-bit variable overflow possibility when multiply in task_times() and thread_group_times() functions. When the overflow happens then the scaled utime value becomes erroneously small and the scaled stime becomes i erroneously big. Reported here: https://bugzilla.redhat.com/show_bug.cgi?id=633037 https://bugzilla.kernel.org/show_bug.cgi?id=16559 Reported-by: Michael Chapman Reported-by: Ciriaco Garcia de Celis Signed-off-by: Stanislaw Gruszka Signed-off-by: Peter Zijlstra Cc: Hidetoshi Seto Cc: # 2.6.32.19+ (partially) and 2.6.33+ LKML-Reference: <20100914143513.GB8415@redhat.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ed09d4f2a69c..dc85ceb90832 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3513,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - u64 temp; + u64 temp = rtime; - temp = (u64)(rtime * utime); + temp *= utime; do_div(temp, total); utime = (cputime_t)temp; } else @@ -3546,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(cputime.sum_exec_runtime); if (total) { - u64 temp; + u64 temp = rtime; - temp = (u64)(rtime * cputime.utime); + temp *= cputime.utime; do_div(temp, total); utime = (cputime_t)temp; } else -- cgit v1.2.3 From 068e35eee9ef98eb4cab55181977e24995d273be Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Mon, 13 Sep 2010 13:01:18 -0700 Subject: hw breakpoints: Fix pid namespace bug Hardware breakpoints can't be registered within pid namespaces because tsk->pid is passed rather than the pid in the current namespace. (See https://bugzilla.kernel.org/show_bug.cgi?id=17281 ) This is a quick fix demonstrating the problem but is not the best method of solving the problem since passing pids internally is not the best way to avoid pid namespace bugs. Subsequent patches will show a better solution. Much thanks to Frederic Weisbecker for doing the bulk of the work finding this bug. Reported-by: Robin Green Signed-off-by: Matt Helsley Signed-off-by: Peter Zijlstra Cc: Prasad Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: 2.6.33-2.6.35 LKML-Reference: Signed-off-by: Ingo Molnar Signed-off-by: Frederic Weisbecker --- kernel/hw_breakpoint.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index d71a987fd2bf..c7c2aed9e2dc 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -433,7 +433,8 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); + return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), + triggered); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); -- cgit v1.2.3 From f6c3f1686e7ec1dd8725a9a3dcb857dfd0c7a5bf Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 13 Sep 2010 11:02:21 -0700 Subject: sched: Fix nohz balance kick There's a situation where the nohz balancer will try to wake itself: cpu-x is idle which is also ilb_cpu got a scheduler tick during idle and the nohz_kick_needed() in trigger_load_balance() checks for rq_x->nr_running which might not be zero (because of someone waking a task on this rq etc) and this leads to the situation of the cpu-x sending a kick to itself. And this can cause a lockup. Avoid this by not marking ourself eligible for kicking. Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra LKML-Reference: <1284400941.2684.19.camel@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a171138a9402..db3f674ca49d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3630,7 +3630,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) if (time_before(now, nohz.next_balance)) return 0; - if (!rq->nr_running) + if (rq->idle_at_tick) return 0; first_pick_cpu = atomic_read(&nohz.first_pick_cpu); -- cgit v1.2.3 From a247c3a97a0216b18a46243eda26081f1928ec37 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Sep 2010 13:05:12 -0700 Subject: rmap: fix walk during fork The below bug in fork led to the rmap walk finding the parent huge-pmd twice instead of just once, because the anon_vma_chain objects of the child vma still point to the vma->vm_mm of the parent. The patch fixes it by making the rmap walk accurate during fork. It's not a big deal normally but it worth being accurate considering the cost is the same. Signed-off-by: Andrea Arcangeli Acked-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b7e9d60a675d..c445f8cc408d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -356,10 +356,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); + tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; - tmp->vm_mm = mm; tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { -- cgit v1.2.3 From 399f1e30ac17b77d383444aff480c7390f5adf2a Mon Sep 17 00:00:00 2001 From: "Ira W. Snyder" Date: Thu, 30 Sep 2010 15:15:27 -0700 Subject: kfifo: fix scatterlist usage The kfifo_dma family of functions use sg_mark_end() on the last element in their scatterlist. This forces use of a fresh scatterlist for each DMA operation, which makes recycling a single scatterlist impossible. Change the behavior of the kfifo_dma functions to match the usage of the dma_map_sg function. This means that users must respect the returned nents value. The sample code is updated to reflect the change. This bug is trivial to cause: call kfifo_dma_in_prepare() such that it prepares a scatterlist with a single entry comprising the whole fifo. This is the case when you map the entirety of a newly created empty fifo. This causes the setup_sgl() function to mark the first scatterlist entry as the end of the chain, no matter what comes after it. Afterwards, add and remove some data from the fifo such that another call to kfifo_dma_in_prepare() will create two scatterlist entries. It returns nents=2. However, due to the previous sg_mark_end() call, sg_is_last() will now return true for the first scatterlist element. This causes the sample code to print a single scatterlist element when it should print two. By removing the call to sg_mark_end(), we make the API as similar as possible to the DMA mapping API. All users are required to respect the returned nents. Signed-off-by: Ira W. Snyder Cc: Stefani Seibold Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 6b5580c57644..01a0700e873f 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -365,8 +365,6 @@ static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, n = setup_sgl_buf(sgl, fifo->data + off, nents, l); n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); - if (n) - sg_mark_end(sgl + n - 1); return n; } -- cgit v1.2.3 From 5336377d6225959624146629ce3fc88ee8ecda3d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 5 Oct 2010 11:29:27 -0700 Subject: modules: Fix module_bug_list list corruption race With all the recent module loading cleanups, we've minimized the code that sits under module_mutex, fixing various deadlocks and making it possible to do most of the module loading in parallel. However, that whole conversion totally missed the rather obscure code that adds a new module to the list for BUG() handling. That code was doubly obscure because (a) the code itself lives in lib/bugs.c (for dubious reasons) and (b) it gets called from the architecture-specific "module_finalize()" rather than from generic code. Calling it from arch-specific code makes no sense what-so-ever to begin with, and is now actively wrong since that code isn't protected by the module loading lock any more. So this commit moves the "module_bug_{finalize,cleanup}()" calls away from the arch-specific code, and into the generic code - and in the process protects it with the module_mutex so that the list operations are now safe. Future fixups: - move the module list handling code into kernel/module.c where it belongs. - get rid of 'module_bug_list' and just use the regular list of modules (called 'modules' - imagine that) that we already create and maintain for other reasons. Reported-and-tested-by: Thomas Gleixner Cc: Rusty Russell Cc: Adrian Bunk Cc: Andrew Morton Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/module.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d0b5f8db11b4..ccd641991842 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1537,6 +1537,7 @@ static int __unlink_module(void *_mod) { struct module *mod = _mod; list_del(&mod->list); + module_bug_cleanup(mod); return 0; } @@ -2625,6 +2626,7 @@ static struct module *load_module(void __user *umod, if (err < 0) goto ddebug; + module_bug_finalize(info.hdr, info.sechdrs, mod); list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2650,6 +2652,8 @@ static struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + module_bug_cleanup(mod); + ddebug: if (!mod->taints) dynamic_debug_remove(info.debug); -- cgit v1.2.3