From 52ade9b3b97fd3bea42842a056fe0786c28d0555 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 16 May 2007 15:28:14 -0700
Subject: Fix ACPI suspend / device suspend ordering problem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In commit e3c7db621bed4afb8e231cb005057f2feb5db557 we fixed the resume
ordering, so that the ACPI low-level resume code was called before the
actual driver resume was called.  However, that broke the nesting logic
of suspend and resume, and we continued to suspend the devices _after_
the ACPI device suspend code was called.

That resulted in us saving PCI state for devices that had already been
changed by ACPI, and in some cases disabled entirely (causing the PCI
save_state to be all-ones).  Which in turn caused the wrong state to be
written back on resume.

This moves the ACPI device suspend to after the device model per-device
suspend() calls.  This fixes the bogus state save.

Thanks to Lukáš Hejtmánek for testing.

Acked-by: Lukas Hejtmanek
Acked-by: Rafael J. Wysocki
Cc: Len Brown
Cc: Pavel Machek
Cc: Andrew Morton
Cc: Greg KH
Signed-off-by: Linus Torvalds
---
 kernel/power/main.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 40d56a31245e..b98b80ccf437 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -97,25 +97,26 @@ static int suspend_prepare(suspend_state_t state)
 		}
 	}
 
-	if (pm_ops->prepare) {
-		if ((error = pm_ops->prepare(state)))
-			goto Thaw;
-	}
-
 	suspend_console();
 	error = device_suspend(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "Some devices failed to suspend\n");
-		goto Resume_devices;
+		goto Resume_console;
 	}
+
+	if (pm_ops->prepare) {
+		if ((error = pm_ops->prepare(state)))
+			goto Resume_devices;
+	}
+
 	error = disable_nonboot_cpus();
 	if (!error)
 		return 0;
 
 	enable_nonboot_cpus();
- Resume_devices:
 	pm_finish(state);
+ Resume_devices:
 	device_resume();
+ Resume_console:
 	resume_console();
 Thaw:
 	thaw_processes();
--
cgit v1.2.3
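
The bug above is an ordering inversion: suspend-side steps must be
unwound in exact reverse order on resume, and pm_ops->prepare() /
pm_finish() were no longer nested around device_suspend() /
device_resume().  A minimal, runnable user-space sketch of that LIFO
invariant (illustration only, step names borrowed from the patch):

    #include <stdio.h>

    /* Suspend steps in the order the patch establishes; each has a
     * resume counterpart that must run in exactly the reverse order,
     * so pm_finish() (ACPI) now precedes device_resume() on the way
     * back, matching commit e3c7db62's resume ordering. */
    static const char *steps[] = {
            "suspend_console", "device_suspend", "pm_ops->prepare",
            "disable_nonboot_cpus",
    };

    int main(void)
    {
            int n = sizeof(steps) / sizeof(steps[0]), i;

            for (i = 0; i < n; i++)
                    printf("suspend: %s\n", steps[i]);
            for (i = n - 1; i >= 0; i--)
                    printf("resume:  undo %s\n", steps[i]);
            return 0;
    }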
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 49530e40ea8b..87069cfc18a1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1427,10 +1427,8 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, { struct sighand_struct *sighand = data; - if (flags & SLAB_CTOR_CONSTRUCTOR) { - spin_lock_init(&sighand->siglock); - INIT_LIST_HEAD(&sighand->signalfd_list); - } + spin_lock_init(&sighand->siglock); + INIT_LIST_HEAD(&sighand->signalfd_list); } void __init proc_caches_init(void) -- cgit v1.2.3 From 71ce92f3fa442069670a52fa6230a6064c4517b3 Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Wed, 16 May 2007 22:11:16 -0700 Subject: make sysctl/kernel/core_pattern and fs/exec.c agree on maximum core filename size Make sysctl/kernel/core_pattern and fs/exec.c agree on maximum core filename size and change it to 128, so that extensive patterns such as '/local/cores/%e-%h-%s-%t-%p.core' won't result in truncated filename generation. Signed-off-by: Dan Aloni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4073353abd4f..30ee462ee79f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -227,7 +227,7 @@ static ctl_table kern_table[] = { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", .data = core_pattern, - .maxlen = 128, + .maxlen = CORENAME_MAX_SIZE, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, -- cgit v1.2.3 From 8d98a690f58e0d6ecf424b7ca84488475cf87bd9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 16 May 2007 22:11:19 -0700 Subject: swsusp: fix sysfs interface The sysfs files /sys/power/disk and /sys/power/state do not work as documented, since they allow the user to write only a few initial characters of the input string to trigger the option (eg. 'echo pl > /sys/power/disk' activates the platform mode of hibernation). Fix it. Special thanks to Peter Moulder for pointing out the problem. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 3 ++- kernel/power/main.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index b5f0543ed84d..f445b9cd60fb 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -416,7 +416,8 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) mutex_lock(&pm_mutex); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { - if (!strncmp(buf, hibernation_modes[i], len)) { + if (len == strlen(hibernation_modes[i]) + && !strncmp(buf, hibernation_modes[i], len)) { mode = i; break; } diff --git a/kernel/power/main.c b/kernel/power/main.c index b98b80ccf437..8812985f3029 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -290,13 +290,13 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) len = p ? p - buf : n; /* First, check if we are requested to hibernate */ - if (!strncmp(buf, "disk", len)) { + if (len == 4 && !strncmp(buf, "disk", len)) { error = hibernate(); return error ? 
From 8d98a690f58e0d6ecf424b7ca84488475cf87bd9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Wed, 16 May 2007 22:11:19 -0700
Subject: swsusp: fix sysfs interface

The sysfs files /sys/power/disk and /sys/power/state do not work as
documented, since they allow the user to write only a few initial
characters of the input string to trigger the option (eg. 'echo pl >
/sys/power/disk' activates the platform mode of hibernation).  Fix it.

Special thanks to Peter Moulder for pointing out the problem.

Signed-off-by: Rafael J. Wysocki
Acked-by: Pavel Machek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/power/disk.c | 3 ++-
 kernel/power/main.c | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index b5f0543ed84d..f445b9cd60fb 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -416,7 +416,8 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
 
 	mutex_lock(&pm_mutex);
 	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
-		if (!strncmp(buf, hibernation_modes[i], len)) {
+		if (len == strlen(hibernation_modes[i])
+		    && !strncmp(buf, hibernation_modes[i], len)) {
 			mode = i;
 			break;
 		}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b98b80ccf437..8812985f3029 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -290,13 +290,13 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
 	len = p ? p - buf : n;
 
 	/* First, check if we are requested to hibernate */
-	if (!strncmp(buf, "disk", len)) {
+	if (len == 4 && !strncmp(buf, "disk", len)) {
 		error = hibernate();
 		return error ? error : n;
 	}
 
 	for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
-		if (*s && !strncmp(buf, *s, len))
+		if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
 			break;
 	}
 	if (state < PM_SUSPEND_MAX && *s)
--
cgit v1.2.3
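
Both hunks above replace prefix matching with exact matching.  The core
of the bug is that strncmp(buf, mode, len) with len taken from the
user's write accepts any prefix of mode; requiring len == strlen(mode)
first closes that hole.  A runnable sketch of the difference:

    #include <stdio.h>
    #include <string.h>

    /* Same shape as the fixed tests in disk_store()/state_store(). */
    static int matches(const char *buf, size_t len, const char *mode)
    {
            return len == strlen(mode) && !strncmp(buf, mode, len);
    }

    int main(void)
    {
            const char *input = "pl";       /* what 'echo pl' delivers */
            size_t len = strlen(input);

            printf("prefix strncmp: %d\n", !strncmp(input, "platform", len));
            printf("exact match:    %d\n", matches(input, len, "platform"));
            return 0;
    }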
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 088419387388..02e490e311eb 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -37,10 +37,18 @@ void refrigerator(void) /* Hmm, should we be allowed to suspend when there are realtime processes around? */ long save; + + task_lock(current); + if (freezing(current)) { + frozen_process(current); + task_unlock(current); + } else { + task_unlock(current); + return; + } save = current->state; pr_debug("%s entered refrigerator\n", current->comm); - frozen_process(current); spin_lock_irq(¤t->sighand->siglock); recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); @@ -152,10 +160,12 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) if (is_user_space(p) == !freeze_user_space) continue; + task_lock(p); if (freezeable(p) && !frozen(p)) printk(KERN_ERR " %s\n", p->comm); cancel_freezing(p); + task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } -- cgit v1.2.3 From ba96a0c88098697a63e80157718b7440414ed24d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 May 2007 13:57:25 -0700 Subject: freezer: fix vfork problem Currently try_to_freeze_tasks() has to wait until all of the vforked processes exit and for this reason every user can make it fail. To fix this problem we can introduce the additional process flag PF_FREEZER_SKIP to be used by tasks that do not want to be counted as freezable by the freezer and want to have TIF_FREEZE set nevertheless. Then, this flag can be set by tasks using sys_vfork() before they call wait_for_completion(&vfork) and cleared after they have woken up. After clearing it, the tasks should call try_to_freeze() as soon as possible. Signed-off-by: Rafael J. Wysocki Cc: Gautham R Shenoy Cc: Oleg Nesterov Cc: Pavel Machek Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ kernel/power/process.c | 29 +++++++++-------------------- 2 files changed, 12 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 87069cfc18a1..73ad5cda1bcd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -1405,7 +1406,9 @@ long do_fork(unsigned long clone_flags, } if (clone_flags & CLONE_VFORK) { + freezer_do_not_count(); wait_for_completion(&vfork); + freezer_count(); if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { current->ptrace_message = nr; ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); diff --git a/kernel/power/process.c b/kernel/power/process.c index 02e490e311eb..eefca8581fa0 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -120,22 +120,12 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) cancel_freezing(p); continue; } - if (is_user_space(p)) { - if (!freeze_user_space) - continue; - - /* Freeze the task unless there is a vfork - * completion pending - */ - if (!p->vfork_done) - freeze_process(p); - } else { - if (freeze_user_space) - continue; - - freeze_process(p); - } - todo++; + if (is_user_space(p) == !freeze_user_space) + continue; + + freeze_process(p); + if (!freezer_should_skip(p)) + todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ @@ -161,7 +151,8 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) continue; task_lock(p); - if (freezeable(p) && !frozen(p)) + if (freezeable(p) && !frozen(p) && + !freezer_should_skip(p)) printk(KERN_ERR " %s\n", p->comm); cancel_freezing(p); @@ -210,9 +201,7 @@ static void thaw_tasks(int thaw_user_space) if (is_user_space(p) == !thaw_user_space) continue; - if (!thaw_process(p)) - printk(KERN_WARNING " Strange, %s not stopped\n", - p->comm ); + thaw_process(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } -- cgit v1.2.3 From 49b12d4f5e274517b8bc032d507abf31cc2f4150 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 May 2007 13:57:26 -0700 Subject: freezer: take kernel_execve into consideration Kernel threads can become userland processes by calling kernel_execve(). In particular, this may happen right after the try_to_freeze_tasks() called with FREEZER_USER_SPACE has returned, so try_to_freeze_tasks() needs to take userspace processes into consideration even if it is called with FREEZER_KERNEL_THREADS. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Gautham R Shenoy Cc: Oleg Nesterov Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index eefca8581fa0..2cea2658e985 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -120,7 +120,7 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) cancel_freezing(p); continue; } - if (is_user_space(p) == !freeze_user_space) + if (freeze_user_space && !is_user_space(p)) continue; freeze_process(p); @@ -147,7 +147,7 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) TIMEOUT / HZ, todo); read_lock(&tasklist_lock); do_each_thread(g, p) { - if (is_user_space(p) == !freeze_user_space) + if (freeze_user_space && !is_user_space(p)) continue; task_lock(p); -- cgit v1.2.3 From a076e4bca2fdabb9e45d86722cc72c0944da5f94 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 May 2007 13:57:27 -0700 Subject: freezer: fix kthread_create vs freezer theoretical race kthread() sleeps in TASK_INTERRUPTIBLE state waiting for the first wakeup. In theory, this wakeup may come from freeze_process()->signal_wake_up(), so the task can disappear even before kthread_create() sets its ->comm. Change kthread() to use TASK_UNINTERRUPTIBLE. [akpm@linux-foundation.org: s/BUG_ON/WARN_ON+recover] Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Signed-off-by: Rafael J. Wysocki Cc: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index df8a8e8f6ca4..bbd51b81a3e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -70,7 +70,7 @@ static int kthread(void *_create) data = create->data; /* OK, tell user we're spawned, wait for stop or wakeup */ - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); complete(&create->started); schedule(); @@ -162,7 +162,10 @@ EXPORT_SYMBOL(kthread_create); */ void kthread_bind(struct task_struct *k, unsigned int cpu) { - BUG_ON(k->state != TASK_INTERRUPTIBLE); + if (k->state != TASK_UNINTERRUPTIBLE) { + WARN_ON(1); + return; + } /* Must have done schedule() in kthread() before we set_task_cpu */ wait_task_inactive(k); set_task_cpu(k, cpu); -- cgit v1.2.3 From 88f18ba028b5939bb6f77bd690e5ad8d01bb24cc Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 23 May 2007 13:57:29 -0700 Subject: freezer: move frozen_process() to kernel/power/process.c Other than refrigerator, no one else calls frozen_process(). So move it from include/linux/freezer.h to kernel/power/process.c. Also, since a task can be marked as frozen by itself, we don't need to pass the (struct task_struct *p) parameter to frozen_process(). Signed-off-by: Gautham R Shenoy Signed-off-by: Rafael J. Wysocki Cc: Oleg Nesterov Cc: Pavel Machek Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 2cea2658e985..d31d638ab4c0 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -31,6 +31,18 @@ static inline int freezeable(struct task_struct * p) return 1; } +/* + * freezing is complete, mark current process as frozen + */ +static inline void frozen_process(void) +{ + if (!unlikely(current->flags & PF_NOFREEZE)) { + current->flags |= PF_FROZEN; + wmb(); + } + clear_tsk_thread_flag(current, TIF_FREEZE); +} + /* Refrigerator is place where frozen processes are stored :-). */ void refrigerator(void) { @@ -40,7 +52,7 @@ void refrigerator(void) task_lock(current); if (freezing(current)) { - frozen_process(current); + frozen_process(); task_unlock(current); } else { task_unlock(current); -- cgit v1.2.3 From 72fcde966252abd17d70e4e216a0411a34523a8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 May 2007 13:57:30 -0700 Subject: Ignore bogus ACPI info for offline CPUs Booting a SMP kernel with maxcpus=1 on a SMP system leads to a hard hang, because ACPI ignores the maxcpus setting and sends timer broadcast info for the offline CPUs. This results in a stuck for ever call to smp_call_function_single() on an offline CPU. Ignore the bogus information and print a kernel error to remind ACPI folks to fix it. Signed-off-by: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/tick-broadcast.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index eadfce2fff74..8001d37071f5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -243,11 +243,18 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu) { int cpu = get_cpu(); - if (cpu == *oncpu) - tick_do_broadcast_on_off(&reason); - else - smp_call_function_single(*oncpu, tick_do_broadcast_on_off, - &reason, 1, 1); + if (!cpu_isset(*oncpu, cpu_online_map)) { + printk(KERN_ERR "tick-braodcast: ignoring broadcast for " + "offline CPU #%d\n", *oncpu); + } else { + + if (cpu == *oncpu) + tick_do_broadcast_on_off(&reason); + else + smp_call_function_single(*oncpu, + tick_do_broadcast_on_off, + &reason, 1, 1); + } put_cpu(); } -- cgit v1.2.3 From 352823160613b65fdaa558be486720a71f75ed86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 May 2007 13:57:37 -0700 Subject: NOHZ: Rate limit the local softirq pending warning output The warning in the NOHZ code, which triggers when a CPU goes idle with softirqs pending can fill up the logs quite quickly. Rate limit the output until we found the root cause of that problem. 
Signed-off-by: Thomas Gleixner
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/time/tick-sched.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3483e6cb9549..3e7ebc4646b7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -167,9 +167,15 @@ void tick_nohz_stop_sched_tick(void)
 		goto end;
 
 	cpu = smp_processor_id();
-	if (unlikely(local_softirq_pending()))
-		printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
-		       local_softirq_pending());
+	if (unlikely(local_softirq_pending())) {
+		static int ratelimit;
+
+		if (ratelimit < 10) {
+			printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+			       local_softirq_pending());
+			ratelimit++;
+		}
+	}
 
 	now = ktime_get();
 	/*
--
cgit v1.2.3
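
The patch uses the smallest possible rate limiter: a function-local
static counter.  The same idiom works anywhere a diagnostic may fire on
every pass through a hot path; a runnable user-space version:

    #include <stdio.h>

    static void warn_softirq_pending(unsigned int pending)
    {
            static int ratelimit;

            if (ratelimit < 10) {           /* cap total output at 10 */
                    fprintf(stderr, "NOHZ: local_softirq_pending %02x\n",
                            pending);
                    ratelimit++;
            }
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 1000; i++)      /* prints only 10 lines */
                    warn_softirq_pending(0x08);
            return 0;
    }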
From 7bb44adef39ad3bda2be40bb34686bc56bd563a5 Mon Sep 17 00:00:00 2001
From: Roland McGrath
Date: Wed, 23 May 2007 13:57:44 -0700
Subject: recalc_sigpending_tsk fixes

Steve Hawkes discovered a problem where recalc_sigpending_tsk was called
in do_sigaction but no signal_wake_up call was made, preventing later
signals from waking up blocked threads with TIF_SIGPENDING already set.

In fact, the few other calls to recalc_sigpending_tsk outside the
signals code are also subject to this problem in other race conditions.

This change makes recalc_sigpending_tsk private to the signals code.  It
changes the outside calls, as well as do_sigaction, to use the new
recalc_sigpending_and_wake instead.

Signed-off-by: Roland McGrath
Cc:
Cc: Oleg Nesterov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/exit.c          |  7 ++-----
 kernel/power/process.c |  2 +-
 kernel/signal.c        | 24 ++++++++++++++++++------
 3 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index c6d14b8008dd..5b888c24e43e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -762,11 +762,8 @@ static void exit_notify(struct task_struct *tsk)
 		read_lock(&tasklist_lock);
 		spin_lock_irq(&tsk->sighand->siglock);
 		for (t = next_thread(tsk); t != tsk; t = next_thread(t))
-			if (!signal_pending(t) && !(t->flags & PF_EXITING)) {
-				recalc_sigpending_tsk(t);
-				if (signal_pending(t))
-					signal_wake_up(t, 0);
-			}
+			if (!signal_pending(t) && !(t->flags & PF_EXITING))
+				recalc_sigpending_and_wake(t);
 		spin_unlock_irq(&tsk->sighand->siglock);
 		read_unlock(&tasklist_lock);
 	}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d31d638ab4c0..e0233d8422b9 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -101,7 +101,7 @@ static void cancel_freezing(struct task_struct *p)
 		pr_debug("  clean up: %s\n", p->comm);
 		do_not_freeze(p);
 		spin_lock_irqsave(&p->sighand->siglock, flags);
-		recalc_sigpending_tsk(p);
+		recalc_sigpending_and_wake(p);
 		spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	}
 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 364fc95bf97c..acdfc0549c6f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -96,15 +96,27 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 
 #define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
 
-fastcall void recalc_sigpending_tsk(struct task_struct *t)
+static int recalc_sigpending_tsk(struct task_struct *t)
 {
 	if (t->signal->group_stop_count > 0 ||
 	    (freezing(t)) ||
 	    PENDING(&t->pending, &t->blocked) ||
-	    PENDING(&t->signal->shared_pending, &t->blocked))
+	    PENDING(&t->signal->shared_pending, &t->blocked)) {
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
-	else
-		clear_tsk_thread_flag(t, TIF_SIGPENDING);
+		return 1;
+	}
+	clear_tsk_thread_flag(t, TIF_SIGPENDING);
+	return 0;
+}
+
+/*
+ * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
+ * This is superfluous when called on current, the wakeup is a harmless no-op.
+ */
+void recalc_sigpending_and_wake(struct task_struct *t)
+{
+	if (recalc_sigpending_tsk(t))
+		signal_wake_up(t, 0);
 }
 
 void recalc_sigpending(void)
@@ -744,7 +756,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 			action->sa.sa_handler = SIG_DFL;
 			if (blocked) {
 				sigdelset(&t->blocked, sig);
-				recalc_sigpending_tsk(t);
+				recalc_sigpending_and_wake(t);
 			}
 		}
 	ret = specific_send_sig_info(sig, info, t);
@@ -2273,7 +2285,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 			rm_from_queue_full(&mask, &t->signal->shared_pending);
 			do {
 				rm_from_queue_full(&mask, &t->pending);
-				recalc_sigpending_and_wake(t);
+				recalc_sigpending_and_wake(t);
 				t = next_thread(t);
 			} while (t != current);
 		}
--
cgit v1.2.3
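
The shape of the fix above is worth noting as an API rule: when setting
a flag on another task is only correct together with a follow-up
wakeup, keep the flag-setting primitive file-local and export one
helper that bundles both steps, so no caller can forget the second
half.  A runnable miniature with hypothetical user-space types:

    #include <stdio.h>

    struct task { int flag; };

    static int recalc_flag(struct task *t)    /* private primitive */
    {
            t->flag = 1;
            return 1;                         /* 1 iff flag now set */
    }

    static void wake_up_task(struct task *t)
    {
            printf("woken (flag=%d)\n", t->flag);
    }

    void recalc_flag_and_wake(struct task *t) /* the only public entry */
    {
            if (recalc_flag(t))
                    wake_up_task(t);          /* never set it silently */
    }

    int main(void)
    {
            struct task t = { 0 };
            recalc_flag_and_wake(&t);
            return 0;
    }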
From 14441960e8c27a64487e0b455b323e784f33583f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 23 May 2007 13:57:57 -0700
Subject: simplify cleanup_workqueue_thread()

cleanup_workqueue_thread() and cwq_should_stop() are overcomplicated.

Convert the code to use kthread_should_stop/kthread_stop as was
suggested by Gautham and Srivatsa.

In particular this patch removes the (unlikely) busy-wait loop from the
exit path, it was a temporary and ugly kludge (if not a bug).

Note: the current code was designed to solve another old problem:
work->func can't share locks with hotplug callbacks.  I think this could
be done, see

	http://marc.info/?l=linux-kernel&m=116905366428633

but this needs some more complications to preserve CPU affinity of
cwq->thread during cpu_up().  A freezer-based hotplug looks more
appealing.

[akpm@linux-foundation.org: make it more tolerant of gcc borkenness]
Signed-off-by: Oleg Nesterov
Cc: Zilvinas Valinskas
Cc: Gautham R Shenoy
Cc: Srivatsa Vaddagiri
Cc: "Rafael J. Wysocki"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/workqueue.c | 84 ++++++++++++++++++++++++------------------------------
 1 file changed, 37 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fb56fedd5c02..3bebf73be976 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -47,7 +47,6 @@ struct cpu_workqueue_struct {
 
 	struct workqueue_struct *wq;
 	struct task_struct *thread;
-	int should_stop;
 
 	int run_depth;		/* Detect run_workqueue() recursion depth */
 } ____cacheline_aligned;
@@ -71,7 +70,13 @@ static LIST_HEAD(workqueues);
 static int singlethread_cpu __read_mostly;
 static cpumask_t cpu_singlethread_map __read_mostly;
 
-/* optimization, we could use cpu_possible_map */
+/*
+ * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD
+ * flushes cwq->worklist. This means that flush_workqueue/wait_on_work
+ * which comes in between can't use for_each_online_cpu(). We could
+ * use cpu_possible_map, the cpumask below is more a documentation
+ * than optimization.
+ */
 static cpumask_t cpu_populated_map __read_mostly;
 
 /* If it's single threaded, it isn't in the list of workqueues. */
@@ -272,24 +277,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 	spin_unlock_irq(&cwq->lock);
 }
 
-/*
- * NOTE: the caller must not touch *cwq if this func returns true
- */
-static int cwq_should_stop(struct cpu_workqueue_struct *cwq)
-{
-	int should_stop = cwq->should_stop;
-
-	if (unlikely(should_stop)) {
-		spin_lock_irq(&cwq->lock);
-		should_stop = cwq->should_stop && list_empty(&cwq->worklist);
-		if (should_stop)
-			cwq->thread = NULL;
-		spin_unlock_irq(&cwq->lock);
-	}
-
-	return should_stop;
-}
-
 static int worker_thread(void *__cwq)
 {
 	struct cpu_workqueue_struct *cwq = __cwq;
@@ -302,14 +289,15 @@ static int worker_thread(void *__cwq)
 	for (;;) {
 		prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
-		if (!freezing(current) && !cwq->should_stop
-		    && list_empty(&cwq->worklist))
+		if (!freezing(current) &&
+		    !kthread_should_stop() &&
+		    list_empty(&cwq->worklist))
 			schedule();
 		finish_wait(&cwq->more_work, &wait);
 
 		try_to_freeze();
 
-		if (cwq_should_stop(cwq))
+		if (kthread_should_stop())
 			break;
 
 		run_workqueue(cwq);
@@ -340,18 +328,21 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 	insert_work(cwq, &barr->work, tail);
 }
 
-static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
+static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
 {
+	int active;
+
 	if (cwq->thread == current) {
 		/*
 		 * Probably keventd trying to flush its own queue. So simply run
 		 * it by hand rather than deadlocking.
 		 */
 		run_workqueue(cwq);
+		active = 1;
 	} else {
 		struct wq_barrier barr;
-		int active = 0;
 
+		active = 0;
 		spin_lock_irq(&cwq->lock);
 		if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
 			insert_wq_barrier(cwq, &barr, 1);
@@ -362,6 +353,8 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
 		if (active)
 			wait_for_completion(&barr.done);
 	}
+
+	return active;
 }
 
@@ -674,7 +667,6 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 		return PTR_ERR(p);
 
 	cwq->thread = p;
-	cwq->should_stop = 0;
 
 	return 0;
 }
@@ -740,29 +732,27 @@ EXPORT_SYMBOL_GPL(__create_workqueue);
 
 static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
-	struct wq_barrier barr;
-	int alive = 0;
-
-	spin_lock_irq(&cwq->lock);
-	if (cwq->thread != NULL) {
-		insert_wq_barrier(cwq, &barr, 1);
-		cwq->should_stop = 1;
-		alive = 1;
-	}
-	spin_unlock_irq(&cwq->lock);
+	/*
+	 * Our caller is either destroy_workqueue() or CPU_DEAD,
+	 * workqueue_mutex protects cwq->thread
+	 */
+	if (cwq->thread == NULL)
+		return;
 
-	if (alive) {
-		wait_for_completion(&barr.done);
+	/*
+	 * If the caller is CPU_DEAD the single flush_cpu_workqueue()
+	 * is not enough, a concurrent flush_workqueue() can insert a
+	 * barrier after us.
+	 * When ->worklist becomes empty it is safe to exit because no
+	 * more work_structs can be queued on this cwq: flush_workqueue
+	 * checks list_empty(), and a "normal" queue_work() can't use
+	 * a dead CPU.
+	 */
+	while (flush_cpu_workqueue(cwq))
+		;
 
-		while (unlikely(cwq->thread != NULL))
-			cpu_relax();
-		/*
-		 * Wait until cwq->thread unlocks cwq->lock,
-		 * it won't touch *cwq after that.
-		 */
-		smp_rmb();
-		spin_unlock_wait(&cwq->lock);
-	}
+	kthread_stop(cwq->thread);
+	cwq->thread = NULL;
 }
--
cgit v1.2.3
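
With the ad-hoc should_stop flag gone, the worker follows the standard
kthread shutdown contract: the thread polls kthread_should_stop() and
the destroyer calls kthread_stop(), which waits for the thread to exit.
A kernel-style sketch of that contract (illustration only, not a
buildable unit):

    /* worker side */
    static int worker_sketch(void *arg)
    {
            while (!kthread_should_stop()) {
                    /* sleep until there is work or a stop request,
                     * then drain the work list */
            }
            return 0;               /* collected by kthread_stop() */
    }

    /* destroyer side, as in the patch: flush until the list is
     * empty, then stop the thread:
     *     while (flush_cpu_workqueue(cwq))
     *             ;
     *     kthread_stop(cwq->thread);
     */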
Signed-off-by: OGAWA Hirofumi
Cc: Pavel Machek
Cc: "Rafael J. Wysocki"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/power/swap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b8b235cc19d1..8b1a1b837145 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -584,7 +584,7 @@ int swsusp_check(void)
 	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
 	if (!IS_ERR(resume_bdev)) {
 		set_blocksize(resume_bdev, PAGE_SIZE);
-		memset(swsusp_header, 0, sizeof(PAGE_SIZE));
+		memset(swsusp_header, 0, PAGE_SIZE);
 		error = bio_read_page(swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
--
cgit v1.2.3
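
The one-liner above deserves a second look because the typo pattern is
easy to reproduce: PAGE_SIZE is an integer constant, so
sizeof(PAGE_SIZE) is the size of its type (usually 4 or 8 bytes), not
4096, and the old memset() zeroed only a few bytes of the header.  A
runnable demonstration:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL        /* a constant, not a type */

    int main(void)
    {
            printf("PAGE_SIZE         = %lu\n", PAGE_SIZE);
            printf("sizeof(PAGE_SIZE) = %zu\n", sizeof(PAGE_SIZE));
            return 0;
    }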
From 98d8256739f2c6c636fa2da359f5949c739ae839 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 23 May 2007 13:58:18 -0700
Subject: Prevent going idle with softirq pending

The NOHZ patch contains a check for softirqs pending when a CPU goes
idle.  The BUG is unrelated to NOHZ, it just was made visible by the
NOHZ patch.  The BUG showed up mainly on P4 / hyperthreading enabled
machines, which led the investigations into the wrong direction in the
first place.  The real cause is in cond_resched_softirq():

cond_resched_softirq() is enabling softirqs without invoking the softirq
daemon when softirqs are pending.  This leads to the warning message in
the NOHZ idle code:

t1 runs softirq disabled code on CPU#0
interrupt happens, softirq is raised, but deferred (softirqs disabled)
t1 calls cond_resched_softirq()
	enables softirqs via _local_bh_enable()
	calls schedule()
t2 runs
t1 is migrated to CPU#1
t2 is done and invokes idle()
NOHZ detects the pending softirq

Fix: change _local_bh_enable() to local_bh_enable() so the softirq
daemon is invoked.

Thanks to Anant Nitya for debugging this with great patience!

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 799d23b4e35d..13cdab3b4c48 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4775,9 +4775,7 @@ int __sched cond_resched_softirq(void)
 	BUG_ON(!in_softirq());
 
 	if (need_resched() && system_state == SYSTEM_RUNNING) {
-		raw_local_irq_disable();
-		_local_bh_enable();
-		raw_local_irq_enable();
+		local_bh_enable();
 		__cond_resched();
 		local_bh_disable();
 		return 1;
--
cgit v1.2.3

From 92ea77275b5345c1300433f28689493dc4163f24 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Thu, 24 May 2007 08:37:14 -0700
Subject: Fix crash with irqpoll due to the IRQF_IRQPOLL flag testing

With irqpoll enabled, trying to test the IRQF_IRQPOLL flag in the
actions would cause a NULL pointer dereference if no action was
installed (for example, the driver might have been unloaded with
interrupts still pending).  So be a bit more careful about testing the
flag by making sure to test for that case.

(The actual _change_ is trivial; the patch is more than a one-liner
because I rewrote the testing to also be much more readable.  Original
(discarded) bugfix by Bernhard Walle.)

Cc: Bernhard Walle
Tested-by: Vivek Goyal
Signed-off-by: Linus Torvalds
---
 kernel/irq/spurious.c | 46 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index b0d81aae472f..bd9e272d55e9 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -135,6 +135,39 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
 	}
 }
 
+static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
+{
+	struct irqaction *action;
+
+	if (!irqfixup)
+		return 0;
+
+	/* We didn't actually handle the IRQ - see if it was misrouted? */
+	if (action_ret == IRQ_NONE)
+		return 1;
+
+	/*
+	 * But for 'irqfixup == 2' we also do it for handled interrupts if
+	 * they are marked as IRQF_IRQPOLL (or for irq zero, which is the
+	 * traditional PC timer interrupt.. Legacy)
+	 */
+	if (irqfixup < 2)
+		return 0;
+
+	if (!irq)
+		return 1;
+
+	/*
+	 * Since we don't get the descriptor lock, "action" can
+	 * change under us.  We don't really care, but we don't
+	 * want to follow a NULL pointer. So tell the compiler to
+	 * just load it once by using a barrier.
+	 */
+	action = desc->action;
+	barrier();
+	return action && (action->flags & IRQF_IRQPOLL);
+}
+
 void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		    irqreturn_t action_ret)
 {
@@ -144,15 +177,10 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		report_bad_irq(irq, desc, action_ret);
 	}
 
-	if (unlikely(irqfixup)) {
-		/* Don't punish working computers */
-		if ((irqfixup == 2 && ((irq == 0) ||
-				(desc->action->flags & IRQF_IRQPOLL))) ||
-				action_ret == IRQ_NONE) {
-			int ok = misrouted_irq(irq);
-			if (action_ret == IRQ_NONE)
-				desc->irqs_unhandled -= ok;
-		}
+	if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
+		int ok = misrouted_irq(irq);
+		if (action_ret == IRQ_NONE)
+			desc->irqs_unhandled -= ok;
 	}
 
 	desc->irq_count++;
--
cgit v1.2.3
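
The barrier() in try_misrouted_irq() is the load-once idiom: take a
single snapshot of a pointer that can change underneath you, and only
ever dereference the snapshot.  A kernel-style sketch of the pattern
(later kernels express the same intent with READ_ONCE()):

    /* Sketch: desc->action may be cleared concurrently. */
    struct irqaction *action = desc->action;

    barrier();      /* forbid the compiler from reloading desc->action */
    if (action && (action->flags & IRQF_IRQPOLL)) {
            /* safe: only the snapshot is dereferenced */
    }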
From eaad084bb0f3a6259e56400cd45d061dbf040600 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 29 May 2007 23:47:39 +0200
Subject: NOHZ: prevent multiplication overflow - stop timer for huge timeouts

get_next_timer_interrupt() returns a delta of (LONG_MAX >> 1) in case
there is no timer pending.  On 64 bit machines this results in a
multiplication overflow in tick_nohz_stop_sched_tick().

Reported by: Dave Miller

Make the return value a constant and limit the return value to a 32 bit
value.

When the max timeout value is returned, we can safely stop the tick
timer device.  The max jiffies delta results in a 12 days timeout for
HZ=1000.

In the long term the get_next_timer_interrupt() code needs to be
reworked to return ktime instead of jiffies, but we have to wait until
the last users of the original NO_IDLE_HZ code are converted.

Signed-off-by: Thomas Gleixner
Acked-off-by: David S. Miller
Signed-off-by: Linus Torvalds
---
 kernel/time/tick-sched.c | 16 +++++++++++++++-
 kernel/timer.c           | 10 +++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e7ebc4646b7..52db9e3c526e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -247,6 +247,21 @@ void tick_nohz_stop_sched_tick(void)
 		if (cpu == tick_do_timer_cpu)
 			tick_do_timer_cpu = -1;
 
+		ts->idle_sleeps++;
+
+		/*
+		 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
+		 * there is no timer pending or at least extremly far
+		 * into the future (12 days for HZ=1000). In this case
+		 * we simply stop the tick timer:
+		 */
+		if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
+			ts->idle_expires.tv64 = KTIME_MAX;
+			if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+				hrtimer_cancel(&ts->sched_timer);
+			goto out;
+		}
+
 		/*
 		 * calculate the expiry time for the next timer wheel
 		 * timer
@@ -254,7 +269,6 @@ void tick_nohz_stop_sched_tick(void)
 		expires = ktime_add_ns(last_update,
 				       tick_period.tv64 * delta_jiffies);
 		ts->idle_expires = expires;
-		ts->idle_sleeps++;
 
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 			hrtimer_start(&ts->sched_timer, expires,
diff --git a/kernel/timer.c b/kernel/timer.c
index 5ec5490f8d85..1a69705c2fb9 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -666,7 +666,7 @@ static inline void __run_timers(tvec_base_t *base)
 static unsigned long __next_timer_interrupt(tvec_base_t *base)
 {
 	unsigned long timer_jiffies = base->timer_jiffies;
-	unsigned long expires = timer_jiffies + (LONG_MAX >> 1);
+	unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
 	int index, slot, array, found = 0;
 	struct timer_list *nte;
 	tvec_t *varray[4];
@@ -752,6 +752,14 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
 	tsdelta = ktime_to_timespec(hr_delta);
 	delta = timespec_to_jiffies(&tsdelta);
+
+	/*
+	 * Limit the delta to the max value, which is checked in
+	 * tick_nohz_stop_sched_tick():
+	 */
+	if (delta > NEXT_TIMER_MAX_DELTA)
+		delta = NEXT_TIMER_MAX_DELTA;
+
 	/*
 	 * Take rounding errors in to account and make sure, that it
 	 * expires in the next tick. Otherwise we go into an endless
--
cgit v1.2.3

From 7a74fc4925067c2102175baef73f9b07ab519b71 Mon Sep 17 00:00:00 2001
From: Kyle McMartin
Date: Wed, 30 May 2007 02:43:16 -0400
Subject: fix possible null ptr deref in kallsyms_lookup

ugh, this function gets called by our unwinder.  recursive backtrace
for the win...  bisection to find this one was "fun."

Signed-off-by: Kyle McMartin
Signed-off-by: Linus Torvalds
---
 kernel/kallsyms.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f1bda23140b2..fed54418626c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -257,7 +257,8 @@ const char *kallsyms_lookup(unsigned long addr,
 		pos = get_symbol_pos(addr, symbolsize, offset);
 		/* Grab name */
 		kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
-		*modname = NULL;
+		if (modname)
+			*modname = NULL;
 		return namebuf;
 	}
--
cgit v1.2.3
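
The kallsyms fix is the usual contract for optional out-parameters: a
function that returns results through pointers must tolerate NULL for
any output the caller does not want, especially when it is reachable
from error paths such as a backtrace.  A runnable miniature with a
hypothetical lookup function:

    #include <stdio.h>

    static const char *lookup(unsigned long addr, const char **modname)
    {
            if (modname)            /* store only if the caller cares */
                    *modname = NULL;
            return "symbol_name";
    }

    int main(void)
    {
            printf("%s\n", lookup(0, NULL));        /* must not crash */
            return 0;
    }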