From 97c7801cd5b0bb6a38c16108a496235474dc6310 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 11 Oct 2006 01:20:45 -0700 Subject: [PATCH] swsusp: Use suspend_console Add suspend_console() and resume_console() to the suspend-to-disk code paths so that the users of netconsole can use swsusp with it. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 8 ++++++++ kernel/power/user.c | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d72234942798..d3a158a60312 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "power.h" @@ -119,8 +120,10 @@ int pm_suspend_disk(void) if (error) return error; + suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) { + resume_console(); printk("Some devices failed to suspend\n"); unprepare_processes(); return error; @@ -133,6 +136,7 @@ int pm_suspend_disk(void) if (in_suspend) { device_resume(); + resume_console(); pr_debug("PM: writing image.\n"); error = swsusp_write(); if (!error) @@ -148,6 +152,7 @@ int pm_suspend_disk(void) swsusp_free(); Done: device_resume(); + resume_console(); unprepare_processes(); return error; } @@ -212,7 +217,9 @@ static int software_resume(void) pr_debug("PM: Preparing devices for restore.\n"); + suspend_console(); if ((error = device_suspend(PMSG_PRETHAW))) { + resume_console(); printk("Some devices failed to suspend\n"); swsusp_free(); goto Thaw; @@ -224,6 +231,7 @@ static int software_resume(void) swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); device_resume(); + resume_console(); Thaw: unprepare_processes(); Done: diff --git a/kernel/power/user.c b/kernel/power/user.c index 93b5dd283dea..d991d3b0e5a4 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -173,12 +174,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); if (!error) { + suspend_console(); error = device_suspend(PMSG_FREEZE); if (!error) { in_suspend = 1; error = swsusp_suspend(); device_resume(); } + resume_console(); } up(&pm_sem); if (!error) @@ -196,11 +199,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, snapshot_free_unused_memory(&data->handle); down(&pm_sem); pm_prepare_console(); + suspend_console(); error = device_suspend(PMSG_PRETHAW); if (!error) { error = swsusp_resume(); device_resume(); } + resume_console(); pm_restore_console(); up(&pm_sem); break; @@ -289,6 +294,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, } /* Put devices to sleep */ + suspend_console(); error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "Failed to suspend some devices.\n"); @@ -299,7 +305,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, /* Wake up devices */ device_resume(); } - + resume_console(); if (pm_ops->finish) pm_ops->finish(PM_SUSPEND_MEM); -- cgit v1.2.3 From 469340236a7c9673df3e6a2425f559517f01464e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 11 Oct 2006 01:21:26 -0700 Subject: [PATCH] mm: kevent threads: use MPOL_DEFAULT Switch the memory policy of the kevent threads to MPOL_DEFAULT while leaving the kzalloc of the workqueue structure on interleave. This means that all code executed in the context of the kevent thread is allocating node local. Signed-off-by: Christoph Lameter Cc: Christoph Lameter Cc: Alok Kataria Cc: Andi Kleen Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cfc737bffe6d..3df9bfc7ff78 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * The per-CPU workqueue (if single thread, we always use the first @@ -245,6 +246,12 @@ static int worker_thread(void *__cwq) sigprocmask(SIG_BLOCK, &blocked, NULL); flush_signals(current); + /* + * We inherited MPOL_INTERLEAVE from the booting kernel. + * Set MPOL_DEFAULT to insure node local allocations. + */ + numa_default_policy(); + /* SIG_IGN makes children autoreap: see do_notify_parent(). */ sa.sa.sa_handler = SIG_IGN; sa.sa.sa_flags = 0; -- cgit v1.2.3 From fa3ba2e81ea23416272a22009bba95954c81969c Mon Sep 17 00:00:00 2001 From: Florin Malita Date: Wed, 11 Oct 2006 01:21:48 -0700 Subject: [PATCH] fix Module taint flags listing in Oops/panic Module taint flags listing in Oops/panic has a couple of issues: * taint_flags() doesn't null-terminate the buffer after printing the flags * per-module taints are only set if the kernel is not already tainted (with that particular flag) => only the first offending module gets its taint info correctly updated Some additional changes: * 'license_gplok' is no longer needed - equivalent to !(taints & TAINT_PROPRIETARY_MODULE) - so we can drop it from struct module * exporting module taint info via /proc/module: pwc 88576 0 - Live 0xf8c32000 evilmod 6784 1 pwc, Live 0xf8bbf000 (PF) Signed-off-by: Florin Malita Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 94 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 7f60e782de1e..67009bd56c52 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -87,6 +87,12 @@ static inline int strong_try_module_get(struct module *mod) return try_module_get(mod); } +static inline void add_taint_module(struct module *mod, unsigned flag) +{ + add_taint(flag); + mod->taints |= flag; +} + /* A thread that wants to hold a reference to a module only while it * is running can call ths to safely exit. * nfsd and lockd use this. @@ -847,12 +853,10 @@ static int check_version(Elf_Shdr *sechdrs, return 0; } /* Not in module's version table. OK, but that taints the kernel. */ - if (!(tainted & TAINT_FORCED_MODULE)) { + if (!(tainted & TAINT_FORCED_MODULE)) printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); - add_taint(TAINT_FORCED_MODULE); - mod->taints |= TAINT_FORCED_MODULE; - } + add_taint_module(mod, TAINT_FORCED_MODULE); return 1; } @@ -910,7 +914,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, unsigned long ret; const unsigned long *crc; - ret = __find_symbol(name, &owner, &crc, mod->license_gplok); + ret = __find_symbol(name, &owner, &crc, + !(mod->taints & TAINT_PROPRIETARY_MODULE)); if (ret) { /* use_module can fail due to OOM, or module unloading */ if (!check_version(sechdrs, versindex, name, mod, crc) || @@ -1335,12 +1340,11 @@ static void set_license(struct module *mod, const char *license) if (!license) license = "unspecified"; - mod->license_gplok = license_is_gpl_compatible(license); - if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { - printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", - mod->name, license); - add_taint(TAINT_PROPRIETARY_MODULE); - mod->taints |= TAINT_PROPRIETARY_MODULE; + if (!license_is_gpl_compatible(license)) { + if (!(tainted & TAINT_PROPRIETARY_MODULE)) + printk(KERN_WARNING "%s: module license '%s' taints" + "kernel.\n", mod->name, license); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); } } @@ -1619,8 +1623,7 @@ static struct module *load_module(void __user *umod, modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { - add_taint(TAINT_FORCED_MODULE); - mod->taints |= TAINT_FORCED_MODULE; + add_taint_module(mod, TAINT_FORCED_MODULE); printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", mod->name); } else if (!same_magic(modmagic, vermagic)) { @@ -1714,14 +1717,10 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); - if (strcmp(mod->name, "ndiswrapper") == 0) { - add_taint(TAINT_PROPRIETARY_MODULE); - mod->taints |= TAINT_PROPRIETARY_MODULE; - } - if (strcmp(mod->name, "driverloader") == 0) { - add_taint(TAINT_PROPRIETARY_MODULE); - mod->taints |= TAINT_PROPRIETARY_MODULE; - } + if (strcmp(mod->name, "ndiswrapper") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + if (strcmp(mod->name, "driverloader") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); @@ -1766,8 +1765,7 @@ static struct module *load_module(void __user *umod, (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); - add_taint(TAINT_FORCED_MODULE); - mod->taints |= TAINT_FORCED_MODULE; + add_taint_module(mod, TAINT_FORCED_MODULE); } #endif @@ -2132,9 +2130,33 @@ static void m_stop(struct seq_file *m, void *p) mutex_unlock(&module_mutex); } +static char *taint_flags(unsigned int taints, char *buf) +{ + int bx = 0; + + if (taints) { + buf[bx++] = '('; + if (taints & TAINT_PROPRIETARY_MODULE) + buf[bx++] = 'P'; + if (taints & TAINT_FORCED_MODULE) + buf[bx++] = 'F'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + buf[bx++] = ')'; + } + buf[bx] = '\0'; + + return buf; +} + static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); + char buf[8]; + seq_printf(m, "%s %lu", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); @@ -2147,6 +2169,10 @@ static int m_show(struct seq_file *m, void *p) /* Used by oprofile and other similar tools. */ seq_printf(m, " 0x%p", mod->module_core); + /* Taints info */ + if (mod->taints) + seq_printf(m, " %s", taint_flags(mod->taints, buf)); + seq_printf(m, "\n"); return 0; } @@ -2235,28 +2261,6 @@ struct module *module_text_address(unsigned long addr) return mod; } -static char *taint_flags(unsigned int taints, char *buf) -{ - *buf = '\0'; - if (taints) { - int bx; - - buf[0] = '('; - bx = 1; - if (taints & TAINT_PROPRIETARY_MODULE) - buf[bx++] = 'P'; - if (taints & TAINT_FORCED_MODULE) - buf[bx++] = 'F'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - buf[bx] = ')'; - } - return buf; -} - /* Don't grab lock, we're oopsing. */ void print_modules(void) { -- cgit v1.2.3 From beed33a816204cb402c69266475b6a60a2433ceb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 11 Oct 2006 01:21:52 -0700 Subject: [PATCH] sched: likely profiling This likely profiling is pretty fun. I found a few possible problems in sched.c. This patch may be not measurable, but when I did measure long ago, nooping (un)likely cost a couple of % on scheduler heavy benchmarks, so it all adds up. Tweak some branch hints: - the 2nd 64 bits in the bitmask is likely to be populated, because it contains the first 28 bits (nearly 3/4) of the normal priorities. (ratio of 669669:691 ~= 1000:1). - it isn't unlikely that context switching switches to another process. it might be very rapidly switching to and from the idle process (ratio of 475815:419004 and 471330:423544). Let the branch predictor decide. - preempt_enable seems to be very often called in a nested preempt_disable or with interrupts disabled (ratio of 3567760:87965 ~= 40:1) Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Cc: Daniel Walker Cc: Hua Zhong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 53608a59d6e3..094b5687eef6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1822,14 +1822,14 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; - if (unlikely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; @@ -3491,7 +3491,7 @@ asmlinkage void __sched preempt_schedule(void) * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ - if (unlikely(ti->preempt_count || irqs_disabled())) + if (likely(ti->preempt_count || irqs_disabled())) return; need_resched: -- cgit v1.2.3 From 01a3ee2b203e511e20f98b85a9172fd32c53e87c Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 11 Oct 2006 01:21:55 -0700 Subject: [PATCH] bitmap: parse input from kernel and user buffers lib/bitmap.c:bitmap_parse() is a library function that received as input a user buffer. This seemed to have originated from the way the write_proc function of the /proc filesystem operates. This has been reworked to not use kmalloc and eliminates a lot of get_user() overhead by performing one access_ok before using __get_user(). We need to test if we are in kernel or user space (is_user) and access the buffer differently. We cannot use __get_user() to access kernel addresses in all cases, for example in architectures with separate address space for kernel and user. This function will be useful for other uses as well; for example, taking input for /sysfs instead of /proc, so it was changed to accept kernel buffers. We have this use for the Linux UWB project, as part as the upcoming bandwidth allocator code. Only a few routines used this function and they were changed too. Signed-off-by: Reinette Chatre Signed-off-by: Inaky Perez-Gonzalez Cc: Paul Jackson Cc: Joe Korty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/proc.c | 2 +- kernel/profile.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 607c7809ad01..9a352667007c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -57,7 +57,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) return -EIO; - err = cpumask_parse(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, new_value); if (err) return err; diff --git a/kernel/profile.c b/kernel/profile.c index 857300a2afec..f940b462eec9 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -399,7 +399,7 @@ static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffe unsigned long full_count = count, err; cpumask_t new_value; - err = cpumask_parse(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, new_value); if (err) return err; -- cgit v1.2.3 From 3dc3099a9b2c346b16383597fadaa79a05a52388 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 11 Oct 2006 01:22:06 -0700 Subject: [PATCH] lockdep: use BUILD_BUG_ON Signed-off-by: Alexey Dobriyan Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 4c0553461000..805a322a5655 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1114,8 +1114,6 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } -extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void); - /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1153,8 +1151,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * (or spin_lock_init()) call - which acts as the key. For static * locks we use the lock object itself as the key. */ - if (sizeof(struct lock_class_key) > sizeof(struct lock_class)) - __error_too_big_MAX_LOCKDEP_SUBCLASSES(); + BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); key = lock->key->subkeys + subclass; -- cgit v1.2.3 From 256a6b41365e17cebe5c2fc91ddff716c9aa055a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 11 Oct 2006 01:22:08 -0700 Subject: [PATCH] lockdep: fix printk recursion logic Bug reported and fixed by Tilman Schmidt : if lockdep is enabled then log messages make it to /var/log/messages belatedly. The reason is a missed wakeup of klogd. Initially there was only a lockdep_internal() protection against lockdep recursion within vprintk() - it grew the 'outer' lockdep_off()/on() protection only later on. But that lockdep_off() made the release_console_sem() within vprintk() always happen under the lockdep_internal() condition, causing the bug. The right solution to remove the inner protection against recursion here - the outer one is enough. Signed-off-by: Ingo Molnar Cc: Tilman Schmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 771f5e861bcd..f7d427ef5038 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -820,15 +820,8 @@ void release_console_sem(void) console_locked = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); - if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) { - /* - * If we printk from within the lock dependency code, - * from within the scheduler code, then do not lock - * up due to self-recursion: - */ - if (!lockdep_internal()) - wake_up_interruptible(&log_wait); - } + if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) + wake_up_interruptible(&log_wait); } EXPORT_SYMBOL(release_console_sem); -- cgit v1.2.3 From 39af114377bf80d2a88e47be33d578d1fa9b0775 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Oct 2006 09:01:46 -0700 Subject: [PATCH] fix epoll_pwait when EPOLL=n Fixes http://bugzilla.kernel.org/show_bug.cgi?id=7371 sys_epoll_pwait needs to be listed as a conditional (weak) entry point for CONFIG_EPOLL=n. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7a3b2e75f040..0e53314b14de 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -49,6 +49,7 @@ cond_syscall(compat_sys_get_robust_list); cond_syscall(sys_epoll_create); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); +cond_syscall(sys_epoll_pwait); cond_syscall(sys_semget); cond_syscall(sys_semop); cond_syscall(sys_semtimedop); -- cgit v1.2.3 From ca268c691de95612981b93e58899c1d73fdb6b47 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Oct 2006 00:09:28 -0700 Subject: [PATCH] lockdep: increase max allowed recursion depth In general, lockdep warnings are intended to be non-fatal, so I have put in various practical limits on internal data structure failure modes. We haven't had a /single/ lockdep-internal crash ever since lockdep went upstream [the unwinder crashes are outside of lockdep], and that's largely due to the good internal checks it does. Recursion within the dependency graph is currently limited to 20, that's probably not enough on some many-CPU boxes - this patch doubles it to 40. I have written the lockdep functions to have as small stackframes as possible, so 40 should be OK too. (The practical recursion limit should be somewhere between 100 and 200 entries. If we hit that then I'll change the algorithm to be iteration-based. Graph walking logic is so easy to program via recursion, so i'd like to keep recursion as long as possible.) Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 805a322a5655..d1a3b2cfe4a9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -575,6 +575,8 @@ static noinline int print_circular_bug_tail(void) return 0; } +#define RECURSION_LIMIT 40 + static int noinline print_infinite_recursion_bug(void) { __raw_spin_unlock(&hash_lock); @@ -595,7 +597,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) debug_atomic_inc(&nr_cyclic_check_recursions); if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); /* * Check this lock's dependency list: @@ -645,7 +647,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); debug_atomic_inc(&nr_find_usage_forwards_checks); @@ -684,7 +686,7 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); debug_atomic_inc(&nr_find_usage_backwards_checks); -- cgit v1.2.3 From 3f4a0b917ce72ef47e438d354c433eb645218e87 Mon Sep 17 00:00:00 2001 From: john stultz Date: Tue, 17 Oct 2006 00:09:32 -0700 Subject: [PATCH] i386 Time: Avoid PIT SMP lockups Avoid possible PIT livelock issues seen on SMP systems (and reported by Andi), by not allowing it as a clocksource on SMP boxes. However, since the PIT may no longer be present, we have to properly handle the cases where SMP systems have TSC skew and fall back from the TSC. Since the PIT isn't there, it would "fall back" to the TSC again. So this changes the jiffies rating to 1, and the TSC-bad rating value to 0. Thus you will get the following behavior priority on i386 systems: tsc [if present & stable] hpet [if present] cyclone [if present] acpi_pm [if present] pit [if UP] jiffies Rather then the current more complicated: tsc [if present & stable] hpet [if present] cyclone [if present] acpi_pm [if present] pit [if cpus < 4] tsc [if present & unstable] jiffies Signed-off-by: John Stultz Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/jiffies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 126bb30c4afe..a99b2a6e6a07 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -57,7 +57,7 @@ static cycle_t jiffies_read(void) struct clocksource clocksource_jiffies = { .name = "jiffies", - .rating = 0, /* lowest rating*/ + .rating = 1, /* lowest valid rating*/ .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ -- cgit v1.2.3 From ac08c26492a0ad4d94a25bd47d5630cd38337069 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Oct 2006 00:09:39 -0700 Subject: [PATCH] posix-cpu-timers: prevent signal delivery starvation The integer divisions in the timer accounting code can round the result down to 0. Adding 0 is without effect and the signal delivery stops. Clamp the division result to minimum 1 to avoid this. Problem was reported by Seongbae Park , who provided also an inital patch. Roland sayeth: I have had some more time to think about the problem, and to reproduce it using Toyo's test case. For the record, if my understanding of the problem is correct, this happens only in one very particular case. First, the expiry time has to be so soon that in cputime_t units (usually 1s/HZ ticks) it's < nthreads so the division yields zero. Second, it only affects each thread that is so new that its CPU time accumulation is zero so now+0 is still zero and ->it_*_expires winds up staying zero. For the VIRT and PROF clocks when cputime_t is tick granularity (or the SCHED clock on configurations where sched_clock's value only advances on clock ticks), this is not hard to arrange with new threads starting up and blocking before they accumulate a whole tick of CPU time. That's what happens in Toyo's test case. Note that in general it is fine for that division to round down to zero, and set each thread's expiry time to its "now" time. The problem only arises with thread's whose "now" value is still zero, so that now+0 winds up 0 and is interpreted as "not set" instead of ">= now". So it would be a sufficient and more precise fix to just use max(ticks, 1) inside the loop when setting each it_*_expires value. But, it does no harm to round the division up to one and always advance every thread's expiry time. If the thread didn't already fire timers for the expiry time of "now", there is no expectation that it will do so before the next tick anyway. So I followed Thomas's patch in lifting the max out of the loops. This patch also covers the reload cases, which are harder to write a test for (and I didn't try). I've tested it with Toyo's case and it fixes that. [toyoa@mvista.com: fix: min_t -> max_t] Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Roland McGrath Cc: Daniel Walker Cc: Toyo Abe Cc: john stultz Cc: Roman Zippel Cc: Seongbae Park Cc: Peter Mattis Cc: Rohit Seth Cc: Martin Bligh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 479b16b44f79..7c3e1e6dfb5b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -87,6 +87,19 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, return a; } +/* + * Divide and limit the result to res >= 1 + * + * This is necessary to prevent signal delivery starvation, when the result of + * the division would be rounded down to 0. + */ +static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) +{ + cputime_t res = cputime_div(time, div); + + return max_t(cputime_t, res, 1); +} + /* * Update expiry time from increment, and increase overrun count, * given the current clock sample. @@ -483,8 +496,8 @@ static void process_timer_rebalance(struct task_struct *p, BUG(); break; case CPUCLOCK_PROF: - left = cputime_div(cputime_sub(expires.cpu, val.cpu), - nthreads); + left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(prof_ticks(t), left); @@ -498,8 +511,8 @@ static void process_timer_rebalance(struct task_struct *p, } while (t != p); break; case CPUCLOCK_VIRT: - left = cputime_div(cputime_sub(expires.cpu, val.cpu), - nthreads); + left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(virt_ticks(t), left); @@ -515,6 +528,7 @@ static void process_timer_rebalance(struct task_struct *p, case CPUCLOCK_SCHED: nsleft = expires.sched - val.sched; do_div(nsleft, nthreads); + nsleft = max_t(unsigned long long, nsleft, 1); do { if (likely(!(t->flags & PF_EXITING))) { ns = t->sched_time + nsleft; @@ -1159,12 +1173,13 @@ static void check_process_timers(struct task_struct *tsk, prof_left = cputime_sub(prof_expires, utime); prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div(prof_left, nthreads); + prof_left = cputime_div_non_zero(prof_left, nthreads); virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div(virt_left, nthreads); + virt_left = cputime_div_non_zero(virt_left, nthreads); if (sched_expires) { sched_left = sched_expires - sched_time; do_div(sched_left, nthreads); + sched_left = max_t(unsigned long long, sched_left, 1); } else { sched_left = 0; } -- cgit v1.2.3 From c60099bfe3a5e6fa22a930627689b3769c52153f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 17 Oct 2006 00:09:59 -0700 Subject: [PATCH] swsusp: fix memory leaks My fancy new swsusp IO code had a big memory leak. It's somewhat invisible because the whole mem_map[] gets overwritten after resume, but it can cause us to get low on memory during the actual suspend process. Cc: "Rafael J. Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 9b2ee5344dee..1a3b0dd2c3fc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -425,7 +425,8 @@ static int submit(int rw, pgoff_t page_off, struct page *page, bio_set_pages_dirty(bio); bio_put(bio); } else { - get_page(page); + if (rw == READ) + get_page(page); /* These pages are freed later */ bio->bi_private = *bio_chain; *bio_chain = bio; submit_bio(rw | (1 << BIO_RW_SYNC), bio); -- cgit v1.2.3 From a460e745e8f9c75a0525ff94154a0629f9d3e05d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Oct 2006 00:10:03 -0700 Subject: [PATCH] genirq: clean up irq-flow-type naming Introduce desc->name and eliminate the handle_irq_name() hack. Add set_irq_chip_and_handler_name() to set the flow type and name at once. Signed-off-by: Ingo Molnar Acked-by: Thomas Gleixner Cc: "Eric W. Biederman" Cc: Matthew Wilcox Cc: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 11c99697acfe..2d0dc3efe813 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -499,7 +499,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) #endif /* CONFIG_SMP */ void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained) +__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) { struct irq_desc *desc; unsigned long flags; @@ -540,6 +541,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained) desc->depth = 1; } desc->handle_irq = handle; + desc->name = name; if (handle != handle_bad_irq && is_chained) { desc->status &= ~IRQ_DISABLED; @@ -555,30 +557,13 @@ set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle) { set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0); + __set_irq_handler(irq, handle, 0, NULL); } -/* - * Get a descriptive string for the highlevel handler, for - * /proc/interrupts output: - */ -const char * -handle_irq_name(irq_flow_handler_t handle) +void +set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle, const char *name) { - if (handle == handle_level_irq) - return "level "; - if (handle == handle_fasteoi_irq) - return "fasteoi"; - if (handle == handle_edge_irq) - return "edge "; - if (handle == handle_simple_irq) - return "simple "; -#ifdef CONFIG_SMP - if (handle == handle_percpu_irq) - return "percpu "; -#endif - if (handle == handle_bad_irq) - return "bad "; - - return NULL; + set_irq_chip(irq, chip); + __set_irq_handler(irq, handle, 0, name); } -- cgit v1.2.3 From bea493a031fe3337f4fe5479e8e865513828ea76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Oct 2006 00:10:33 -0700 Subject: [PATCH] rt-mutex: fixup rt-mutex debug code BUG: warning at kernel/rtmutex-debug.c:125/rt_mutex_debug_task_free() (Not tainted) [] show_trace_log_lvl+0x58/0x16a [] show_trace+0xd/0x10 [] dump_stack+0x19/0x1b [] rt_mutex_debug_task_free+0x35/0x6a [] free_task+0x15/0x24 [] copy_process+0x12bd/0x1324 [] do_fork+0x42/0x113 [] sys_fork+0x19/0x1b [] syscall_call+0x7/0xb In copy_process(), dup_task_struct() also duplicates the ->pi_lock, ->pi_waiters and ->pi_blocked_on members. rt_mutex_debug_task_free() called from free_task() validates these members. However free_task() can be invoked before these members are reset for the new task. Move the initialization code before the first bail that can hit free_task(). Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7dc6140baac6..29ebb30850ed 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -984,6 +984,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!p) goto fork_out; + rt_mutex_init_task(p); + #ifdef CONFIG_TRACE_IRQFLAGS DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); @@ -1088,8 +1090,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->lockdep_recursion = 0; #endif - rt_mutex_init_task(p); - #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -- cgit v1.2.3 From bd5349cfd2b9bbb10a3dbcd3fe5cbaabe0b2ab9e Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 17 Oct 2006 00:10:35 -0700 Subject: [PATCH] Convert cpu hotplug notifiers to use raw_notifier instead of blocking_notifier The use of blocking notifier by _cpu_up and _cpu_down in cpu.c has two problem. 1/ An interaction with the workqueue notifier causes lockdep to spit a warning. 2/ A notifier could conceivable be added or removed while _cpu_up or _cpu_down are in process. As each notifier is called twice (prepare then commit/abort) this could be unhealthy. To fix to we simply take cpu_add_remove_lock while adding or removing notifiers to/from the list. This makes the 'blocking' usage unnecessary as all accesses to cpu_chain are now protected by cpu_add_remove_lock. So change "blocking" to "raw" in all relevant places. This fixes 1. Credit: Andrew Morton Cc: Rusty Russell Cc: Michal Piotrowski (reporter) Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 32c96628463e..27dd3ee47099 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -19,7 +19,7 @@ static DEFINE_MUTEX(cpu_add_remove_lock); static DEFINE_MUTEX(cpu_bitmask_lock); -static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); +static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock @@ -68,7 +68,11 @@ EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); /* Need to know about CPUs going up/down? */ int __cpuinit register_cpu_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_register(&cpu_chain, nb); + int ret; + mutex_lock(&cpu_add_remove_lock); + ret = raw_notifier_chain_register(&cpu_chain, nb); + mutex_unlock(&cpu_add_remove_lock); + return ret; } #ifdef CONFIG_HOTPLUG_CPU @@ -77,7 +81,9 @@ EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { - blocking_notifier_chain_unregister(&cpu_chain, nb); + mutex_lock(&cpu_add_remove_lock); + raw_notifier_chain_unregister(&cpu_chain, nb); + mutex_unlock(&cpu_add_remove_lock); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -126,7 +132,7 @@ static int _cpu_down(unsigned int cpu) if (!cpu_online(cpu)) return -EINVAL; - err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, (void *)(long)cpu); if (err == NOTIFY_BAD) { printk("%s: attempt to take down CPU %u failed\n", @@ -146,7 +152,7 @@ static int _cpu_down(unsigned int cpu) if (IS_ERR(p)) { /* CPU didn't die: tell everyone. Can't complain. */ - if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, + if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, (void *)(long)cpu) == NOTIFY_BAD) BUG(); @@ -169,7 +175,7 @@ static int _cpu_down(unsigned int cpu) put_cpu(); /* CPU is completely dead: tell everyone. Too late to complain. */ - if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD, + if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) == NOTIFY_BAD) BUG(); @@ -206,7 +212,7 @@ static int __devinit _cpu_up(unsigned int cpu) if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; - ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); @@ -223,11 +229,11 @@ static int __devinit _cpu_up(unsigned int cpu) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. */ - blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); + raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); out_notify: if (ret != 0) - blocking_notifier_call_chain(&cpu_chain, + raw_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); return ret; -- cgit v1.2.3 From 91fcdd4e0314145d7d4fa52dba2f9c2da25346fd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 19 Oct 2006 23:28:29 -0700 Subject: [PATCH] readjust comments of task_timeslice for kernel doc Signed-off-by: Borislav Petkov Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 094b5687eef6..3399701c680e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -160,15 +160,6 @@ #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) -/* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - */ - #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) @@ -180,6 +171,15 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } +/* + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + */ + static inline unsigned int task_timeslice(struct task_struct *p) { return static_prio_timeslice(p->static_prio); -- cgit v1.2.3 From d6f8ff7381501887233666b508b9eac70143303d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 19 Oct 2006 23:28:34 -0700 Subject: [PATCH] cad_pid sysctl with PROC_FS=n If CONFIG_PROC_FS=n: kernel/sysctl.c:148: warning: 'proc_do_cad_pid' used but never defined kernel/built-in.o:(.data+0x1228): undefined reference to `proc_do_cad_pid' make: *** [.tmp_vmlinux1] Error 1 Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8020fb273c4f..8bff2c18fb5a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,8 +136,10 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif static ctl_table root_table[]; static struct ctl_table_header root_table_header = @@ -542,6 +544,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_PROC_SYSCTL { .ctl_name = KERN_CADPID, .procname = "cad_pid", @@ -550,6 +553,7 @@ static ctl_table kern_table[] = { .mode = 0600, .proc_handler = &proc_do_cad_pid, }, +#endif { .ctl_name = KERN_MAX_THREADS, .procname = "threads-max", -- cgit v1.2.3 From e05d722e4555cd54677b4c8431d9e81fd047ef7a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 19 Oct 2006 23:29:12 -0700 Subject: [PATCH] kernel/nsproxy.c: use kmemdup() Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 6ebdb82a0ce4..674aceb7335a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -44,11 +44,9 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) { struct nsproxy *ns; - ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL); - if (ns) { - memcpy(ns, orig, sizeof(struct nsproxy)); + ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); + if (ns) atomic_set(&ns->count, 1); - } return ns; } -- cgit v1.2.3 From 690a973f48b6ba2954465992c08e65059c8374fe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH] x86-64: Speed up dwarf2 unwinder This changes the dwarf2 unwinder to do a binary search for CIEs instead of a linear work. The linker is unfortunately not able to build a proper lookup table at link time, instead it creates one at runtime as soon as the bootmem allocator is usable (so you'll continue using the linear lookup for the first [hopefully] few calls). The code should be ready to utilize a build-time created table once a fixed linker becomes available. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- kernel/unwind.c | 318 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 279 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index 2e2368607aab..f7e50d16dbf6 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -11,13 +11,15 @@ #include #include -#include +#include +#include #include #include #include #include extern char __start_unwind[], __end_unwind[]; +extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; #define MAX_STACK_DEPTH 8 @@ -100,6 +102,8 @@ static struct unwind_table { } core, init; const void *address; unsigned long size; + const unsigned char *header; + unsigned long hdrsz; struct unwind_table *link; const char *name; } root_table; @@ -145,6 +149,10 @@ static struct unwind_table *find_table(unsigned long pc) return table; } +static unsigned long read_pointer(const u8 **pLoc, + const void *end, + signed ptrType); + static void init_unwind_table(struct unwind_table *table, const char *name, const void *core_start, @@ -152,14 +160,30 @@ static void init_unwind_table(struct unwind_table *table, const void *init_start, unsigned long init_size, const void *table_start, - unsigned long table_size) + unsigned long table_size, + const u8 *header_start, + unsigned long header_size) { + const u8 *ptr = header_start + 4; + const u8 *end = header_start + header_size; + table->core.pc = (unsigned long)core_start; table->core.range = core_size; table->init.pc = (unsigned long)init_start; table->init.range = init_size; table->address = table_start; table->size = table_size; + /* See if the linker provided table looks valid. */ + if (header_size <= 4 + || header_start[0] != 1 + || (void *)read_pointer(&ptr, end, header_start[1]) != table_start + || header_start[2] == DW_EH_PE_omit + || read_pointer(&ptr, end, header_start[2]) <= 0 + || header_start[3] == DW_EH_PE_omit) + header_start = NULL; + table->hdrsz = header_size; + smp_wmb(); + table->header = header_start; table->link = NULL; table->name = name; } @@ -169,7 +193,143 @@ void __init unwind_init(void) init_unwind_table(&root_table, "kernel", _text, _end - _text, NULL, 0, - __start_unwind, __end_unwind - __start_unwind); + __start_unwind, __end_unwind - __start_unwind, + __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr); +} + +static const u32 bad_cie, not_fde; +static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *); +static signed fde_pointer_type(const u32 *cie); + +struct eh_frame_hdr_table_entry { + unsigned long start, fde; +}; + +static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2) +{ + const struct eh_frame_hdr_table_entry *e1 = p1; + const struct eh_frame_hdr_table_entry *e2 = p2; + + return (e1->start > e2->start) - (e1->start < e2->start); +} + +static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size) +{ + struct eh_frame_hdr_table_entry *e1 = p1; + struct eh_frame_hdr_table_entry *e2 = p2; + unsigned long v; + + v = e1->start; + e1->start = e2->start; + e2->start = v; + v = e1->fde; + e1->fde = e2->fde; + e2->fde = v; +} + +static void __init setup_unwind_table(struct unwind_table *table, + void *(*alloc)(unsigned long)) +{ + const u8 *ptr; + unsigned long tableSize = table->size, hdrSize; + unsigned n; + const u32 *fde; + struct { + u8 version; + u8 eh_frame_ptr_enc; + u8 fde_count_enc; + u8 table_enc; + unsigned long eh_frame_ptr; + unsigned int fde_count; + struct eh_frame_hdr_table_entry table[]; + } __attribute__((__packed__)) *header; + + if (table->header) + return; + + if (table->hdrsz) + printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n", + table->name); + + if (tableSize & (sizeof(*fde) - 1)) + return; + + for (fde = table->address, n = 0; + tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { + const u32 *cie = cie_for_fde(fde, table); + signed ptrType; + + if (cie == ¬_fde) + continue; + if (cie == NULL + || cie == &bad_cie + || (ptrType = fde_pointer_type(cie)) < 0) + return; + ptr = (const u8 *)(fde + 2); + if (!read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType)) + return; + ++n; + } + + if (tableSize || !n) + return; + + hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) + + 2 * n * sizeof(unsigned long); + header = alloc(hdrSize); + if (!header) + return; + header->version = 1; + header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native; + header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4; + header->table_enc = DW_EH_PE_abs|DW_EH_PE_native; + put_unaligned((unsigned long)table->address, &header->eh_frame_ptr); + BUILD_BUG_ON(offsetof(typeof(*header), fde_count) + % __alignof(typeof(header->fde_count))); + header->fde_count = n; + + BUILD_BUG_ON(offsetof(typeof(*header), table) + % __alignof(typeof(*header->table))); + for (fde = table->address, tableSize = table->size, n = 0; + tableSize; + tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { + const u32 *cie = fde + 1 - fde[1] / sizeof(*fde); + + if (!fde[1]) + continue; /* this is a CIE */ + ptr = (const u8 *)(fde + 2); + header->table[n].start = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + fde_pointer_type(cie)); + header->table[n].fde = (unsigned long)fde; + ++n; + } + WARN_ON(n != header->fde_count); + + sort(header->table, + n, + sizeof(*header->table), + cmp_eh_frame_hdr_table_entries, + swap_eh_frame_hdr_table_entries); + + table->hdrsz = hdrSize; + smp_wmb(); + table->header = (const void *)header; +} + +static void *__init balloc(unsigned long sz) +{ + return __alloc_bootmem_nopanic(sz, + sizeof(unsigned int), + __pa(MAX_DMA_ADDRESS)); +} + +void __init unwind_setup(void) +{ + setup_unwind_table(&root_table, balloc); } #ifdef CONFIG_MODULES @@ -193,7 +353,8 @@ void *unwind_add_table(struct module *module, init_unwind_table(table, module->name, module->module_core, module->core_size, module->module_init, module->init_size, - table_start, table_size); + table_start, table_size, + NULL, 0); if (last_table) last_table->link = table; @@ -303,6 +464,26 @@ static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) return value; } +static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table) +{ + const u32 *cie; + + if (!*fde || (*fde & (sizeof(*fde) - 1))) + return &bad_cie; + if (!fde[1]) + return ¬_fde; /* this is a CIE */ + if ((fde[1] & (sizeof(*fde) - 1)) + || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address) + return NULL; /* this is not a valid FDE */ + cie = fde + 1 - fde[1] / sizeof(*fde); + if (*cie <= sizeof(*cie) + 4 + || *cie >= fde[1] - sizeof(*fde) + || (*cie & (sizeof(*cie) - 1)) + || cie[1]) + return NULL; /* this is not a (valid) CIE */ + return cie; +} + static unsigned long read_pointer(const u8 **pLoc, const void *end, signed ptrType) @@ -610,49 +791,108 @@ int unwind(struct unwind_frame_info *frame) unsigned i; signed ptrType = -1; uleb128_t retAddrReg = 0; - struct unwind_table *table; + const struct unwind_table *table; struct unwind_state state; if (UNW_PC(frame) == 0) return -EINVAL; if ((table = find_table(pc)) != NULL && !(table->size & (sizeof(*fde) - 1))) { - unsigned long tableSize = table->size; - - for (fde = table->address; - tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; - tableSize -= sizeof(*fde) + *fde, - fde += 1 + *fde / sizeof(*fde)) { - if (!*fde || (*fde & (sizeof(*fde) - 1))) - break; - if (!fde[1]) - continue; /* this is a CIE */ - if ((fde[1] & (sizeof(*fde) - 1)) - || fde[1] > (unsigned long)(fde + 1) - - (unsigned long)table->address) - continue; /* this is not a valid FDE */ - cie = fde + 1 - fde[1] / sizeof(*fde); - if (*cie <= sizeof(*cie) + 4 - || *cie >= fde[1] - sizeof(*fde) - || (*cie & (sizeof(*cie) - 1)) - || cie[1] - || (ptrType = fde_pointer_type(cie)) < 0) { - cie = NULL; /* this is not a (valid) CIE */ - continue; + const u8 *hdr = table->header; + unsigned long tableSize; + + smp_rmb(); + if (hdr && hdr[0] == 1) { + switch(hdr[3] & DW_EH_PE_FORM) { + case DW_EH_PE_native: tableSize = sizeof(unsigned long); break; + case DW_EH_PE_data2: tableSize = 2; break; + case DW_EH_PE_data4: tableSize = 4; break; + case DW_EH_PE_data8: tableSize = 8; break; + default: tableSize = 0; break; + } + ptr = hdr + 4; + end = hdr + table->hdrsz; + if (tableSize + && read_pointer(&ptr, end, hdr[1]) + == (unsigned long)table->address + && (i = read_pointer(&ptr, end, hdr[2])) > 0 + && i == (end - ptr) / (2 * tableSize) + && !((end - ptr) % (2 * tableSize))) { + do { + const u8 *cur = ptr + (i / 2) * (2 * tableSize); + + startLoc = read_pointer(&cur, + cur + tableSize, + hdr[3]); + if (pc < startLoc) + i /= 2; + else { + ptr = cur - tableSize; + i = (i + 1) / 2; + } + } while (startLoc && i > 1); + if (i == 1 + && (startLoc = read_pointer(&ptr, + ptr + tableSize, + hdr[3])) != 0 + && pc >= startLoc) + fde = (void *)read_pointer(&ptr, + ptr + tableSize, + hdr[3]); } + } + + if (fde != NULL) { + cie = cie_for_fde(fde, table); ptr = (const u8 *)(fde + 2); - startLoc = read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType); - endLoc = startLoc - + read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType & DW_EH_PE_indirect - ? ptrType - : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); - if (pc >= startLoc && pc < endLoc) - break; - cie = NULL; + if(cie != NULL + && cie != &bad_cie + && cie != ¬_fde + && (ptrType = fde_pointer_type(cie)) >= 0 + && read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType) == startLoc) { + if (!(ptrType & DW_EH_PE_indirect)) + ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if(pc >= endLoc) + fde = NULL; + } else + fde = NULL; + } + if (fde == NULL) { + for (fde = table->address, tableSize = table->size; + cie = NULL, tableSize > sizeof(*fde) + && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, + fde += 1 + *fde / sizeof(*fde)) { + cie = cie_for_fde(fde, table); + if (cie == &bad_cie) { + cie = NULL; + break; + } + if (cie == NULL + || cie == ¬_fde + || (ptrType = fde_pointer_type(cie)) < 0) + continue; + ptr = (const u8 *)(fde + 2); + startLoc = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if (!startLoc) + continue; + if (!(ptrType & DW_EH_PE_indirect)) + ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if (pc >= startLoc && pc < endLoc) + break; + } } } if (cie != NULL) { -- cgit v1.2.3 From 1d4d262769cd1894a0306b9c57e72f005cd9e75a Mon Sep 17 00:00:00 2001 From: Jan Dittmer Date: Sat, 28 Oct 2006 10:38:38 -0700 Subject: [PATCH] Add missing space in module.c for taintskernel Obvious fix. Signed-off-by: Jan Dittmer Acked-by: Florin Malita Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 67009bd56c52..5072a943fe35 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1342,7 +1342,7 @@ static void set_license(struct module *mod, const char *license) if (!license_is_gpl_compatible(license)) { if (!(tainted & TAINT_PROPRIETARY_MODULE)) - printk(KERN_WARNING "%s: module license '%s' taints" + printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE); } -- cgit v1.2.3 From 5fa3839a64203b2ab727dcb37da9b2d7079fca28 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Sat, 28 Oct 2006 10:38:46 -0700 Subject: [PATCH] Constify compat_get_bitmap argument This means we can call it when the bitmap we want to fetch is declared const. Signed-off-by: Stephen Rothwell Cc: Christoph Lameter Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 75573e5d27b0..d4898aad6cfa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -678,7 +678,7 @@ int get_compat_sigevent(struct sigevent *event, ? -EFAULT : 0; } -long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, +long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size) { int i, j; -- cgit v1.2.3 From fca178c0c6e8d52a1875be36b070f30884ebfae9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:49 -0700 Subject: [PATCH] fill_tgid: fix task_struct leak and possible oops 1. fill_tgid() forgets to do put_task_struct(first). 2. release_task(first) can happen after fill_tgid() drops tasklist_lock, it is unsafe to dereference first->signal. This is a temporary fix, imho the locking should be reworked. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 5d6a8c54ee85..9aeee511a463 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -237,14 +237,17 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, } else get_task_struct(first); - /* Start with stats from dead tasks */ - spin_lock_irqsave(&first->signal->stats_lock, flags); - if (first->signal->stats) - memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->signal->stats_lock, flags); tsk = first; read_lock(&tasklist_lock); + /* Start with stats from dead tasks */ + if (first->signal) { + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + spin_unlock_irqrestore(&first->signal->stats_lock, flags); + } + do { if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) continue; @@ -264,7 +267,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, * Accounting subsytems can also add calls here to modify * fields of taskstats. */ - + put_task_struct(first); return 0; } -- cgit v1.2.3 From 05d5bcd60e8202e5c7b28cf61186043a4d612623 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:50 -0700 Subject: [PATCH] bacct_add_tsk: fix unsafe and wrong parent/group_leader dereference 1. ts = timespec_sub(uptime, current->group_leader->start_time); It is possible that current != tsk. Probably it was supposed to be 'tsk->group_leader->start_time. But why we are reading group_leader's start_time ? This accounting is per thread, not per procees, I changed this to 'tsk->start_time. Please corect me. 2. stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; tsk->parent never == NULL, and it is unsafe to dereference it. Both the task and it's parent may exit after the caller unlocks tasklist_lock, the memory could be unmapped (DEBUG_SLAB). (And we should use ->real_parent->tgid in fact). Q: I don't understand the 'if (thread_group_leader(tsk))' check. Why it is needed ? Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Acked-by: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index db443221ba5b..65a5036a3d95 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -36,7 +36,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) /* calculate task elapsed time in timespec */ do_posix_clock_monotonic_gettime(&uptime); - ts = timespec_sub(uptime, current->group_leader->start_time); + ts = timespec_sub(uptime, tsk->start_time); /* rebase elapsed time to usec */ ac_etime = timespec_to_ns(&ts); do_div(ac_etime, NSEC_PER_USEC); @@ -58,7 +58,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_uid = tsk->uid; stats->ac_gid = tsk->gid; stats->ac_pid = tsk->pid; - stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; + rcu_read_lock(); + stats->ac_ppid = pid_alive(tsk) ? + rcu_dereference(tsk->real_parent)->tgid : 0; + rcu_read_unlock(); stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; stats->ac_minflt = tsk->min_flt; -- cgit v1.2.3 From 093a8e8aecd77b2799934996a55a6838e1e2b8f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:51 -0700 Subject: [PATCH] taskstats_tgid_free: fix usage taskstats_tgid_free() is called on copy_process's error path. This is wrong. IF (clone_flags & CLONE_THREAD) We should not clear ->signal->taskstats, current uses it, it probably has a valid accumulated info. ELSE taskstats_tgid_init() set ->signal->taskstats = NULL, there is nothing to free. Move the callsite to __exit_signal(). We don't need any locking, entire thread group is exiting, nobody should have a reference to soon to be released ->signal. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + kernel/fork.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f250a5e3e281..06de6c4e8ca3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) flush_sigqueue(&tsk->pending); if (sig) { flush_sigqueue(&sig->shared_pending); + taskstats_tgid_free(sig); __cleanup_signal(sig); } } diff --git a/kernel/fork.c b/kernel/fork.c index 29ebb30850ed..213326609bac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -897,7 +897,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); - taskstats_tgid_free(sig); kmem_cache_free(signal_cachep, sig); } -- cgit v1.2.3 From b8534d7bd89df0cd41cd47bcd6733a05ea9a691a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:53 -0700 Subject: [PATCH] taskstats: kill ->taskstats_lock in favor of ->siglock signal_struct is (mostly) protected by ->sighand->siglock, I think we don't need ->taskstats_lock to protect ->stats. This also allows us to simplify the locking in fill_tgid(). Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/taskstats.c | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 213326609bac..3da978eec791 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -830,7 +830,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); - taskstats_tgid_alloc(current->signal); + taskstats_tgid_alloc(current); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 9aeee511a463..b2efda94615a 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -241,11 +241,11 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, tsk = first; read_lock(&tasklist_lock); /* Start with stats from dead tasks */ - if (first->signal) { - spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->sighand) { + spin_lock_irqsave(&first->sighand->siglock, flags); if (first->signal->stats) memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->signal->stats_lock, flags); + spin_unlock_irqrestore(&first->sighand->siglock, flags); } do { @@ -276,7 +276,7 @@ static void fill_tgid_exit(struct task_struct *tsk) { unsigned long flags; - spin_lock_irqsave(&tsk->signal->stats_lock, flags); + spin_lock_irqsave(&tsk->sighand->siglock, flags); if (!tsk->signal->stats) goto ret; @@ -288,7 +288,7 @@ static void fill_tgid_exit(struct task_struct *tsk) */ delayacct_add_tsk(tsk->signal->stats, tsk); ret: - spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); return; } @@ -464,15 +464,10 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, size_t size; int is_thread_group; struct nlattr *na; - unsigned long flags; if (!family_registered || !tidstats) return; - spin_lock_irqsave(&tsk->signal->stats_lock, flags); - is_thread_group = tsk->signal->stats ? 1 : 0; - spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); - rc = 0; /* * Size includes space for nested attributes @@ -480,6 +475,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + is_thread_group = (tsk->signal->stats != NULL); if (is_thread_group) size = 2 * size; /* PID + STATS + TGID + STATS */ -- cgit v1.2.3 From a98b6094261c0112e9c455c96995972181bff049 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:54 -0700 Subject: [PATCH] taskstats: don't use tasklist_lock Remove tasklist_lock from taskstats.c. find_task_by_pid() is rcu-safe. ->siglock allows us to traverse subthread without tasklist. Q: delay accounting looks wrong to me. If sub-thread has already called taskstats_exit_send() but didn't call release_task(self) yet it will be accounted twice. The window is big. No? Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 59 ++++++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b2efda94615a..b724aeea5443 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -174,21 +174,19 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) up_write(&listeners->sem); } -static int fill_pid(pid_t pid, struct task_struct *pidtsk, +static int fill_pid(pid_t pid, struct task_struct *tsk, struct taskstats *stats) { int rc = 0; - struct task_struct *tsk = pidtsk; - if (!pidtsk) { - read_lock(&tasklist_lock); + if (!tsk) { + rcu_read_lock(); tsk = find_task_by_pid(pid); - if (!tsk) { - read_unlock(&tasklist_lock); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) return -ESRCH; - } - get_task_struct(tsk); - read_unlock(&tasklist_lock); } else get_task_struct(tsk); @@ -214,40 +212,28 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, } -static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, +static int fill_tgid(pid_t tgid, struct task_struct *first, struct taskstats *stats) { - struct task_struct *tsk, *first; + struct task_struct *tsk; unsigned long flags; + int rc = -ESRCH; /* * Add additional stats from live tasks except zombie thread group * leaders who are already counted with the dead tasks */ - first = tgidtsk; - if (!first) { - read_lock(&tasklist_lock); + rcu_read_lock(); + if (!first) first = find_task_by_pid(tgid); - if (!first) { - read_unlock(&tasklist_lock); - return -ESRCH; - } - get_task_struct(first); - read_unlock(&tasklist_lock); - } else - get_task_struct(first); + if (!first || !lock_task_sighand(first, &flags)) + goto out; - tsk = first; - read_lock(&tasklist_lock); - /* Start with stats from dead tasks */ - if (first->sighand) { - spin_lock_irqsave(&first->sighand->siglock, flags); - if (first->signal->stats) - memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->sighand->siglock, flags); - } + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + tsk = first; do { if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) continue; @@ -260,15 +246,18 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, delayacct_add_tsk(stats, tsk); } while_each_thread(first, tsk); - read_unlock(&tasklist_lock); - stats->version = TASKSTATS_VERSION; + unlock_task_sighand(first, &flags); + rc = 0; +out: + rcu_read_unlock(); + + stats->version = TASKSTATS_VERSION; /* * Accounting subsytems can also add calls here to modify * fields of taskstats. */ - put_task_struct(first); - return 0; + return rc; } -- cgit v1.2.3 From d7c3f5f231c60d7e6ada5770b536df2b3ec1bd08 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:54 -0700 Subject: [PATCH] fill_tgid: cleanup delays accounting fill_tgid() should skip not only an already exited group leader. If the task has ->exit_state != 0 it already did exit_notify(), so it also did fill_tgid_exit()->delayacct_add_tsk(->signal->stats) and we should skip it to avoid a double accounting. This patch doesn't close the race completely, but it cleanups the code. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b724aeea5443..8adfb8069c6d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -235,7 +235,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, tsk = first; do { - if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + if (tsk->exit_state) continue; /* * Accounting subsystem can call its functions here to -- cgit v1.2.3 From bb1d860551c4307b1a7ee9a21b120319075e987e Mon Sep 17 00:00:00 2001 From: Jim Houston Date: Sat, 28 Oct 2006 10:38:56 -0700 Subject: [PATCH] time_adjust cleared before use I notice that the code which implements adjtime clears the time_adjust value before using it. The attached patch makes the obvious fix. Acked-by: Roman Zippel Signed-off-by: Jim Houston Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 47195fa0ec4f..3afeaa3a73f9 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -161,9 +161,9 @@ void second_overflow(void) time_adjust += MAX_TICKADJ; tick_length -= MAX_TICKADJ_SCALED; } else { - time_adjust = 0; tick_length += (s64)(time_adjust * NSEC_PER_USEC / HZ) << TICK_LENGTH_SHIFT; + time_adjust = 0; } } } -- cgit v1.2.3 From 8fa1d7d3b2c51594c0f3aa151983dd51f605e07d Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Sat, 28 Oct 2006 10:38:57 -0700 Subject: [PATCH] cpu-hotplug: release `workqueue_mutex' properly on CPU hot-remove _cpu_down() acquires `workqueue_mutex' on its process, but doen't release it if __cpu_disable() fails. Signed-off-by: Satoru Takeuchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 27dd3ee47099..663c920b2234 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -150,18 +150,18 @@ static int _cpu_down(unsigned int cpu) p = __stop_machine_run(take_cpu_down, NULL, cpu); mutex_unlock(&cpu_bitmask_lock); - if (IS_ERR(p)) { + if (IS_ERR(p) || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, (void *)(long)cpu) == NOTIFY_BAD) BUG(); - err = PTR_ERR(p); - goto out_allowed; - } - - if (cpu_online(cpu)) + if (IS_ERR(p)) { + err = PTR_ERR(p); + goto out_allowed; + } goto out_thread; + } /* Wait for it to sleep (leaving idle task). */ while (!idle_cpu(cpu)) -- cgit v1.2.3 From 057647fc47b3a5fbcfa997041db3f483d506603c Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sat, 28 Oct 2006 10:38:58 -0700 Subject: [PATCH] workqueue: update kerneldoc This patch (as812) changes the kerneldoc comments explaining the return values from queue_work(), queue_delayed_work(), and queue_delayed_work_on(). The updated comments explain more accurately the meaning of the return code and avoid suggesting that a 0 value means the routine was unsuccessful. Signed-off-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3df9bfc7ff78..17c2f03d2c27 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -99,7 +99,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, * @wq: workqueue to use * @work: work to queue * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. @@ -138,7 +138,7 @@ static void delayed_work_timer_fn(unsigned long __data) * @work: work to queue * @delay: number of jiffies to wait before queueing * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. */ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) @@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work); * @work: work to queue * @delay: number of jiffies to wait before queueing * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. */ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) -- cgit v1.2.3 From d46a3d0d07ba539aea5b0e1ad30e568f0cb03576 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Oct 2006 16:45:58 +0300 Subject: [PATCH] taskstats: fix sk_buff leak 'return genlmsg_cancel()' in taskstats_user_cmd/taskstats_exit_send potentially leaks a skb. Unless we pass 'rep_skb' to the netlink layer we own sk_buff. This means we should always do kfree_skb() on failure. [ Thomas acked and pointed out missing return value in original version ] Signed-off-by: Oleg Nesterov Acked-by: Thomas Graf Cc: Andrew Morton Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8adfb8069c6d..f3c3e9d43d2c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -411,7 +411,7 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) return send_reply(rep_skb, info->snd_pid); nla_put_failure: - return genlmsg_cancel(rep_skb, reply); + rc = genlmsg_cancel(rep_skb, reply); err: nlmsg_free(rep_skb); return rc; @@ -507,7 +507,6 @@ send: nla_put_failure: genlmsg_cancel(rep_skb, reply); - goto ret; err_skb: nlmsg_free(rep_skb); ret: -- cgit v1.2.3 From 3d8334def5cf831d2ed438aae021696a2faa4ddd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Oct 2006 18:57:16 +0300 Subject: [PATCH] taskstats: fix sk_buff size calculation prepare_reply() adds GENL_HDRLEN to the payload (genlmsg_total_size()), but then it does genlmsg_put()->nlmsg_put(). This means we forget to reserve a room for 'struct nlmsghdr'. Signed-off-by: Oleg Nesterov Cc: Thomas Graf Cc: Andrew Morton Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f3c3e9d43d2c..2039585ec5e1 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -77,7 +77,8 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); + size = nlmsg_total_size(genlmsg_total_size(size)); + skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return -ENOMEM; -- cgit v1.2.3 From 0e2d57fc6e7dabdbfdd4f26c861e7e6c75d5bdcf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 29 Oct 2006 22:46:34 -0800 Subject: [PATCH] ndiswrapper: don't set the module->taints flags For ndiswrapper, don't set the module->taints flags, just set the kernel global tainted flag. This should allow ndiswrapper to continue to use GPL symbols. Signed-off-by: Randy Dunlap Cc: Florin Malita Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5072a943fe35..f0166563c602 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1718,7 +1718,7 @@ static struct module *load_module(void __user *umod, set_license(mod, get_modinfo(sechdrs, infoindex, "license")); if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint(TAINT_PROPRIETARY_MODULE); if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); -- cgit v1.2.3 From f0ec1aaf54caddd21c259aea8b2ecfbde4ee4fb9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Oct 2006 22:46:43 -0800 Subject: [PATCH] xacct_add_tsk: fix pure theoretical ->mm use-after-free Paranoid fix. The task can free its ->mm after the 'if (p->mm)' check. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 65a5036a3d95..96f77013d3f0 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -80,13 +80,17 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) */ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { + struct mm_struct *mm; + /* convert pages-jiffies to Mbyte-usec */ stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; - if (p->mm) { + mm = get_task_mm(p); + if (mm) { /* adjust to KB unit */ - stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB; - stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB; + stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; + stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + mmput(mm); } stats->read_char = p->rchar; stats->write_char = p->wchar; -- cgit v1.2.3 From 4a279ff1ea1cf325775ada983035123fcdc8e986 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Oct 2006 22:07:15 -0800 Subject: [PATCH] taskstats: fix sub-threads accounting If there are no listeners, taskstats_exit_send() just returns because taskstats_exit_alloc() didn't allocate *tidstats. This is wrong, each sub-thread should do fill_tgid_exit() on exit, otherwise its ->delays is not recorded in ->signal->stats and lost. Q: We don't send TASKSTATS_TYPE_AGGR_TGID when single-threaded process exits. Is it good? How can the listener figure out that it was actually a process exit, not sub-thread? Signed-off-by: Oleg Nesterov Cc: Balbir Singh Acked-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 2039585ec5e1..f45c5e70773c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -455,10 +455,9 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, int is_thread_group; struct nlattr *na; - if (!family_registered || !tidstats) + if (!family_registered) return; - rc = 0; /* * Size includes space for nested attributes */ @@ -466,8 +465,15 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); is_thread_group = (tsk->signal->stats != NULL); - if (is_thread_group) - size = 2 * size; /* PID + STATS + TGID + STATS */ + if (is_thread_group) { + /* PID + STATS + TGID + STATS */ + size = 2 * size; + /* fill the tsk->signal->stats structure */ + fill_tgid_exit(tsk); + } + + if (!tidstats) + return; rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); if (rc < 0) @@ -487,11 +493,8 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, goto send; /* - * tsk has/had a thread group so fill the tsk->signal->stats structure * Doesn't matter if tsk is the leader or the last group member leaving */ - - fill_tgid_exit(tsk); if (!group_dead) goto send; -- cgit v1.2.3 From f46c483357c2d87606bbefb511321e3efd4baae0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 2 Nov 2006 22:07:16 -0800 Subject: [PATCH] Add printk_timed_ratelimit() printk_ratelimit() has global state which makes it not useful for callers which wish to perform ratelimiting at a particular frequency. Add a printk_timed_ratelimit() which utilises caller-provided state storage to permit more flexibility. This function can in fact be used for things other than printk ratelimiting and is perhaps poorly named. Cc: Ulrich Drepper Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f7d427ef5038..66426552fbfe 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -1101,3 +1102,23 @@ int printk_ratelimit(void) printk_ratelimit_burst); } EXPORT_SYMBOL(printk_ratelimit); + +/** + * printk_timed_ratelimit - caller-controlled printk ratelimiting + * @caller_jiffies: pointer to caller's state + * @interval_msecs: minimum interval between prints + * + * printk_timed_ratelimit() returns true if more than @interval_msecs + * milliseconds have elapsed since the last time printk_timed_ratelimit() + * returned true. + */ +bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msecs) +{ + if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { + *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); + return true; + } + return false; +} +EXPORT_SYMBOL(printk_timed_ratelimit); -- cgit v1.2.3 From 19c6b6ed3f597a583f58e3fc99256cc01ae8c394 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 2 Nov 2006 22:07:17 -0800 Subject: [PATCH] schedule removal of FUTEX_FD Apparently FUTEX_FD is unfixably racy and nothing uses it (or if it does, it shouldn't). Add a warning printk, give any remaining users six months to migrate off it. Cc: Ulrich Drepper Cc: Ingo Molnar Acked-by: Thomas Gleixner Cc: Rusty Russell Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b364e0026191..93ef30ba209f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1507,6 +1507,13 @@ static int futex_fd(u32 __user *uaddr, int signal) struct futex_q *q; struct file *filp; int ret, err; + static unsigned long printk_interval; + + if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { + printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " + "will be removed from the kernel in June 2007\n", + current->comm); + } ret = -EINVAL; if (!valid_signal(signal)) -- cgit v1.2.3 From b918f6e62cd46774f9fc0a3fbba6bd10ad85ee14 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 2 Nov 2006 22:07:19 -0800 Subject: [PATCH] swsusp: debugging Add a swsusp debugging mode. This does everything that's needed for a suspend except for actually suspending. So we can look in the log messages and work out a) what code is being slow and b) which drivers are misbehaving. (1) # echo testproc > /sys/power/disk # echo disk > /sys/power/state This should turn off the non-boot CPU, freeze all processes, wait for 5 seconds and then thaw the processes and the CPU. (2) # echo test > /sys/power/disk # echo disk > /sys/power/state This should turn off the non-boot CPU, freeze all processes, shrink memory, suspend all devices, wait for 5 seconds, resume the devices etc. Cc: Pavel Machek Cc: Stefan Seyfried Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d3a158a60312..b1fb7866b0b3 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -71,7 +71,7 @@ static inline void platform_finish(void) static int prepare_processes(void) { - int error; + int error = 0; pm_prepare_console(); @@ -84,6 +84,12 @@ static int prepare_processes(void) goto thaw; } + if (pm_disk_mode == PM_DISK_TESTPROC) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto thaw; + } + /* Free memory before shutting down devices. */ if (!(error = swsusp_shrink_memory())) return 0; @@ -120,13 +126,21 @@ int pm_suspend_disk(void) if (error) return error; + if (pm_disk_mode == PM_DISK_TESTPROC) + goto Thaw; + suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) { resume_console(); printk("Some devices failed to suspend\n"); - unprepare_processes(); - return error; + goto Thaw; + } + + if (pm_disk_mode == PM_DISK_TEST) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto Done; } pr_debug("PM: snapshotting memory.\n"); @@ -143,16 +157,17 @@ int pm_suspend_disk(void) power_down(pm_disk_mode); else { swsusp_free(); - unprepare_processes(); - return error; + goto Thaw; } - } else + } else { pr_debug("PM: Image restored successfully.\n"); + } swsusp_free(); Done: device_resume(); resume_console(); + Thaw: unprepare_processes(); return error; } @@ -249,6 +264,8 @@ static const char * const pm_disk_modes[] = { [PM_DISK_PLATFORM] = "platform", [PM_DISK_SHUTDOWN] = "shutdown", [PM_DISK_REBOOT] = "reboot", + [PM_DISK_TEST] = "test", + [PM_DISK_TESTPROC] = "testproc", }; /** @@ -303,17 +320,19 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) } } if (mode) { - if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) + if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT || + mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) { pm_disk_mode = mode; - else { + } else { if (pm_ops && pm_ops->enter && (mode == pm_ops->pm_disk_mode)) pm_disk_mode = mode; else error = -EINVAL; } - } else + } else { error = -EINVAL; + } pr_debug("PM: suspend-to-disk mode set to '%s'\n", pm_disk_modes[mode]); -- cgit v1.2.3 From 3fd593979802f81ff6452596ac61e3840f917589 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 2 Nov 2006 22:07:24 -0800 Subject: [PATCH] Create compat_sys_migrate_pages This is needed on bigendian 64bit architectures. Signed-off-by: Stephen Rothwell Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 33 +++++++++++++++++++++++++++++++++ kernel/sys_ni.c | 1 + 2 files changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d4898aad6cfa..6952dd057300 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, } return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } + +asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, + compat_ulong_t maxnode, + const compat_ulong_t __user *old_nodes, + const compat_ulong_t __user *new_nodes) +{ + unsigned long __user *old = NULL; + unsigned long __user *new = NULL; + nodemask_t tmp_mask; + unsigned long nr_bits; + unsigned long size; + + nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); + size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (old_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) + return -EFAULT; + old = compat_alloc_user_space(new_nodes ? size * 2 : size); + if (new_nodes) + new = old + size / sizeof(unsigned long); + if (copy_to_user(old, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + if (new_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) + return -EFAULT; + if (new == NULL) + new = compat_alloc_user_space(size); + if (copy_to_user(new, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + return sys_migrate_pages(pid, nr_bits + 1, old, new); +} #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0e53314b14de..d7306d0f3dfc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,7 @@ cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); cond_syscall(compat_sys_move_pages); +cond_syscall(compat_sys_migrate_pages); /* block-layer dependent */ cond_syscall(sys_bdflush); -- cgit v1.2.3 From 45c18b0bb579b5c1b89f8c99f1b6ffa4c586ba08 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 4 Nov 2006 10:06:02 -0800 Subject: Fix unlikely (but possible) race condition on task->user access There's a possible race condition when doing a "switch_uid()" from one user to another, which could race with another thread doing a signal allocation and looking at the old thread ->user pointer as it is freed. This explains an oops reported by Lukasz Trabinski: http://permalink.gmane.org/gmane.linux.kernel/462241 We fix this by delaying the (reference-counted) freeing of the user structure until the thread signal handler lock has been released, so that we know that the signal allocation has either seen the new value or has properly incremented the reference count of the old one. Race identified by Oleg Nesterov. Cc: Lukasz Trabinski Cc: Oleg Nesterov Cc: Andrew Morton Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/user.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 6408c0424291..220e586127a0 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user) atomic_dec(&old_user->processes); switch_uid_keyring(new_user); current->user = new_user; + + /* + * We need to synchronize with __sigqueue_alloc() + * doing a get_uid(p->user).. If that saw the old + * user value, we need to wait until it has exited + * its critical region before we can free the old + * structure. + */ + smp_mb(); + spin_unlock_wait(¤t->sighand->siglock); + free_uid(old_user); suid_keys(current); } -- cgit v1.2.3 From 10b1fbdb0a0ca91847a534ad26d0bc250c25b74f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 4 Nov 2006 13:03:00 -0800 Subject: Make sure "user->sigpending" count is in sync The previous commit (45c18b0bb579b5c1b89f8c99f1b6ffa4c586ba08, aka "Fix unlikely (but possible) race condition on task->user access") fixed a potential oops due to __sigqueue_alloc() getting its "user" pointer out of sync with switch_user(), and accessing a user pointer that had been de-allocated on another CPU. It still left another (much less serious) problem, where a concurrent __sigqueue_alloc and swich_user could cause sigqueue_alloc to do signal pending reference counting for a _different_ user than the one it then actually ended up using. No oops, but we'd end up with the wrong signal accounting. Another case of Oleg's eagle-eyes picking up the problem. This is trivially fixed by just making sure we load whichever "user" structure we decide to use (it doesn't matter _which_ one we pick, we just need to pick one) just once. Acked-by: Oleg Nesterov Cc: Andrew Morton Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7ed8d5304bec..df18c167a2a7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -267,18 +267,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; + struct user_struct *user; - atomic_inc(&t->user->sigpending); + /* + * In order to avoid problems with "switch_user()", we want to make + * sure that the compiler doesn't re-load "t->user" + */ + user = t->user; + barrier(); + atomic_inc(&user->sigpending); if (override_rlimit || - atomic_read(&t->user->sigpending) <= + atomic_read(&user->sigpending) <= t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) q = kmem_cache_alloc(sigqueue_cachep, flags); if (unlikely(q == NULL)) { - atomic_dec(&t->user->sigpending); + atomic_dec(&user->sigpending); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->user = get_uid(t->user); + q->user = get_uid(user); } return(q); } -- cgit v1.2.3