From 4dfbb9d8c6cbfc32faa5c71145bd2a43e1f8237c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2006 01:45:14 -0400 Subject: Lockdep: add lockdep_set_class_and_subclass() and lockdep_set_subclass() This annotation makes it possible to assign a subclass on lock init. This annotation is meant to reduce the _nested() annotations by assigning a default subclass. One could do without this annotation and rely on lockdep_set_class() exclusively, but that would require a manual stack of struct lock_class_key objects. Signed-off-by: Peter Zijlstra Signed-off-by: Dmitry Torokhov --- kernel/lockdep.c | 10 ++++++---- kernel/mutex-debug.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 4c0553461000..ba7156ac70c1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1177,7 +1177,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * itself, so actual lookup of the hash should be once per lock object. */ static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass) +register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; struct list_head *hash_head; @@ -1249,7 +1249,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) out_unlock_set: __raw_spin_unlock(&hash_lock); - if (!subclass) + if (!subclass || force) lock->class_cache = class; DEBUG_LOCKS_WARN_ON(class->subclass != subclass); @@ -1937,7 +1937,7 @@ void trace_softirqs_off(unsigned long ip) * Initialize a lock instance's lock-class mapping info: */ void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key, int subclass) { if (unlikely(!debug_locks)) return; @@ -1957,6 +1957,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; lock->key = key; lock->class_cache = NULL; + if (subclass) + register_lock_class(lock, subclass, 1); } EXPORT_SYMBOL_GPL(lockdep_init_map); @@ -1995,7 +1997,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * Not cached yet or subclass? */ if (unlikely(!class)) { - class = register_lock_class(lock, subclass); + class = register_lock_class(lock, subclass, 0); if (!class) return 0; } diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index e3203c654dda..18651641a7b5 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -91,7 +91,7 @@ void debug_mutex_init(struct mutex *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key); + lockdep_init_map(&lock->dep_map, name, key, 0); #endif lock->owner = NULL; lock->magic = lock; -- cgit v1.2.3 From 39af114377bf80d2a88e47be33d578d1fa9b0775 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Oct 2006 09:01:46 -0700 Subject: [PATCH] fix epoll_pwait when EPOLL=n Fixes http://bugzilla.kernel.org/show_bug.cgi?id=7371 sys_epoll_pwait needs to be listed as a conditional (weak) entry point for CONFIG_EPOLL=n. 
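For reference, cond_syscall() works by making the listed entry point a weak symbol that falls back to sys_ni_syscall() (returning -ENOSYS) whenever the real implementation is compiled out; with CONFIG_EPOLL=y the strong definition in fs/eventpoll.c simply overrides it. A rough sketch of the mechanism only, not the literal kernel macro (whose exact form varies by architecture and version):

	/* sys_ni_syscall() is defined in this same file (kernel/sys_ni.c),
	 * which is what lets the weak alias below resolve to it. */
	asmlinkage long sys_ni_syscall(void);

	/* Weak fallback: a real sys_epoll_pwait, if built, takes precedence. */
	asmlinkage long sys_epoll_pwait(void)
		__attribute__((weak, alias("sys_ni_syscall")));
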
Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7a3b2e75f040..0e53314b14de 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -49,6 +49,7 @@ cond_syscall(compat_sys_get_robust_list); cond_syscall(sys_epoll_create); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); +cond_syscall(sys_epoll_pwait); cond_syscall(sys_semget); cond_syscall(sys_semop); cond_syscall(sys_semtimedop); -- cgit v1.2.3 From ca268c691de95612981b93e58899c1d73fdb6b47 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Oct 2006 00:09:28 -0700 Subject: [PATCH] lockdep: increase max allowed recursion depth In general, lockdep warnings are intended to be non-fatal, so I have put in various practical limits on internal data structure failure modes. We haven't had a /single/ lockdep-internal crash ever since lockdep went upstream [the unwinder crashes are outside of lockdep], and that's largely due to the good internal checks it does. Recursion within the dependency graph is currently limited to 20, that's probably not enough on some many-CPU boxes - this patch doubles it to 40. I have written the lockdep functions to have as small stackframes as possible, so 40 should be OK too. (The practical recursion limit should be somewhere between 100 and 200 entries. If we hit that then I'll change the algorithm to be iteration-based. Graph walking logic is so easy to program via recursion, so i'd like to keep recursion as long as possible.) Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 805a322a5655..d1a3b2cfe4a9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -575,6 +575,8 @@ static noinline int print_circular_bug_tail(void) return 0; } +#define RECURSION_LIMIT 40 + static int noinline print_infinite_recursion_bug(void) { __raw_spin_unlock(&hash_lock); @@ -595,7 +597,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) debug_atomic_inc(&nr_cyclic_check_recursions); if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); /* * Check this lock's dependency list: @@ -645,7 +647,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); debug_atomic_inc(&nr_find_usage_forwards_checks); @@ -684,7 +686,7 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); debug_atomic_inc(&nr_find_usage_backwards_checks); -- cgit v1.2.3 From 3f4a0b917ce72ef47e438d354c433eb645218e87 Mon Sep 17 00:00:00 2001 From: john stultz Date: Tue, 17 Oct 2006 00:09:32 -0700 Subject: [PATCH] i386 Time: Avoid PIT SMP lockups Avoid possible PIT livelock issues seen on SMP systems (and reported by Andi), by not allowing it as a clocksource on SMP boxes. However, since the PIT may no longer be present, we have to properly handle the cases where SMP systems have TSC skew and fall back from the TSC. Since the PIT isn't there, it would "fall back" to the TSC again. 
So this changes the jiffies rating to 1, and the TSC-bad rating value to 0. Thus you will get the following behavior priority on i386 systems: tsc [if present & stable] hpet [if present] cyclone [if present] acpi_pm [if present] pit [if UP] jiffies Rather then the current more complicated: tsc [if present & stable] hpet [if present] cyclone [if present] acpi_pm [if present] pit [if cpus < 4] tsc [if present & unstable] jiffies Signed-off-by: John Stultz Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/jiffies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 126bb30c4afe..a99b2a6e6a07 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -57,7 +57,7 @@ static cycle_t jiffies_read(void) struct clocksource clocksource_jiffies = { .name = "jiffies", - .rating = 0, /* lowest rating*/ + .rating = 1, /* lowest valid rating*/ .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ -- cgit v1.2.3 From ac08c26492a0ad4d94a25bd47d5630cd38337069 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Oct 2006 00:09:39 -0700 Subject: [PATCH] posix-cpu-timers: prevent signal delivery starvation The integer divisions in the timer accounting code can round the result down to 0. Adding 0 is without effect and the signal delivery stops. Clamp the division result to minimum 1 to avoid this. Problem was reported by Seongbae Park , who provided also an inital patch. Roland sayeth: I have had some more time to think about the problem, and to reproduce it using Toyo's test case. For the record, if my understanding of the problem is correct, this happens only in one very particular case. First, the expiry time has to be so soon that in cputime_t units (usually 1s/HZ ticks) it's < nthreads so the division yields zero. Second, it only affects each thread that is so new that its CPU time accumulation is zero so now+0 is still zero and ->it_*_expires winds up staying zero. For the VIRT and PROF clocks when cputime_t is tick granularity (or the SCHED clock on configurations where sched_clock's value only advances on clock ticks), this is not hard to arrange with new threads starting up and blocking before they accumulate a whole tick of CPU time. That's what happens in Toyo's test case. Note that in general it is fine for that division to round down to zero, and set each thread's expiry time to its "now" time. The problem only arises with thread's whose "now" value is still zero, so that now+0 winds up 0 and is interpreted as "not set" instead of ">= now". So it would be a sufficient and more precise fix to just use max(ticks, 1) inside the loop when setting each it_*_expires value. But, it does no harm to round the division up to one and always advance every thread's expiry time. If the thread didn't already fire timers for the expiry time of "now", there is no expectation that it will do so before the next tick anyway. So I followed Thomas's patch in lifting the max out of the loops. This patch also covers the reload cases, which are harder to write a test for (and I didn't try). I've tested it with Toyo's case and it fixes that. 
[toyoa@mvista.com: fix: min_t -> max_t] Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Roland McGrath Cc: Daniel Walker Cc: Toyo Abe Cc: john stultz Cc: Roman Zippel Cc: Seongbae Park Cc: Peter Mattis Cc: Rohit Seth Cc: Martin Bligh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 479b16b44f79..7c3e1e6dfb5b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -87,6 +87,19 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, return a; } +/* + * Divide and limit the result to res >= 1 + * + * This is necessary to prevent signal delivery starvation, when the result of + * the division would be rounded down to 0. + */ +static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) +{ + cputime_t res = cputime_div(time, div); + + return max_t(cputime_t, res, 1); +} + /* * Update expiry time from increment, and increase overrun count, * given the current clock sample. @@ -483,8 +496,8 @@ static void process_timer_rebalance(struct task_struct *p, BUG(); break; case CPUCLOCK_PROF: - left = cputime_div(cputime_sub(expires.cpu, val.cpu), - nthreads); + left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(prof_ticks(t), left); @@ -498,8 +511,8 @@ static void process_timer_rebalance(struct task_struct *p, } while (t != p); break; case CPUCLOCK_VIRT: - left = cputime_div(cputime_sub(expires.cpu, val.cpu), - nthreads); + left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(virt_ticks(t), left); @@ -515,6 +528,7 @@ static void process_timer_rebalance(struct task_struct *p, case CPUCLOCK_SCHED: nsleft = expires.sched - val.sched; do_div(nsleft, nthreads); + nsleft = max_t(unsigned long long, nsleft, 1); do { if (likely(!(t->flags & PF_EXITING))) { ns = t->sched_time + nsleft; @@ -1159,12 +1173,13 @@ static void check_process_timers(struct task_struct *tsk, prof_left = cputime_sub(prof_expires, utime); prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div(prof_left, nthreads); + prof_left = cputime_div_non_zero(prof_left, nthreads); virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div(virt_left, nthreads); + virt_left = cputime_div_non_zero(virt_left, nthreads); if (sched_expires) { sched_left = sched_expires - sched_time; do_div(sched_left, nthreads); + sched_left = max_t(unsigned long long, sched_left, 1); } else { sched_left = 0; } -- cgit v1.2.3 From c60099bfe3a5e6fa22a930627689b3769c52153f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 17 Oct 2006 00:09:59 -0700 Subject: [PATCH] swsusp: fix memory leaks My fancy new swsusp IO code had a big memory leak. It's somewhat invisible because the whole mem_map[] gets overwritten after resume, but it can cause us to get low on memory during the actual suspend process. Cc: "Rafael J. 
Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 9b2ee5344dee..1a3b0dd2c3fc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -425,7 +425,8 @@ static int submit(int rw, pgoff_t page_off, struct page *page, bio_set_pages_dirty(bio); bio_put(bio); } else { - get_page(page); + if (rw == READ) + get_page(page); /* These pages are freed later */ bio->bi_private = *bio_chain; *bio_chain = bio; submit_bio(rw | (1 << BIO_RW_SYNC), bio); -- cgit v1.2.3 From a460e745e8f9c75a0525ff94154a0629f9d3e05d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Oct 2006 00:10:03 -0700 Subject: [PATCH] genirq: clean up irq-flow-type naming Introduce desc->name and eliminate the handle_irq_name() hack. Add set_irq_chip_and_handler_name() to set the flow type and name at once. Signed-off-by: Ingo Molnar Acked-by: Thomas Gleixner Cc: "Eric W. Biederman" Cc: Matthew Wilcox Cc: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 11c99697acfe..2d0dc3efe813 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -499,7 +499,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) #endif /* CONFIG_SMP */ void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained) +__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) { struct irq_desc *desc; unsigned long flags; @@ -540,6 +541,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained) desc->depth = 1; } desc->handle_irq = handle; + desc->name = name; if (handle != handle_bad_irq && is_chained) { desc->status &= ~IRQ_DISABLED; @@ -555,30 +557,13 @@ set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle) { set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0); + __set_irq_handler(irq, handle, 0, NULL); } -/* - * Get a descriptive string for the highlevel handler, for - * /proc/interrupts output: - */ -const char * -handle_irq_name(irq_flow_handler_t handle) +void +set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle, const char *name) { - if (handle == handle_level_irq) - return "level "; - if (handle == handle_fasteoi_irq) - return "fasteoi"; - if (handle == handle_edge_irq) - return "edge "; - if (handle == handle_simple_irq) - return "simple "; -#ifdef CONFIG_SMP - if (handle == handle_percpu_irq) - return "percpu "; -#endif - if (handle == handle_bad_irq) - return "bad "; - - return NULL; + set_irq_chip(irq, chip); + __set_irq_handler(irq, handle, 0, name); } -- cgit v1.2.3 From bea493a031fe3337f4fe5479e8e865513828ea76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Oct 2006 00:10:33 -0700 Subject: [PATCH] rt-mutex: fixup rt-mutex debug code BUG: warning at kernel/rtmutex-debug.c:125/rt_mutex_debug_task_free() (Not tainted) [] show_trace_log_lvl+0x58/0x16a [] show_trace+0xd/0x10 [] dump_stack+0x19/0x1b [] rt_mutex_debug_task_free+0x35/0x6a [] free_task+0x15/0x24 [] copy_process+0x12bd/0x1324 [] do_fork+0x42/0x113 [] sys_fork+0x19/0x1b [] syscall_call+0x7/0xb In copy_process(), dup_task_struct() also duplicates the ->pi_lock, ->pi_waiters and 
->pi_blocked_on members. rt_mutex_debug_task_free() called from free_task() validates these members. However free_task() can be invoked before these members are reset for the new task. Move the initialization code before the first bail that can hit free_task(). Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7dc6140baac6..29ebb30850ed 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -984,6 +984,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!p) goto fork_out; + rt_mutex_init_task(p); + #ifdef CONFIG_TRACE_IRQFLAGS DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); @@ -1088,8 +1090,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->lockdep_recursion = 0; #endif - rt_mutex_init_task(p); - #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -- cgit v1.2.3 From bd5349cfd2b9bbb10a3dbcd3fe5cbaabe0b2ab9e Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 17 Oct 2006 00:10:35 -0700 Subject: [PATCH] Convert cpu hotplug notifiers to use raw_notifier instead of blocking_notifier The use of blocking notifier by _cpu_up and _cpu_down in cpu.c has two problem. 1/ An interaction with the workqueue notifier causes lockdep to spit a warning. 2/ A notifier could conceivable be added or removed while _cpu_up or _cpu_down are in process. As each notifier is called twice (prepare then commit/abort) this could be unhealthy. To fix to we simply take cpu_add_remove_lock while adding or removing notifiers to/from the list. This makes the 'blocking' usage unnecessary as all accesses to cpu_chain are now protected by cpu_add_remove_lock. So change "blocking" to "raw" in all relevant places. This fixes 1. Credit: Andrew Morton Cc: Rusty Russell Cc: Michal Piotrowski (reporter) Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 32c96628463e..27dd3ee47099 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -19,7 +19,7 @@ static DEFINE_MUTEX(cpu_add_remove_lock); static DEFINE_MUTEX(cpu_bitmask_lock); -static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); +static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock @@ -68,7 +68,11 @@ EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); /* Need to know about CPUs going up/down? 
*/ int __cpuinit register_cpu_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_register(&cpu_chain, nb); + int ret; + mutex_lock(&cpu_add_remove_lock); + ret = raw_notifier_chain_register(&cpu_chain, nb); + mutex_unlock(&cpu_add_remove_lock); + return ret; } #ifdef CONFIG_HOTPLUG_CPU @@ -77,7 +81,9 @@ EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { - blocking_notifier_chain_unregister(&cpu_chain, nb); + mutex_lock(&cpu_add_remove_lock); + raw_notifier_chain_unregister(&cpu_chain, nb); + mutex_unlock(&cpu_add_remove_lock); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -126,7 +132,7 @@ static int _cpu_down(unsigned int cpu) if (!cpu_online(cpu)) return -EINVAL; - err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, (void *)(long)cpu); if (err == NOTIFY_BAD) { printk("%s: attempt to take down CPU %u failed\n", @@ -146,7 +152,7 @@ static int _cpu_down(unsigned int cpu) if (IS_ERR(p)) { /* CPU didn't die: tell everyone. Can't complain. */ - if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, + if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, (void *)(long)cpu) == NOTIFY_BAD) BUG(); @@ -169,7 +175,7 @@ static int _cpu_down(unsigned int cpu) put_cpu(); /* CPU is completely dead: tell everyone. Too late to complain. */ - if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD, + if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) == NOTIFY_BAD) BUG(); @@ -206,7 +212,7 @@ static int __devinit _cpu_up(unsigned int cpu) if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; - ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); @@ -223,11 +229,11 @@ static int __devinit _cpu_up(unsigned int cpu) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. */ - blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); + raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); out_notify: if (ret != 0) - blocking_notifier_call_chain(&cpu_chain, + raw_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); return ret; -- cgit v1.2.3 From 91fcdd4e0314145d7d4fa52dba2f9c2da25346fd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 19 Oct 2006 23:28:29 -0700 Subject: [PATCH] readjust comments of task_timeslice for kernel doc Signed-off-by: Borislav Petkov Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 094b5687eef6..3399701c680e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -160,15 +160,6 @@ #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) -/* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - */ - #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) @@ -180,6 +171,15 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } +/* + * task_timeslice() scales user-nice values [ -20 ... 
0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + */ + static inline unsigned int task_timeslice(struct task_struct *p) { return static_prio_timeslice(p->static_prio); -- cgit v1.2.3 From d6f8ff7381501887233666b508b9eac70143303d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 19 Oct 2006 23:28:34 -0700 Subject: [PATCH] cad_pid sysctl with PROC_FS=n If CONFIG_PROC_FS=n: kernel/sysctl.c:148: warning: 'proc_do_cad_pid' used but never defined kernel/built-in.o:(.data+0x1228): undefined reference to `proc_do_cad_pid' make: *** [.tmp_vmlinux1] Error 1 Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8020fb273c4f..8bff2c18fb5a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,8 +136,10 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif static ctl_table root_table[]; static struct ctl_table_header root_table_header = @@ -542,6 +544,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_PROC_SYSCTL { .ctl_name = KERN_CADPID, .procname = "cad_pid", @@ -550,6 +553,7 @@ static ctl_table kern_table[] = { .mode = 0600, .proc_handler = &proc_do_cad_pid, }, +#endif { .ctl_name = KERN_MAX_THREADS, .procname = "threads-max", -- cgit v1.2.3 From e05d722e4555cd54677b4c8431d9e81fd047ef7a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 19 Oct 2006 23:29:12 -0700 Subject: [PATCH] kernel/nsproxy.c: use kmemdup() Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 6ebdb82a0ce4..674aceb7335a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -44,11 +44,9 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) { struct nsproxy *ns; - ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL); - if (ns) { - memcpy(ns, orig, sizeof(struct nsproxy)); + ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); + if (ns) atomic_set(&ns->count, 1); - } return ns; } -- cgit v1.2.3 From 690a973f48b6ba2954465992c08e65059c8374fe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH] x86-64: Speed up dwarf2 unwinder This changes the dwarf2 unwinder to do a binary search for CIEs instead of a linear work. The linker is unfortunately not able to build a proper lookup table at link time, instead it creates one at runtime as soon as the bootmem allocator is usable (so you'll continue using the linear lookup for the first [hopefully] few calls). The code should be ready to utilize a build-time created table once a fixed linker becomes available. 
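Concretely, the lookup table the patch uses (whether provided by the linker or synthesized in setup_unwind_table()) is an array of (initial PC, FDE address) pairs sorted by start address, so finding the FDE for a given pc becomes a binary search instead of a linear scan of .eh_frame. A simplified sketch of that search, assuming entries already decoded to native pointers; the real code goes through read_pointer() using the encoding bytes in the header, and still verifies that pc actually falls inside the FDE's range:

	struct eh_frame_hdr_table_entry {
		unsigned long start, fde;
	};

	/* Return the FDE of the last entry whose start <= pc, or 0 if none. */
	static unsigned long lookup_fde(const struct eh_frame_hdr_table_entry *tbl,
					unsigned int n, unsigned long pc)
	{
		unsigned int lo = 0, hi = n;

		while (lo < hi) {
			unsigned int mid = lo + (hi - lo) / 2;

			if (tbl[mid].start <= pc)
				lo = mid + 1;	/* tbl[mid] is a candidate, look right */
			else
				hi = mid;	/* tbl[mid] starts past pc, look left */
		}
		return lo ? tbl[lo - 1].fde : 0;
	}
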
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- kernel/unwind.c | 318 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 279 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index 2e2368607aab..f7e50d16dbf6 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -11,13 +11,15 @@ #include #include -#include +#include +#include #include #include #include #include extern char __start_unwind[], __end_unwind[]; +extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; #define MAX_STACK_DEPTH 8 @@ -100,6 +102,8 @@ static struct unwind_table { } core, init; const void *address; unsigned long size; + const unsigned char *header; + unsigned long hdrsz; struct unwind_table *link; const char *name; } root_table; @@ -145,6 +149,10 @@ static struct unwind_table *find_table(unsigned long pc) return table; } +static unsigned long read_pointer(const u8 **pLoc, + const void *end, + signed ptrType); + static void init_unwind_table(struct unwind_table *table, const char *name, const void *core_start, @@ -152,14 +160,30 @@ static void init_unwind_table(struct unwind_table *table, const void *init_start, unsigned long init_size, const void *table_start, - unsigned long table_size) + unsigned long table_size, + const u8 *header_start, + unsigned long header_size) { + const u8 *ptr = header_start + 4; + const u8 *end = header_start + header_size; + table->core.pc = (unsigned long)core_start; table->core.range = core_size; table->init.pc = (unsigned long)init_start; table->init.range = init_size; table->address = table_start; table->size = table_size; + /* See if the linker provided table looks valid. */ + if (header_size <= 4 + || header_start[0] != 1 + || (void *)read_pointer(&ptr, end, header_start[1]) != table_start + || header_start[2] == DW_EH_PE_omit + || read_pointer(&ptr, end, header_start[2]) <= 0 + || header_start[3] == DW_EH_PE_omit) + header_start = NULL; + table->hdrsz = header_size; + smp_wmb(); + table->header = header_start; table->link = NULL; table->name = name; } @@ -169,7 +193,143 @@ void __init unwind_init(void) init_unwind_table(&root_table, "kernel", _text, _end - _text, NULL, 0, - __start_unwind, __end_unwind - __start_unwind); + __start_unwind, __end_unwind - __start_unwind, + __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr); +} + +static const u32 bad_cie, not_fde; +static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *); +static signed fde_pointer_type(const u32 *cie); + +struct eh_frame_hdr_table_entry { + unsigned long start, fde; +}; + +static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2) +{ + const struct eh_frame_hdr_table_entry *e1 = p1; + const struct eh_frame_hdr_table_entry *e2 = p2; + + return (e1->start > e2->start) - (e1->start < e2->start); +} + +static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size) +{ + struct eh_frame_hdr_table_entry *e1 = p1; + struct eh_frame_hdr_table_entry *e2 = p2; + unsigned long v; + + v = e1->start; + e1->start = e2->start; + e2->start = v; + v = e1->fde; + e1->fde = e2->fde; + e2->fde = v; +} + +static void __init setup_unwind_table(struct unwind_table *table, + void *(*alloc)(unsigned long)) +{ + const u8 *ptr; + unsigned long tableSize = table->size, hdrSize; + unsigned n; + const u32 *fde; + struct { + u8 version; + u8 eh_frame_ptr_enc; + u8 fde_count_enc; + u8 table_enc; + unsigned long eh_frame_ptr; + unsigned int fde_count; + struct eh_frame_hdr_table_entry table[]; + } 
__attribute__((__packed__)) *header; + + if (table->header) + return; + + if (table->hdrsz) + printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n", + table->name); + + if (tableSize & (sizeof(*fde) - 1)) + return; + + for (fde = table->address, n = 0; + tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { + const u32 *cie = cie_for_fde(fde, table); + signed ptrType; + + if (cie == ¬_fde) + continue; + if (cie == NULL + || cie == &bad_cie + || (ptrType = fde_pointer_type(cie)) < 0) + return; + ptr = (const u8 *)(fde + 2); + if (!read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType)) + return; + ++n; + } + + if (tableSize || !n) + return; + + hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) + + 2 * n * sizeof(unsigned long); + header = alloc(hdrSize); + if (!header) + return; + header->version = 1; + header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native; + header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4; + header->table_enc = DW_EH_PE_abs|DW_EH_PE_native; + put_unaligned((unsigned long)table->address, &header->eh_frame_ptr); + BUILD_BUG_ON(offsetof(typeof(*header), fde_count) + % __alignof(typeof(header->fde_count))); + header->fde_count = n; + + BUILD_BUG_ON(offsetof(typeof(*header), table) + % __alignof(typeof(*header->table))); + for (fde = table->address, tableSize = table->size, n = 0; + tableSize; + tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { + const u32 *cie = fde + 1 - fde[1] / sizeof(*fde); + + if (!fde[1]) + continue; /* this is a CIE */ + ptr = (const u8 *)(fde + 2); + header->table[n].start = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + fde_pointer_type(cie)); + header->table[n].fde = (unsigned long)fde; + ++n; + } + WARN_ON(n != header->fde_count); + + sort(header->table, + n, + sizeof(*header->table), + cmp_eh_frame_hdr_table_entries, + swap_eh_frame_hdr_table_entries); + + table->hdrsz = hdrSize; + smp_wmb(); + table->header = (const void *)header; +} + +static void *__init balloc(unsigned long sz) +{ + return __alloc_bootmem_nopanic(sz, + sizeof(unsigned int), + __pa(MAX_DMA_ADDRESS)); +} + +void __init unwind_setup(void) +{ + setup_unwind_table(&root_table, balloc); } #ifdef CONFIG_MODULES @@ -193,7 +353,8 @@ void *unwind_add_table(struct module *module, init_unwind_table(table, module->name, module->module_core, module->core_size, module->module_init, module->init_size, - table_start, table_size); + table_start, table_size, + NULL, 0); if (last_table) last_table->link = table; @@ -303,6 +464,26 @@ static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) return value; } +static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table) +{ + const u32 *cie; + + if (!*fde || (*fde & (sizeof(*fde) - 1))) + return &bad_cie; + if (!fde[1]) + return ¬_fde; /* this is a CIE */ + if ((fde[1] & (sizeof(*fde) - 1)) + || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address) + return NULL; /* this is not a valid FDE */ + cie = fde + 1 - fde[1] / sizeof(*fde); + if (*cie <= sizeof(*cie) + 4 + || *cie >= fde[1] - sizeof(*fde) + || (*cie & (sizeof(*cie) - 1)) + || cie[1]) + return NULL; /* this is not a (valid) CIE */ + return cie; +} + static unsigned long read_pointer(const u8 **pLoc, const void *end, signed ptrType) @@ -610,49 +791,108 @@ int unwind(struct unwind_frame_info *frame) unsigned i; signed ptrType = -1; uleb128_t retAddrReg = 0; - struct unwind_table *table; + const struct unwind_table *table; 
struct unwind_state state; if (UNW_PC(frame) == 0) return -EINVAL; if ((table = find_table(pc)) != NULL && !(table->size & (sizeof(*fde) - 1))) { - unsigned long tableSize = table->size; - - for (fde = table->address; - tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; - tableSize -= sizeof(*fde) + *fde, - fde += 1 + *fde / sizeof(*fde)) { - if (!*fde || (*fde & (sizeof(*fde) - 1))) - break; - if (!fde[1]) - continue; /* this is a CIE */ - if ((fde[1] & (sizeof(*fde) - 1)) - || fde[1] > (unsigned long)(fde + 1) - - (unsigned long)table->address) - continue; /* this is not a valid FDE */ - cie = fde + 1 - fde[1] / sizeof(*fde); - if (*cie <= sizeof(*cie) + 4 - || *cie >= fde[1] - sizeof(*fde) - || (*cie & (sizeof(*cie) - 1)) - || cie[1] - || (ptrType = fde_pointer_type(cie)) < 0) { - cie = NULL; /* this is not a (valid) CIE */ - continue; + const u8 *hdr = table->header; + unsigned long tableSize; + + smp_rmb(); + if (hdr && hdr[0] == 1) { + switch(hdr[3] & DW_EH_PE_FORM) { + case DW_EH_PE_native: tableSize = sizeof(unsigned long); break; + case DW_EH_PE_data2: tableSize = 2; break; + case DW_EH_PE_data4: tableSize = 4; break; + case DW_EH_PE_data8: tableSize = 8; break; + default: tableSize = 0; break; + } + ptr = hdr + 4; + end = hdr + table->hdrsz; + if (tableSize + && read_pointer(&ptr, end, hdr[1]) + == (unsigned long)table->address + && (i = read_pointer(&ptr, end, hdr[2])) > 0 + && i == (end - ptr) / (2 * tableSize) + && !((end - ptr) % (2 * tableSize))) { + do { + const u8 *cur = ptr + (i / 2) * (2 * tableSize); + + startLoc = read_pointer(&cur, + cur + tableSize, + hdr[3]); + if (pc < startLoc) + i /= 2; + else { + ptr = cur - tableSize; + i = (i + 1) / 2; + } + } while (startLoc && i > 1); + if (i == 1 + && (startLoc = read_pointer(&ptr, + ptr + tableSize, + hdr[3])) != 0 + && pc >= startLoc) + fde = (void *)read_pointer(&ptr, + ptr + tableSize, + hdr[3]); } + } + + if (fde != NULL) { + cie = cie_for_fde(fde, table); ptr = (const u8 *)(fde + 2); - startLoc = read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType); - endLoc = startLoc - + read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType & DW_EH_PE_indirect - ? 
ptrType - : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); - if (pc >= startLoc && pc < endLoc) - break; - cie = NULL; + if(cie != NULL + && cie != &bad_cie + && cie != ¬_fde + && (ptrType = fde_pointer_type(cie)) >= 0 + && read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType) == startLoc) { + if (!(ptrType & DW_EH_PE_indirect)) + ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if(pc >= endLoc) + fde = NULL; + } else + fde = NULL; + } + if (fde == NULL) { + for (fde = table->address, tableSize = table->size; + cie = NULL, tableSize > sizeof(*fde) + && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, + fde += 1 + *fde / sizeof(*fde)) { + cie = cie_for_fde(fde, table); + if (cie == &bad_cie) { + cie = NULL; + break; + } + if (cie == NULL + || cie == ¬_fde + || (ptrType = fde_pointer_type(cie)) < 0) + continue; + ptr = (const u8 *)(fde + 2); + startLoc = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if (!startLoc) + continue; + if (!(ptrType & DW_EH_PE_indirect)) + ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if (pc >= startLoc && pc < endLoc) + break; + } } } if (cie != NULL) { -- cgit v1.2.3 From 1d4d262769cd1894a0306b9c57e72f005cd9e75a Mon Sep 17 00:00:00 2001 From: Jan Dittmer Date: Sat, 28 Oct 2006 10:38:38 -0700 Subject: [PATCH] Add missing space in module.c for taintskernel Obvious fix. Signed-off-by: Jan Dittmer Acked-by: Florin Malita Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 67009bd56c52..5072a943fe35 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1342,7 +1342,7 @@ static void set_license(struct module *mod, const char *license) if (!license_is_gpl_compatible(license)) { if (!(tainted & TAINT_PROPRIETARY_MODULE)) - printk(KERN_WARNING "%s: module license '%s' taints" + printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE); } -- cgit v1.2.3 From 5fa3839a64203b2ab727dcb37da9b2d7079fca28 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Sat, 28 Oct 2006 10:38:46 -0700 Subject: [PATCH] Constify compat_get_bitmap argument This means we can call it when the bitmap we want to fetch is declared const. Signed-off-by: Stephen Rothwell Cc: Christoph Lameter Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 75573e5d27b0..d4898aad6cfa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -678,7 +678,7 @@ int get_compat_sigevent(struct sigevent *event, ? -EFAULT : 0; } -long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, +long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size) { int i, j; -- cgit v1.2.3 From fca178c0c6e8d52a1875be36b070f30884ebfae9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:49 -0700 Subject: [PATCH] fill_tgid: fix task_struct leak and possible oops 1. fill_tgid() forgets to do put_task_struct(first). 2. release_task(first) can happen after fill_tgid() drops tasklist_lock, it is unsafe to dereference first->signal. 
This is a temporary fix, imho the locking should be reworked. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 5d6a8c54ee85..9aeee511a463 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -237,14 +237,17 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, } else get_task_struct(first); - /* Start with stats from dead tasks */ - spin_lock_irqsave(&first->signal->stats_lock, flags); - if (first->signal->stats) - memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->signal->stats_lock, flags); tsk = first; read_lock(&tasklist_lock); + /* Start with stats from dead tasks */ + if (first->signal) { + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + spin_unlock_irqrestore(&first->signal->stats_lock, flags); + } + do { if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) continue; @@ -264,7 +267,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, * Accounting subsytems can also add calls here to modify * fields of taskstats. */ - + put_task_struct(first); return 0; } -- cgit v1.2.3 From 05d5bcd60e8202e5c7b28cf61186043a4d612623 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:50 -0700 Subject: [PATCH] bacct_add_tsk: fix unsafe and wrong parent/group_leader dereference 1. ts = timespec_sub(uptime, current->group_leader->start_time); It is possible that current != tsk. Probably it was supposed to be 'tsk->group_leader->start_time. But why we are reading group_leader's start_time ? This accounting is per thread, not per procees, I changed this to 'tsk->start_time. Please corect me. 2. stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; tsk->parent never == NULL, and it is unsafe to dereference it. Both the task and it's parent may exit after the caller unlocks tasklist_lock, the memory could be unmapped (DEBUG_SLAB). (And we should use ->real_parent->tgid in fact). Q: I don't understand the 'if (thread_group_leader(tsk))' check. Why it is needed ? Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Acked-by: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index db443221ba5b..65a5036a3d95 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -36,7 +36,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) /* calculate task elapsed time in timespec */ do_posix_clock_monotonic_gettime(&uptime); - ts = timespec_sub(uptime, current->group_leader->start_time); + ts = timespec_sub(uptime, tsk->start_time); /* rebase elapsed time to usec */ ac_etime = timespec_to_ns(&ts); do_div(ac_etime, NSEC_PER_USEC); @@ -58,7 +58,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_uid = tsk->uid; stats->ac_gid = tsk->gid; stats->ac_pid = tsk->pid; - stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; + rcu_read_lock(); + stats->ac_ppid = pid_alive(tsk) ? 
+ rcu_dereference(tsk->real_parent)->tgid : 0; + rcu_read_unlock(); stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; stats->ac_minflt = tsk->min_flt; -- cgit v1.2.3 From 093a8e8aecd77b2799934996a55a6838e1e2b8f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:51 -0700 Subject: [PATCH] taskstats_tgid_free: fix usage taskstats_tgid_free() is called on copy_process's error path. This is wrong. IF (clone_flags & CLONE_THREAD) We should not clear ->signal->taskstats, current uses it, it probably has a valid accumulated info. ELSE taskstats_tgid_init() set ->signal->taskstats = NULL, there is nothing to free. Move the callsite to __exit_signal(). We don't need any locking, entire thread group is exiting, nobody should have a reference to soon to be released ->signal. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + kernel/fork.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f250a5e3e281..06de6c4e8ca3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) flush_sigqueue(&tsk->pending); if (sig) { flush_sigqueue(&sig->shared_pending); + taskstats_tgid_free(sig); __cleanup_signal(sig); } } diff --git a/kernel/fork.c b/kernel/fork.c index 29ebb30850ed..213326609bac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -897,7 +897,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); - taskstats_tgid_free(sig); kmem_cache_free(signal_cachep, sig); } -- cgit v1.2.3 From b8534d7bd89df0cd41cd47bcd6733a05ea9a691a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:53 -0700 Subject: [PATCH] taskstats: kill ->taskstats_lock in favor of ->siglock signal_struct is (mostly) protected by ->sighand->siglock, I think we don't need ->taskstats_lock to protect ->stats. This also allows us to simplify the locking in fill_tgid(). 
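The access pattern this leaves behind is small enough to sketch: ->signal->stats is only touched under ->sighand->siglock, taken via lock_task_sighand() (used verbatim by the fill_tgid() rework in the next patch), which also fails gracefully once the task has passed __exit_signal():

	unsigned long flags;

	/* Sketch of the siglock-protected read that replaces ->stats_lock. */
	if (lock_task_sighand(tsk, &flags)) {
		if (tsk->signal->stats)
			memcpy(stats, tsk->signal->stats, sizeof(*stats));
		unlock_task_sighand(tsk, &flags);
	}
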
Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/taskstats.c | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 213326609bac..3da978eec791 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -830,7 +830,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); - taskstats_tgid_alloc(current->signal); + taskstats_tgid_alloc(current); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 9aeee511a463..b2efda94615a 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -241,11 +241,11 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, tsk = first; read_lock(&tasklist_lock); /* Start with stats from dead tasks */ - if (first->signal) { - spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->sighand) { + spin_lock_irqsave(&first->sighand->siglock, flags); if (first->signal->stats) memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->signal->stats_lock, flags); + spin_unlock_irqrestore(&first->sighand->siglock, flags); } do { @@ -276,7 +276,7 @@ static void fill_tgid_exit(struct task_struct *tsk) { unsigned long flags; - spin_lock_irqsave(&tsk->signal->stats_lock, flags); + spin_lock_irqsave(&tsk->sighand->siglock, flags); if (!tsk->signal->stats) goto ret; @@ -288,7 +288,7 @@ static void fill_tgid_exit(struct task_struct *tsk) */ delayacct_add_tsk(tsk->signal->stats, tsk); ret: - spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); return; } @@ -464,15 +464,10 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, size_t size; int is_thread_group; struct nlattr *na; - unsigned long flags; if (!family_registered || !tidstats) return; - spin_lock_irqsave(&tsk->signal->stats_lock, flags); - is_thread_group = tsk->signal->stats ? 1 : 0; - spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); - rc = 0; /* * Size includes space for nested attributes @@ -480,6 +475,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + is_thread_group = (tsk->signal->stats != NULL); if (is_thread_group) size = 2 * size; /* PID + STATS + TGID + STATS */ -- cgit v1.2.3 From a98b6094261c0112e9c455c96995972181bff049 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:54 -0700 Subject: [PATCH] taskstats: don't use tasklist_lock Remove tasklist_lock from taskstats.c. find_task_by_pid() is rcu-safe. ->siglock allows us to traverse subthread without tasklist. Q: delay accounting looks wrong to me. If sub-thread has already called taskstats_exit_send() but didn't call release_task(self) yet it will be accounted twice. The window is big. No? 
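For clarity, the RCU-based lookup that replaces the tasklist_lock usage in fill_pid() boils down to: walk the pid hash under rcu_read_lock(), and pin the task with get_task_struct() before leaving the read-side critical section. A sketch of that shape:

	struct task_struct *tsk;

	rcu_read_lock();
	tsk = find_task_by_pid(pid);
	if (tsk)
		get_task_struct(tsk);	/* pin before dropping the RCU read lock */
	rcu_read_unlock();
	if (!tsk)
		return -ESRCH;
	/* ... fill stats from tsk ... */
	put_task_struct(tsk);
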
Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 59 ++++++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b2efda94615a..b724aeea5443 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -174,21 +174,19 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) up_write(&listeners->sem); } -static int fill_pid(pid_t pid, struct task_struct *pidtsk, +static int fill_pid(pid_t pid, struct task_struct *tsk, struct taskstats *stats) { int rc = 0; - struct task_struct *tsk = pidtsk; - if (!pidtsk) { - read_lock(&tasklist_lock); + if (!tsk) { + rcu_read_lock(); tsk = find_task_by_pid(pid); - if (!tsk) { - read_unlock(&tasklist_lock); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) return -ESRCH; - } - get_task_struct(tsk); - read_unlock(&tasklist_lock); } else get_task_struct(tsk); @@ -214,40 +212,28 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, } -static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, +static int fill_tgid(pid_t tgid, struct task_struct *first, struct taskstats *stats) { - struct task_struct *tsk, *first; + struct task_struct *tsk; unsigned long flags; + int rc = -ESRCH; /* * Add additional stats from live tasks except zombie thread group * leaders who are already counted with the dead tasks */ - first = tgidtsk; - if (!first) { - read_lock(&tasklist_lock); + rcu_read_lock(); + if (!first) first = find_task_by_pid(tgid); - if (!first) { - read_unlock(&tasklist_lock); - return -ESRCH; - } - get_task_struct(first); - read_unlock(&tasklist_lock); - } else - get_task_struct(first); + if (!first || !lock_task_sighand(first, &flags)) + goto out; - tsk = first; - read_lock(&tasklist_lock); - /* Start with stats from dead tasks */ - if (first->sighand) { - spin_lock_irqsave(&first->sighand->siglock, flags); - if (first->signal->stats) - memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->sighand->siglock, flags); - } + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + tsk = first; do { if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) continue; @@ -260,15 +246,18 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, delayacct_add_tsk(stats, tsk); } while_each_thread(first, tsk); - read_unlock(&tasklist_lock); - stats->version = TASKSTATS_VERSION; + unlock_task_sighand(first, &flags); + rc = 0; +out: + rcu_read_unlock(); + + stats->version = TASKSTATS_VERSION; /* * Accounting subsytems can also add calls here to modify * fields of taskstats. */ - put_task_struct(first); - return 0; + return rc; } -- cgit v1.2.3 From d7c3f5f231c60d7e6ada5770b536df2b3ec1bd08 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:54 -0700 Subject: [PATCH] fill_tgid: cleanup delays accounting fill_tgid() should skip not only an already exited group leader. If the task has ->exit_state != 0 it already did exit_notify(), so it also did fill_tgid_exit()->delayacct_add_tsk(->signal->stats) and we should skip it to avoid a double accounting. This patch doesn't close the race completely, but it cleanups the code. 
Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b724aeea5443..8adfb8069c6d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -235,7 +235,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, tsk = first; do { - if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + if (tsk->exit_state) continue; /* * Accounting subsystem can call its functions here to -- cgit v1.2.3 From bb1d860551c4307b1a7ee9a21b120319075e987e Mon Sep 17 00:00:00 2001 From: Jim Houston Date: Sat, 28 Oct 2006 10:38:56 -0700 Subject: [PATCH] time_adjust cleared before use I notice that the code which implements adjtime clears the time_adjust value before using it. The attached patch makes the obvious fix. Acked-by: Roman Zippel Signed-off-by: Jim Houston Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 47195fa0ec4f..3afeaa3a73f9 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -161,9 +161,9 @@ void second_overflow(void) time_adjust += MAX_TICKADJ; tick_length -= MAX_TICKADJ_SCALED; } else { - time_adjust = 0; tick_length += (s64)(time_adjust * NSEC_PER_USEC / HZ) << TICK_LENGTH_SHIFT; + time_adjust = 0; } } } -- cgit v1.2.3 From 8fa1d7d3b2c51594c0f3aa151983dd51f605e07d Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Sat, 28 Oct 2006 10:38:57 -0700 Subject: [PATCH] cpu-hotplug: release `workqueue_mutex' properly on CPU hot-remove _cpu_down() acquires `workqueue_mutex' on its process, but doen't release it if __cpu_disable() fails. Signed-off-by: Satoru Takeuchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 27dd3ee47099..663c920b2234 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -150,18 +150,18 @@ static int _cpu_down(unsigned int cpu) p = __stop_machine_run(take_cpu_down, NULL, cpu); mutex_unlock(&cpu_bitmask_lock); - if (IS_ERR(p)) { + if (IS_ERR(p) || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, (void *)(long)cpu) == NOTIFY_BAD) BUG(); - err = PTR_ERR(p); - goto out_allowed; - } - - if (cpu_online(cpu)) + if (IS_ERR(p)) { + err = PTR_ERR(p); + goto out_allowed; + } goto out_thread; + } /* Wait for it to sleep (leaving idle task). */ while (!idle_cpu(cpu)) -- cgit v1.2.3 From 057647fc47b3a5fbcfa997041db3f483d506603c Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sat, 28 Oct 2006 10:38:58 -0700 Subject: [PATCH] workqueue: update kerneldoc This patch (as812) changes the kerneldoc comments explaining the return values from queue_work(), queue_delayed_work(), and queue_delayed_work_on(). The updated comments explain more accurately the meaning of the return code and avoid suggesting that a 0 value means the routine was unsuccessful. 
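A short usage sketch of the clarified convention (the workqueue and work item names are illustrative): a zero return is not an error, it only means the work item was already pending:

	if (!queue_work(my_wq, &my_work))
		printk(KERN_DEBUG "my_work is already pending, not re-queued\n");
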
Signed-off-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3df9bfc7ff78..17c2f03d2c27 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -99,7 +99,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, * @wq: workqueue to use * @work: work to queue * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. @@ -138,7 +138,7 @@ static void delayed_work_timer_fn(unsigned long __data) * @work: work to queue * @delay: number of jiffies to wait before queueing * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. */ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) @@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work); * @work: work to queue * @delay: number of jiffies to wait before queueing * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. */ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) -- cgit v1.2.3 From d46a3d0d07ba539aea5b0e1ad30e568f0cb03576 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Oct 2006 16:45:58 +0300 Subject: [PATCH] taskstats: fix sk_buff leak 'return genlmsg_cancel()' in taskstats_user_cmd/taskstats_exit_send potentially leaks a skb. Unless we pass 'rep_skb' to the netlink layer we own sk_buff. This means we should always do kfree_skb() on failure. [ Thomas acked and pointed out missing return value in original version ] Signed-off-by: Oleg Nesterov Acked-by: Thomas Graf Cc: Andrew Morton Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8adfb8069c6d..f3c3e9d43d2c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -411,7 +411,7 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) return send_reply(rep_skb, info->snd_pid); nla_put_failure: - return genlmsg_cancel(rep_skb, reply); + rc = genlmsg_cancel(rep_skb, reply); err: nlmsg_free(rep_skb); return rc; @@ -507,7 +507,6 @@ send: nla_put_failure: genlmsg_cancel(rep_skb, reply); - goto ret; err_skb: nlmsg_free(rep_skb); ret: -- cgit v1.2.3 From 3d8334def5cf831d2ed438aae021696a2faa4ddd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Oct 2006 18:57:16 +0300 Subject: [PATCH] taskstats: fix sk_buff size calculation prepare_reply() adds GENL_HDRLEN to the payload (genlmsg_total_size()), but then it does genlmsg_put()->nlmsg_put(). This means we forget to reserve a room for 'struct nlmsghdr'. 
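The layering behind the size fix, spelled out: the allocation must cover the payload (netlink attributes), the generic-netlink header written by genlmsg_put(), and the struct nlmsghdr written by nlmsg_put() underneath it. With nlmsg_new() here taking a raw buffer size, that gives:

	/*
	 *   payload (attributes)            size
	 * + genlmsghdr (+ alignment)        genlmsg_total_size(size)
	 * + nlmsghdr   (+ alignment)        nlmsg_total_size(genlmsg_total_size(size))
	 */
	size = nlmsg_total_size(genlmsg_total_size(size));
	skb = nlmsg_new(size, GFP_KERNEL);
	if (!skb)
		return -ENOMEM;
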
Signed-off-by: Oleg Nesterov Cc: Thomas Graf Cc: Andrew Morton Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f3c3e9d43d2c..2039585ec5e1 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -77,7 +77,8 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); + size = nlmsg_total_size(genlmsg_total_size(size)); + skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return -ENOMEM; -- cgit v1.2.3 From 0e2d57fc6e7dabdbfdd4f26c861e7e6c75d5bdcf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 29 Oct 2006 22:46:34 -0800 Subject: [PATCH] ndiswrapper: don't set the module->taints flags For ndiswrapper, don't set the module->taints flags, just set the kernel global tainted flag. This should allow ndiswrapper to continue to use GPL symbols. Signed-off-by: Randy Dunlap Cc: Florin Malita Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5072a943fe35..f0166563c602 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1718,7 +1718,7 @@ static struct module *load_module(void __user *umod, set_license(mod, get_modinfo(sechdrs, infoindex, "license")); if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint(TAINT_PROPRIETARY_MODULE); if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); -- cgit v1.2.3 From f0ec1aaf54caddd21c259aea8b2ecfbde4ee4fb9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Oct 2006 22:46:43 -0800 Subject: [PATCH] xacct_add_tsk: fix pure theoretical ->mm use-after-free Paranoid fix. The task can free its ->mm after the 'if (p->mm)' check. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 65a5036a3d95..96f77013d3f0 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -80,13 +80,17 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) */ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { + struct mm_struct *mm; + /* convert pages-jiffies to Mbyte-usec */ stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; - if (p->mm) { + mm = get_task_mm(p); + if (mm) { /* adjust to KB unit */ - stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB; - stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB; + stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; + stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + mmput(mm); } stats->read_char = p->rchar; stats->write_char = p->wchar; -- cgit v1.2.3 From 4a279ff1ea1cf325775ada983035123fcdc8e986 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Oct 2006 22:07:15 -0800 Subject: [PATCH] taskstats: fix sub-threads accounting If there are no listeners, taskstats_exit_send() just returns because taskstats_exit_alloc() didn't allocate *tidstats. 
This is wrong, each sub-thread should do fill_tgid_exit() on exit, otherwise its ->delays is not recorded in ->signal->stats and lost. Q: We don't send TASKSTATS_TYPE_AGGR_TGID when single-threaded process exits. Is it good? How can the listener figure out that it was actually a process exit, not sub-thread? Signed-off-by: Oleg Nesterov Cc: Balbir Singh Acked-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 2039585ec5e1..f45c5e70773c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -455,10 +455,9 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, int is_thread_group; struct nlattr *na; - if (!family_registered || !tidstats) + if (!family_registered) return; - rc = 0; /* * Size includes space for nested attributes */ @@ -466,8 +465,15 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); is_thread_group = (tsk->signal->stats != NULL); - if (is_thread_group) - size = 2 * size; /* PID + STATS + TGID + STATS */ + if (is_thread_group) { + /* PID + STATS + TGID + STATS */ + size = 2 * size; + /* fill the tsk->signal->stats structure */ + fill_tgid_exit(tsk); + } + + if (!tidstats) + return; rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); if (rc < 0) @@ -487,11 +493,8 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, goto send; /* - * tsk has/had a thread group so fill the tsk->signal->stats structure * Doesn't matter if tsk is the leader or the last group member leaving */ - - fill_tgid_exit(tsk); if (!group_dead) goto send; -- cgit v1.2.3 From f46c483357c2d87606bbefb511321e3efd4baae0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 2 Nov 2006 22:07:16 -0800 Subject: [PATCH] Add printk_timed_ratelimit() printk_ratelimit() has global state which makes it not useful for callers which wish to perform ratelimiting at a particular frequency. Add a printk_timed_ratelimit() which utilises caller-provided state storage to permit more flexibility. This function can in fact be used for things other than printk ratelimiting and is perhaps poorly named. Cc: Ulrich Drepper Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f7d427ef5038..66426552fbfe 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -1101,3 +1102,23 @@ int printk_ratelimit(void) printk_ratelimit_burst); } EXPORT_SYMBOL(printk_ratelimit); + +/** + * printk_timed_ratelimit - caller-controlled printk ratelimiting + * @caller_jiffies: pointer to caller's state + * @interval_msecs: minimum interval between prints + * + * printk_timed_ratelimit() returns true if more than @interval_msecs + * milliseconds have elapsed since the last time printk_timed_ratelimit() + * returned true. 
+ */ +bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msecs) +{ + if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { + *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); + return true; + } + return false; +} +EXPORT_SYMBOL(printk_timed_ratelimit); -- cgit v1.2.3 From 19c6b6ed3f597a583f58e3fc99256cc01ae8c394 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 2 Nov 2006 22:07:17 -0800 Subject: [PATCH] schedule removal of FUTEX_FD Apparently FUTEX_FD is unfixably racy and nothing uses it (or if it does, it shouldn't). Add a warning printk, give any remaining users six months to migrate off it. Cc: Ulrich Drepper Cc: Ingo Molnar Acked-by: Thomas Gleixner Cc: Rusty Russell Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b364e0026191..93ef30ba209f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1507,6 +1507,13 @@ static int futex_fd(u32 __user *uaddr, int signal) struct futex_q *q; struct file *filp; int ret, err; + static unsigned long printk_interval; + + if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { + printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " + "will be removed from the kernel in June 2007\n", + current->comm); + } ret = -EINVAL; if (!valid_signal(signal)) -- cgit v1.2.3 From b918f6e62cd46774f9fc0a3fbba6bd10ad85ee14 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 2 Nov 2006 22:07:19 -0800 Subject: [PATCH] swsusp: debugging Add a swsusp debugging mode. This does everything that's needed for a suspend except for actually suspending. So we can look in the log messages and work out a) what code is being slow and b) which drivers are misbehaving. (1) # echo testproc > /sys/power/disk # echo disk > /sys/power/state This should turn off the non-boot CPU, freeze all processes, wait for 5 seconds and then thaw the processes and the CPU. (2) # echo test > /sys/power/disk # echo disk > /sys/power/state This should turn off the non-boot CPU, freeze all processes, shrink memory, suspend all devices, wait for 5 seconds, resume the devices etc. Cc: Pavel Machek Cc: Stefan Seyfried Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d3a158a60312..b1fb7866b0b3 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -71,7 +71,7 @@ static inline void platform_finish(void) static int prepare_processes(void) { - int error; + int error = 0; pm_prepare_console(); @@ -84,6 +84,12 @@ static int prepare_processes(void) goto thaw; } + if (pm_disk_mode == PM_DISK_TESTPROC) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto thaw; + } + /* Free memory before shutting down devices. 
*/ if (!(error = swsusp_shrink_memory())) return 0; @@ -120,13 +126,21 @@ int pm_suspend_disk(void) if (error) return error; + if (pm_disk_mode == PM_DISK_TESTPROC) + goto Thaw; + suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) { resume_console(); printk("Some devices failed to suspend\n"); - unprepare_processes(); - return error; + goto Thaw; + } + + if (pm_disk_mode == PM_DISK_TEST) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto Done; } pr_debug("PM: snapshotting memory.\n"); @@ -143,16 +157,17 @@ int pm_suspend_disk(void) power_down(pm_disk_mode); else { swsusp_free(); - unprepare_processes(); - return error; + goto Thaw; } - } else + } else { pr_debug("PM: Image restored successfully.\n"); + } swsusp_free(); Done: device_resume(); resume_console(); + Thaw: unprepare_processes(); return error; } @@ -249,6 +264,8 @@ static const char * const pm_disk_modes[] = { [PM_DISK_PLATFORM] = "platform", [PM_DISK_SHUTDOWN] = "shutdown", [PM_DISK_REBOOT] = "reboot", + [PM_DISK_TEST] = "test", + [PM_DISK_TESTPROC] = "testproc", }; /** @@ -303,17 +320,19 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) } } if (mode) { - if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) + if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT || + mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) { pm_disk_mode = mode; - else { + } else { if (pm_ops && pm_ops->enter && (mode == pm_ops->pm_disk_mode)) pm_disk_mode = mode; else error = -EINVAL; } - } else + } else { error = -EINVAL; + } pr_debug("PM: suspend-to-disk mode set to '%s'\n", pm_disk_modes[mode]); -- cgit v1.2.3 From 3fd593979802f81ff6452596ac61e3840f917589 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 2 Nov 2006 22:07:24 -0800 Subject: [PATCH] Create compat_sys_migrate_pages This is needed on bigendian 64bit architectures. Signed-off-by: Stephen Rothwell Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 33 +++++++++++++++++++++++++++++++++ kernel/sys_ni.c | 1 + 2 files changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d4898aad6cfa..6952dd057300 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, } return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } + +asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, + compat_ulong_t maxnode, + const compat_ulong_t __user *old_nodes, + const compat_ulong_t __user *new_nodes) +{ + unsigned long __user *old = NULL; + unsigned long __user *new = NULL; + nodemask_t tmp_mask; + unsigned long nr_bits; + unsigned long size; + + nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); + size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (old_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) + return -EFAULT; + old = compat_alloc_user_space(new_nodes ? 
size * 2 : size); + if (new_nodes) + new = old + size / sizeof(unsigned long); + if (copy_to_user(old, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + if (new_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) + return -EFAULT; + if (new == NULL) + new = compat_alloc_user_space(size); + if (copy_to_user(new, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + return sys_migrate_pages(pid, nr_bits + 1, old, new); +} #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0e53314b14de..d7306d0f3dfc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,7 @@ cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); cond_syscall(compat_sys_move_pages); +cond_syscall(compat_sys_migrate_pages); /* block-layer dependent */ cond_syscall(sys_bdflush); -- cgit v1.2.3 From 45c18b0bb579b5c1b89f8c99f1b6ffa4c586ba08 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 4 Nov 2006 10:06:02 -0800 Subject: Fix unlikely (but possible) race condition on task->user access There's a possible race condition when doing a "switch_uid()" from one user to another, which could race with another thread doing a signal allocation and looking at the old thread ->user pointer as it is freed. This explains an oops reported by Lukasz Trabinski: http://permalink.gmane.org/gmane.linux.kernel/462241 We fix this by delaying the (reference-counted) freeing of the user structure until the thread signal handler lock has been released, so that we know that the signal allocation has either seen the new value or has properly incremented the reference count of the old one. Race identified by Oleg Nesterov. Cc: Lukasz Trabinski Cc: Oleg Nesterov Cc: Andrew Morton Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/user.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 6408c0424291..220e586127a0 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user) atomic_dec(&old_user->processes); switch_uid_keyring(new_user); current->user = new_user; + + /* + * We need to synchronize with __sigqueue_alloc() + * doing a get_uid(p->user).. If that saw the old + * user value, we need to wait until it has exited + * its critical region before we can free the old + * structure. + */ + smp_mb(); + spin_unlock_wait(¤t->sighand->siglock); + free_uid(old_user); suid_keys(current); } -- cgit v1.2.3 From 10b1fbdb0a0ca91847a534ad26d0bc250c25b74f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 4 Nov 2006 13:03:00 -0800 Subject: Make sure "user->sigpending" count is in sync The previous commit (45c18b0bb579b5c1b89f8c99f1b6ffa4c586ba08, aka "Fix unlikely (but possible) race condition on task->user access") fixed a potential oops due to __sigqueue_alloc() getting its "user" pointer out of sync with switch_user(), and accessing a user pointer that had been de-allocated on another CPU. It still left another (much less serious) problem, where a concurrent __sigqueue_alloc and swich_user could cause sigqueue_alloc to do signal pending reference counting for a _different_ user than the one it then actually ended up using. No oops, but we'd end up with the wrong signal accounting. Another case of Oleg's eagle-eyes picking up the problem. This is trivially fixed by just making sure we load whichever "user" structure we decide to use (it doesn't matter _which_ one we pick, we just need to pick one) just once. 
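The "load it once" pattern can be shown with a short userspace C sketch; struct account, struct task and charge_and_release() are hypothetical, and the compiler barrier stands in for the kernel's barrier():

#include <stdio.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

struct account {
        int pending;
};

struct task {
        struct account *acct;     /* may be switched by another thread */
};

static void charge_and_release(struct task *t)
{
        /* Snapshot the pointer once and use that snapshot for both the
         * increment and the decrement, so they cannot hit different
         * objects if t->acct is switched in between. */
        struct account *a = t->acct;
        barrier();

        a->pending++;
        /* ... work charged against the same snapshot ... */
        a->pending--;
}

int main(void)
{
        struct account acct = { 0 };
        struct task t = { &acct };

        charge_and_release(&t);
        printf("pending after: %d\n", acct.pending);
        return 0;
}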
Acked-by: Oleg Nesterov Cc: Andrew Morton Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7ed8d5304bec..df18c167a2a7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -267,18 +267,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; + struct user_struct *user; - atomic_inc(&t->user->sigpending); + /* + * In order to avoid problems with "switch_user()", we want to make + * sure that the compiler doesn't re-load "t->user" + */ + user = t->user; + barrier(); + atomic_inc(&user->sigpending); if (override_rlimit || - atomic_read(&t->user->sigpending) <= + atomic_read(&user->sigpending) <= t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) q = kmem_cache_alloc(sigqueue_cachep, flags); if (unlikely(q == NULL)) { - atomic_dec(&t->user->sigpending); + atomic_dec(&user->sigpending); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->user = get_uid(t->user); + q->user = get_uid(user); } return(q); } -- cgit v1.2.3 From 4b96b1a10cb00c867103b21f0f2a6c91b705db11 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Sun, 5 Nov 2006 23:52:04 -0800 Subject: [PATCH] Fix the spurious unlock_cpu_hotplug false warnings Cpu-hotplug locking has a minor race case caused because of setting the variable "recursive" to NULL *after* releasing the cpu_bitmask_lock in the function unlock_cpu_hotplug,instead of doing so before releasing the cpu_bitmask_lock. This was the cause of most of the recent false spurious lock_cpu_unlock warnings. This should fix the problem reported by Martin Lorenz reported in http://lkml.org/lkml/2006/10/29/127. Thanks to Srinivasa DS for pointing it out. Signed-off-by: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 663c920b2234..272254f20d97 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void) recursive_depth--; return; } - mutex_unlock(&cpu_bitmask_lock); recursive = NULL; + mutex_unlock(&cpu_bitmask_lock); } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); -- cgit v1.2.3 From 64efade11cddc4237c1b95ea4ca18af122a7e19e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 5 Nov 2006 23:52:10 -0800 Subject: [PATCH] lockdep: fix delayacct locking bug Make the delayacct lock irqsave; this avoids the possible deadlock where an interrupt is taken while holding the delayacct lock which needs to take the delayacct lock. 
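A userspace analogue of the irqsave change, with a blocked signal standing in for disabled interrupts: the "interrupt" is masked before the lock is taken, so a handler that wants the same lock can never fire while it is held. delay_lock and add_sample() are illustrative names, not kernel code:

/* build: cc -pthread example.c */
#include <pthread.h>
#include <signal.h>
#include <stdio.h>

static pthread_mutex_t delay_lock = PTHREAD_MUTEX_INITIALIZER;
static long total_ns;

static void add_sample(long ns)
{
        sigset_t block, old;

        /* Mask SIGALRM (our stand-in for an interrupt) before locking,
         * so nothing that also wants delay_lock can preempt us here. */
        sigemptyset(&block);
        sigaddset(&block, SIGALRM);
        pthread_sigmask(SIG_BLOCK, &block, &old);

        pthread_mutex_lock(&delay_lock);
        total_ns += ns;
        pthread_mutex_unlock(&delay_lock);

        pthread_sigmask(SIG_SETMASK, &old, NULL);   /* the "irqrestore" step */
}

int main(void)
{
        add_sample(1000);
        add_sample(2500);
        printf("total delay: %ld ns\n", total_ns);
        return 0;
}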
Signed-off-by: Peter Zijlstra Acked-by: Oleg Nesterov Cc: Balbir Singh Cc: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 36752f124c6a..66a0ea48751d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -66,6 +66,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end, { struct timespec ts; s64 ns; + unsigned long flags; do_posix_clock_monotonic_gettime(end); ts = timespec_sub(*end, *start); @@ -73,10 +74,10 @@ static void delayacct_end(struct timespec *start, struct timespec *end, if (ns < 0) return; - spin_lock(¤t->delays->lock); + spin_lock_irqsave(¤t->delays->lock, flags); *total += ns; (*count)++; - spin_unlock(¤t->delays->lock); + spin_unlock_irqrestore(¤t->delays->lock, flags); } void __delayacct_blkio_start(void) @@ -104,6 +105,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) s64 tmp; struct timespec ts; unsigned long t1,t2,t3; + unsigned long flags; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -136,14 +138,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ - spin_lock(&tsk->delays->lock); + spin_lock_irqsave(&tsk->delays->lock, flags); tmp = d->blkio_delay_total + tsk->delays->blkio_delay; d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; - spin_unlock(&tsk->delays->lock); + spin_unlock_irqrestore(&tsk->delays->lock, flags); done: return 0; @@ -152,11 +154,12 @@ done: __u64 __delayacct_blkio_ticks(struct task_struct *tsk) { __u64 ret; + unsigned long flags; - spin_lock(&tsk->delays->lock); + spin_lock_irqsave(&tsk->delays->lock, flags); ret = nsec_to_clock_t(tsk->delays->blkio_delay + tsk->delays->swapin_delay); - spin_unlock(&tsk->delays->lock); + spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } -- cgit v1.2.3 From 0e009be8a0c2309f3696df70f72ef0075aa34c9c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 5 Nov 2006 23:52:11 -0800 Subject: [PATCH] Improve the removed sysctl warnings Don't warn about libpthread's access to kernel.version. When it receives -ENOSYS it will read /proc/sys/kernel/version. If anything else shows up print the sysctl number string. Signed-off-by: Eric W. 
Biederman Cc: Cal Peake Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8bff2c18fb5a..0c8e805bbd6f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2680,13 +2680,33 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, asmlinkage long sys_sysctl(struct __sysctl_args __user *args) { static int msg_count; + struct __sysctl_args tmp; + int name[CTL_MAXNAME]; + int i; + + /* Read in the sysctl name for better debug message logging */ + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME) + return -ENOTDIR; + for (i = 0; i < tmp.nlen; i++) + if (get_user(name[i], tmp.name + i)) + return -EFAULT; + + /* Ignore accesses to kernel.version */ + if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION)) + goto out; if (msg_count < 5) { msg_count++; printk(KERN_INFO "warning: process `%s' used the removed sysctl " - "system call\n", current->comm); + "system call with ", current->comm); + for (i = 0; i < tmp.nlen; i++) + printk("%d.", name[i]); + printk("\n"); } +out: return -ENOSYS; } -- cgit v1.2.3 From d99f160ac53e51090f015a8f0617cea25f81a191 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 5 Nov 2006 23:52:12 -0800 Subject: [PATCH] sysctl: allow a zero ctl_name in the middle of a sysctl table Since it is becoming clear that there are just enough users of the binary sysctl interface that completely removing the binary interface from the kernel will not be an option for foreseeable future, we need to find a way to address the sysctl maintenance issues. The basic problem is that sysctl requires one central authority to allocate sysctl numbers, or else conflicts and ABI breakage occur. The proc interface to sysctl does not have that problem, as names are not densely allocated. By not terminating a sysctl table until I have neither a ctl_name nor a procname, it becomes simple to add sysctl entries that don't show up in the binary sysctl interface. Which allows people to avoid allocating a binary sysctl value when not needed. I have audited the kernel code and in my reading I have not found a single sysctl table that wasn't terminated by a completely zero filled entry. So this change in behavior should not affect anything. I think this mechanism eases the pain enough that combined with a little disciple we can solve the reoccurring sysctl ABI breakage. Signed-off-by: Eric W. Biederman Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0c8e805bbd6f..09e569f4792b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1315,7 +1315,9 @@ repeat: return -ENOTDIR; if (get_user(n, name)) return -EFAULT; - for ( ; table->ctl_name; table++) { + for ( ; table->ctl_name || table->procname; table++) { + if (!table->ctl_name) + continue; if (n == table->ctl_name || table->ctl_name == CTL_ANY) { int error; if (table->child) { @@ -1532,7 +1534,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, int len; mode_t mode; - for (; table->ctl_name; table++) { + for (; table->ctl_name || table->procname; table++) { /* Can't do anything without a proc name. 
*/ if (!table->procname) continue; @@ -1579,7 +1581,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) { struct proc_dir_entry *de; - for (; table->ctl_name; table++) { + for (; table->ctl_name || table->procname; table++) { if (!(de = table->de)) continue; if (de->mode & S_IFDIR) { -- cgit v1.2.3 From 0130b0b32ee53dc7add773fcea984f6a26ef1da3 Mon Sep 17 00:00:00 2001 From: Sharyathi Nagesh Date: Fri, 10 Nov 2006 12:27:54 -0800 Subject: [PATCH] fix Data Acess error in dup_fd On running the Stress Test on machine for more than 72 hours following error message was observed. 0:mon> e cpu 0x0: Vector: 300 (Data Access) at [c00000007ce2f7f0] pc: c000000000060d90: .dup_fd+0x240/0x39c lr: c000000000060d6c: .dup_fd+0x21c/0x39c sp: c00000007ce2fa70 msr: 800000000000b032 dar: ffffffff00000028 dsisr: 40000000 current = 0xc000000074950980 paca = 0xc000000000454500 pid = 27330, comm = bash 0:mon> t [c00000007ce2fa70] c000000000060d28 .dup_fd+0x1d8/0x39c (unreliable) [c00000007ce2fb30] c000000000060f48 .copy_files+0x5c/0x88 [c00000007ce2fbd0] c000000000061f5c .copy_process+0x574/0x1520 [c00000007ce2fcd0] c000000000062f88 .do_fork+0x80/0x1c4 [c00000007ce2fdc0] c000000000011790 .sys_clone+0x5c/0x74 [c00000007ce2fe30] c000000000008950 .ppc_clone+0x8/0xc The problem is because of race window. When if(expand) block is executed in dup_fd unlocking of oldf->file_lock give a window for fdtable in oldf to be modified. So actual open_files in oldf may not match with open_files variable. Cc: Vadim Lobanov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3da978eec791..4b4eab2a3161 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -687,6 +687,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) * the latest pointer. */ spin_lock(&oldf->file_lock); + open_files = count_open_files(old_fdt); old_fdt = files_fdtable(oldf); } -- cgit v1.2.3 From f72fa707604c015a6625e80f269506032d5430dc Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Fri, 10 Nov 2006 12:27:56 -0800 Subject: [PATCH] Fix misrouted interrupts deadlocks While testing kernel on machine with "irqpoll" option I've caught such a lockup: __do_IRQ() spin_lock(&desc->lock); desc->chip->ack(); /* IRQ is ACKed */ note_interrupt() misrouted_irq() handle_IRQ_event() if (...) local_irq_enable_in_hardirq(); /* interrupts are enabled from now */ ... __do_IRQ() /* same IRQ we've started from */ spin_lock(&desc->lock); /* LOCKUP */ Looking at misrouted_irq() code I've found that a potential deadlock like this can also take place: 1CPU: __do_IRQ() spin_lock(&desc->lock); /* irq = A */ misrouted_irq() for (i = 1; i < NR_IRQS; i++) { spin_lock(&desc->lock); /* irq = B */ if (desc->status & IRQ_INPROGRESS) { 2CPU: __do_IRQ() spin_lock(&desc->lock); /* irq = B */ misrouted_irq() for (i = 1; i < NR_IRQS; i++) { spin_lock(&desc->lock); /* irq = A */ if (desc->status & IRQ_INPROGRESS) { As the second lock on both CPUs is taken before checking that this irq is being handled in another processor this may cause a deadlock. This issue is only theoretical. I propose the attached patch to fix booth problems: when trying to handle misrouted IRQ active desc->lock may be unlocked. 
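The shape of the fix, sketched in userspace C with pthread mutexes instead of spinlocks (all names hypothetical): the caller's own descriptor lock is dropped before the loop that takes every other descriptor's lock, so two CPUs scanning each other's descriptors no longer wait on each other:

/* build: cc -pthread example.c */
#include <pthread.h>
#include <stdio.h>

#define NDESC 4

struct desc {
        pthread_mutex_t lock;
        int poked;
};

static struct desc descs[NDESC];

static void scan_other_descs(int self)
{
        for (int i = 0; i < NDESC; i++) {
                if (i == self)
                        continue;
                pthread_mutex_lock(&descs[i].lock);
                descs[i].poked++;          /* stand-in for "try the handler" */
                pthread_mutex_unlock(&descs[i].lock);
        }
}

static void handle_unhandled_irq(int self)
{
        pthread_mutex_lock(&descs[self].lock);
        /* ... per-descriptor bookkeeping under our own lock ... */
        pthread_mutex_unlock(&descs[self].lock);   /* drop it before the scan */

        scan_other_descs(self);                    /* safe: no lock held here */
}

int main(void)
{
        for (int i = 0; i < NDESC; i++)
                pthread_mutex_init(&descs[i].lock, NULL);

        handle_unhandled_irq(1);
        printf("descriptor 0 was poked %d time(s)\n", descs[0].poked);
        return 0;
}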
Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/spurious.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 543ea2e5ad93..9c7e2e4c1fe7 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -147,7 +147,11 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (unlikely(irqfixup)) { /* Don't punish working computers */ if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { - int ok = misrouted_irq(irq); + int ok; + + spin_unlock(&desc->lock); + ok = misrouted_irq(irq); + spin_lock(&desc->lock); if (action_ret == IRQ_NONE) desc->irqs_unhandled -= ok; } -- cgit v1.2.3 From 8b126b77536186eef69d408eb7959ce7f558f251 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 14 Nov 2006 02:03:23 -0800 Subject: [PATCH] setup_irq(): better mismatch debugging When we get a mismatch between handlers on the same IRQ, all we get is "IRQ handler type mismatch for IRQ n". Let's print the name of the presently-registered handler with which we got the mismatch. Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6879202afe9a..b385878c6e80 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -216,6 +216,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) { struct irq_desc *desc = irq_desc + irq; struct irqaction *old, **p; + const char *old_name = NULL; unsigned long flags; int shared = 0; @@ -255,8 +256,10 @@ int setup_irq(unsigned int irq, struct irqaction *new) * set the trigger type must match. */ if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { + old_name = old->name; goto mismatch; + } #if defined(CONFIG_IRQ_PER_CPU) /* All handlers must agree on per-cpuness */ @@ -322,11 +325,13 @@ int setup_irq(unsigned int irq, struct irqaction *new) return 0; mismatch: - spin_unlock_irqrestore(&desc->lock, flags); if (!(new->flags & IRQF_PROBE_SHARED)) { printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); + if (old_name) + printk(KERN_ERR "current handler: %s\n", old_name); dump_stack(); } + spin_unlock_irqrestore(&desc->lock, flags); return -EBUSY; } -- cgit v1.2.3 From 9a3a04ac386f44175b6a4142eaeab3d4170a57f3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Nov 2006 15:20:51 -0800 Subject: Revert "[PATCH] fix Data Acess error in dup_fd" This reverts commit 0130b0b32ee53dc7add773fcea984f6a26ef1da3. Sergey Vlasov points out (and Vadim Lobanov concurs) that the bug it was supposed to fix must be some unrelated memory corruption, and the "fix" actually causes more problems: "However, the new code does not look safe in all cases. If some other task has opened more files while dup_fd() released oldf->file_lock, the new code will update open_files to the new larger value. But newf was allocated with the old smaller value of open_files, therefore subsequent accesses to newf may try to write into unallocated memory." so revert it. 
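The sizing hazard described in the quoted analysis reduces to a few lines of illustrative userspace C (struct table, alloc_table() and copy_table() are hypothetical): once the destination has been sized from an older snapshot, the copy must stay bounded by that size, not by whatever the source has grown to since:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct table {
        size_t capacity;
        int *slots;
};

static struct table *alloc_table(size_t capacity)
{
        struct table *t = malloc(sizeof(*t));

        if (!t)
                exit(1);
        t->capacity = capacity;
        t->slots = calloc(capacity, sizeof(int));
        if (!t->slots)
                exit(1);
        return t;
}

static void copy_table(struct table *dst, const struct table *src, size_t count)
{
        /* Clamp to what dst was allocated for; copying the source's newer,
         * larger count here would be the out-of-bounds write the revert avoids. */
        if (count > dst->capacity)
                count = dst->capacity;
        memcpy(dst->slots, src->slots, count * sizeof(int));
}

int main(void)
{
        struct table *src = alloc_table(8);
        struct table *dst = alloc_table(4);   /* sized from an older snapshot */

        copy_table(dst, src, src->capacity);
        printf("copied at most %zu entries\n", dst->capacity);

        free(src->slots); free(src);
        free(dst->slots); free(dst);
        return 0;
}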
Cc: Sharyathi Nagesh Cc: Sergey Vlasov Cc: Vadim Lobanov Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4b4eab2a3161..3da978eec791 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -687,7 +687,6 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) * the latest pointer. */ spin_lock(&oldf->file_lock); - open_files = count_open_files(old_fdt); old_fdt = files_fdtable(oldf); } -- cgit v1.2.3 From b86432b42eba5671969a9e6483ee219674b7ee25 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Thu, 16 Nov 2006 01:19:10 -0800 Subject: [PATCH] some irq_chip variables point to NULL I got an oops when booting 2.6.19-rc5-mm1 on my ia64 machine. Below is the log. Oops 11012296146944 [1] Modules linked in: binfmt_misc dm_mirror dm_multipath dm_mod thermal processor f an container button sg eepro100 e100 mii Pid: 0, CPU 0, comm: swapper psr : 0000121008022038 ifs : 800000000000040b ip : [] Not tainted ip is at __do_IRQ+0x371/0x3e0 unat: 0000000000000000 pfs : 000000000000040b rsc : 0000000000000003 rnat: 656960155aa56aa5 bsps: a00000010058b890 pr : 656960155aa55a65 ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c0270033f csd : 0000000000000000 ssd : 0000000000000000 b0 : a0000001000e1390 b6 : a0000001005beac0 b7 : e00000007f01aa00 f6 : 000000000000000000000 f7 : 0ffe69090000000000000 f8 : 1000a9090000000000000 f9 : 0ffff8000000000000000 f10 : 1000a908ffffff6f70000 f11 : 1003e0000000000000909 r1 : a000000100fbbff0 r2 : 0000000000010002 r3 : 0000000000010001 r8 : fffffffffffbffff r9 : a000000100bd8060 r10 : a000000100dd83b8 r11 : fffffffffffeffff r12 : a000000100bcbbb0 r13 : a000000100bc4000 r14 : 0000000000010000 r15 : 0000000000010000 r16 : a000000100c01aa8 r17 : a000000100d2c350 r18 : 0000000000000000 r19 : a000000100d2c300 r20 : a000000100c01a88 r21 : 0000000080010100 r22 : a000000100c01ac0 r23 : a0000001000108e0 r24 : e000000477980004 r25 : 0000000000000000 r26 : 0000000000000000 r27 : e00000000913400c r28 : e0000004799ee51c r29 : e0000004778b87f0 r30 : a000000100d2c300 r31 : a00000010005c7e0 Call Trace: [] show_stack+0x40/0xa0 sp=a000000100bcb760 bsp=a000000100bc4f40 [] show_regs+0x840/0x880 sp=a000000100bcb930 bsp=a000000100bc4ee8 [] die+0x250/0x320 sp=a000000100bcb930 bsp=a000000100bc4ea0 [] ia64_do_page_fault+0x8d0/0xa20 sp=a000000100bcb950 bsp=a000000100bc4e50 [] ia64_leave_kernel+0x0/0x290 sp=a000000100bcb9e0 bsp=a000000100bc4e50 [] __do_IRQ+0x370/0x3e0 sp=a000000100bcbbb0 bsp=a000000100bc4df0 [] ia64_handle_irq+0x170/0x220 sp=a000000100bcbbb0 bsp=a000000100bc4dc0 [] ia64_leave_kernel+0x0/0x290 sp=a000000100bcbbb0 bsp=a000000100bc4dc0 [] ia64_pal_call_static+0x90/0xc0 sp=a000000100bcbd80 bsp=a000000100bc4d78 [] default_idle+0x90/0x160 sp=a000000100bcbd80 bsp=a000000100bc4d58 [] cpu_idle+0x1f0/0x440 sp=a000000100bcbe20 bsp=a000000100bc4d18 [] rest_init+0xc0/0xe0 sp=a000000100bcbe20 bsp=a000000100bc4d00 [] start_kernel+0x6a0/0x6c0 sp=a000000100bcbe20 bsp=a000000100bc4ca0 [] __end_ivt_text+0x6d0/0x6f0 sp=a000000100bcbe30 bsp=a000000100bc4c00 <0>Kernel panic - not syncing: Aiee, killing interrupt handler! The root cause is that some irq_chip variables, especially ia64_msi_chip, initiate their memeber end to point to NULL. __do_IRQ doesn't check if irq_chip->end is null and just calls it after processing the interrupt. 
As irq_chip->end is called at many places, so I fix it by reinitiating irq_chip->end to dummy_irq_chip.end, e.g., a noop function. Signed-off-by: Zhang Yanmin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2d0dc3efe813..ebfd24a41858 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -233,6 +233,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->shutdown = chip->disable; if (!chip->name) chip->name = chip->typename; + if (!chip->end) + chip->end = dummy_irq_chip.end; } static inline void mask_ack_irq(struct irq_desc *desc, int irq) -- cgit v1.2.3 From 1ff5683043196b9ad628a5de6bf8eeca52ee8bfd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Nov 2006 19:57:22 +0100 Subject: [PATCH] lockdep: fix static keys in module-allocated percpu areas lockdep got confused by certain locks in modules: INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. Call Trace: [] dump_trace+0xaa/0x3f2 [] show_trace+0x3a/0x60 [] dump_stack+0x15/0x17 [] __lock_acquire+0x724/0x9bb [] lock_acquire+0x4d/0x67 [] rt_spin_lock+0x3d/0x41 [] :ip_conntrack:__ip_ct_refresh_acct+0x131/0x174 [] :ip_conntrack:udp_packet+0xbf/0xcf [] :ip_conntrack:ip_conntrack_in+0x394/0x4a7 [] nf_iterate+0x41/0x7f [] nf_hook_slow+0x64/0xd5 [] ip_rcv+0x24e/0x506 [...] Steven Rostedt found the bug: static_obj() check did not take PERCPU_ENOUGH_ROOM into account, so in-module DEFINE_PER_CPU-area locks were triggering this message. Signed-off-by: Ingo Molnar Signed-off-by: Steven Rostedt Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b739be2a6dc9..c9fefdb1a7db 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1081,7 +1081,8 @@ static int static_obj(void *obj) */ for_each_possible_cpu(i) { start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); + end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM + + per_cpu_offset(i); if ((addr >= start) && (addr < end)) return 1; -- cgit v1.2.3 From b42172fc7b569a0ef2b0fa38d71382969074c0e2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 22 Nov 2006 09:32:06 -0800 Subject: Don't call "note_interrupt()" with irq descriptor lock held This reverts commit f72fa707604c015a6625e80f269506032d5430dc, and solves the problem that it tried to fix by simply making "__do_IRQ()" call the note_interrupt() function without the lock held, the way everybody else does. It should be noted that all interrupt handling code must never allow the descriptor actors to be entered "recursively" (that's why we do all the magic IRQ_PENDING stuff in the first place), so there actually is exclusion at that much higher level, even in the absense of locking. 
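The higher-level exclusion the message relies on can be sketched in userspace C (hypothetical names; a pthread mutex plays the role of desc->lock): a PENDING/INPROGRESS flag pair lets exactly one caller run the handler loop, so the handler and the bookkeeping after it are never entered recursively even though they run with the lock dropped:

/* build: cc -pthread example.c */
#include <pthread.h>
#include <stdio.h>

struct desc {
        pthread_mutex_t lock;
        int in_progress;
        int pending;
        int runs;
};

static void handler(struct desc *d)
{
        d->runs++;        /* runs with the lock dropped, like note_interrupt() */
}

static void do_irq(struct desc *d)
{
        pthread_mutex_lock(&d->lock);
        if (d->in_progress) {
                d->pending = 1;            /* the current owner will rerun it */
                pthread_mutex_unlock(&d->lock);
                return;
        }
        d->in_progress = 1;

        do {
                d->pending = 0;
                pthread_mutex_unlock(&d->lock);

                handler(d);                /* unlocked, but never re-entered */

                pthread_mutex_lock(&d->lock);
        } while (d->pending);

        d->in_progress = 0;
        pthread_mutex_unlock(&d->lock);
}

int main(void)
{
        struct desc d = { .in_progress = 0, .pending = 0, .runs = 0 };

        pthread_mutex_init(&d.lock, NULL);
        do_irq(&d);
        do_irq(&d);
        printf("handler ran %d time(s)\n", d.runs);
        return 0;
}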
Acked-by: Vivek Goyal Acked-by:Pavel Emelianov Cc: Andrew Morton Cc: Ingo Molnar Cc: Adrian Bunk Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 4 ++-- kernel/irq/spurious.c | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 42aa6f1a3f0f..a681912bc89a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -231,10 +231,10 @@ fastcall unsigned int __do_IRQ(unsigned int irq) spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); - - spin_lock(&desc->lock); if (!noirqdebug) note_interrupt(irq, desc, action_ret); + + spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) break; desc->status &= ~IRQ_PENDING; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 9c7e2e4c1fe7..543ea2e5ad93 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -147,11 +147,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (unlikely(irqfixup)) { /* Don't punish working computers */ if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { - int ok; - - spin_unlock(&desc->lock); - ok = misrouted_irq(irq); - spin_lock(&desc->lock); + int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) desc->irqs_unhandled -= ok; } -- cgit v1.2.3 From 753ca4f312a4b26940e4731b4fa5dbbbbcc77e97 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 25 Nov 2006 11:09:34 -0800 Subject: [PATCH] fix copy_process() error check The return value of copy_process() should be checked by IS_ERR(). Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3da978eec791..8cdd3e72ba55 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1315,9 +1315,8 @@ struct task_struct * __devinit fork_idle(int cpu) struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); - if (!task) - return ERR_PTR(-ENOMEM); - init_idle(task, cpu); + if (!IS_ERR(task)) + init_idle(task, cpu); return task; } -- cgit v1.2.3 From cfd3ef2346f924d6c0e82236c20fdb3a8840136a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 25 Nov 2006 11:09:37 -0800 Subject: [PATCH] lockdep: spin_lock_irqsave_nested() Introduce spin_lock_irqsave_nested(); implementation from: http://lkml.org/lkml/2006/6/1/122 Patch from: http://lkml.org/lkml/2006/9/13/258 [akpm@osdl.org: two compile fixes] Signed-off-by: Arjan van de Ven Signed-off-by: Jiri Kosina Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/spinlock.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 476c3741511b..2c6c2bf85514 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -293,6 +293,27 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_spin_lock_nested); +unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + /* + * On lockdep we dont want the hand-coded irq-enable of + * _raw_spin_lock_flags() code, because lockdep assumes + * that interrupts are not re-enabled during lock-acquire: + */ +#ifdef CONFIG_PROVE_SPIN_LOCKING + _raw_spin_lock(lock); +#else + _raw_spin_lock_flags(lock, &flags); +#endif 
+ return flags; +} + +EXPORT_SYMBOL(_spin_lock_irqsave_nested); #endif -- cgit v1.2.3 From ff0a538d8b08700df2b46f9aafc9fb2765071f0a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 28 Nov 2006 20:12:59 +0100 Subject: [PATCH] x86-64: work around gcc4 issue with -Os in Dwarf2 stack unwind This fixes a problem with gcc4 mis-compiling the stack unwind code under -Os, which resulted in 'stuck' messages whenever an assembly routine was encountered. (The second hunk is trivial cleanup.) Signed-off-by: Jan Beulich --- kernel/unwind.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index f7e50d16dbf6..ed0a21d4a902 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -938,8 +938,11 @@ int unwind(struct unwind_frame_info *frame) else { retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); /* skip augmentation */ - if (((const char *)(cie + 2))[1] == 'z') - ptr += get_uleb128(&ptr, end); + if (((const char *)(cie + 2))[1] == 'z') { + uleb128_t augSize = get_uleb128(&ptr, end); + + ptr += augSize; + } if (ptr > end || retAddrReg >= ARRAY_SIZE(reg_info) || REG_INVALID(retAddrReg) @@ -963,9 +966,7 @@ int unwind(struct unwind_frame_info *frame) if (cie == NULL || fde == NULL) { #ifdef CONFIG_FRAME_POINTER unsigned long top, bottom; -#endif -#ifdef CONFIG_FRAME_POINTER top = STACK_TOP(frame->task); bottom = STACK_BOTTOM(frame->task); # if FRAME_RETADDR_OFFSET < 0 -- cgit v1.2.3 From 3cce4856ff3dfa663b1a168dab48120d70820da6 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 28 Nov 2006 12:29:43 -0800 Subject: [PATCH] fix create_write_pipe() error check The return value of create_write_pipe()/create_read_pipe() should be checked by IS_ERR(). Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index bb4e29d924e4..2b76dee28496 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -307,14 +307,14 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, return 0; f = create_write_pipe(); - if (!f) - return -ENOMEM; + if (IS_ERR(f)) + return PTR_ERR(f); *filp = f; f = create_read_pipe(f); - if (!f) { + if (IS_ERR(f)) { free_write_pipe(*filp); - return -ENOMEM; + return PTR_ERR(f); } sub_info.stdin = f; -- cgit v1.2.3 From e17e0f51aeea4e59c7e450a1c0f26605b91c1260 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 24 Nov 2006 12:15:25 +0100 Subject: Driver core: show drivers in /sys/module/ Show the drivers, which belong to the module: $ ls -l /sys/module/usbcore/drivers/ hub -> ../../../bus/usb/drivers/hub usb -> ../../../bus/usb/drivers/usb usbfs -> ../../../bus/usb/drivers/usbfs Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f0166563c602..45e01cb60101 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1086,22 +1086,35 @@ static int mod_sysfs_setup(struct module *mod, goto out; kobj_set_kset_s(&mod->mkobj, module_subsys); mod->mkobj.mod = mod; - err = kobject_register(&mod->mkobj.kobj); + + /* delay uevent until full sysfs population */ + kobject_init(&mod->mkobj.kobj); + err = kobject_add(&mod->mkobj.kobj); if (err) goto out; + mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers"); + if (!mod->drivers_dir) + goto 
out_unreg; + err = module_param_sysfs_setup(mod, kparam, num_params); if (err) - goto out_unreg; + goto out_unreg_drivers; err = module_add_modinfo_attrs(mod); if (err) - goto out_unreg; + goto out_unreg_param; + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; +out_unreg_drivers: + kobject_unregister(mod->drivers_dir); +out_unreg_param: + module_param_sysfs_remove(mod); out_unreg: - kobject_unregister(&mod->mkobj.kobj); + kobject_del(&mod->mkobj.kobj); + kobject_put(&mod->mkobj.kobj); out: return err; } @@ -1110,6 +1123,7 @@ static void mod_kobject_remove(struct module *mod) { module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); + kobject_unregister(mod->drivers_dir); kobject_unregister(&mod->mkobj.kobj); } @@ -2275,11 +2289,14 @@ void print_modules(void) void module_add_driver(struct module *mod, struct device_driver *drv) { + int no_warn; + if (!mod || !drv) return; - /* Don't check return code; this call is idempotent */ - sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); + /* Don't check return codes; these calls are idempotent */ + no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); + no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name); } EXPORT_SYMBOL(module_add_driver); @@ -2288,6 +2305,8 @@ void module_remove_driver(struct device_driver *drv) if (!drv) return; sysfs_remove_link(&drv->kobj, "module"); + if (drv->owner && drv->owner->drivers_dir) + sysfs_remove_link(drv->owner->drivers_dir, drv->name); } EXPORT_SYMBOL(module_remove_driver); -- cgit v1.2.3 From 339bf98ffc6a8d8eb16fc532ac57ffbced2f8a68 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 10 Nov 2006 14:10:15 -0800 Subject: [NETLINK]: Do precise netlink message allocations where possible Account for the netlink message header size directly in nlmsg_new() instead of relying on the caller calculate it correctly. Replaces error handling of message construction functions when constructing notifications with bug traps since a failure implies a bug in calculating the size of the skb. Signed-off-by: Thomas Graf Acked-by: Paul Moore Signed-off-by: David S. Miller --- kernel/taskstats.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f45c5e70773c..4f3f0e48c845 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -77,8 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - size = nlmsg_total_size(genlmsg_total_size(size)); - skb = nlmsg_new(size, GFP_KERNEL); + skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); if (!skb) return -ENOMEM; -- cgit v1.2.3 From 3dabc7157859e706770c825aa229f8943db4e0e1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 14 Nov 2006 19:44:52 -0800 Subject: [GENL]: Add genlmsg_new() to allocate generic netlink messages Signed-off-by: Thomas Graf Acked-by: Paul Moore Signed-off-by: David S. 
Miller --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4f3f0e48c845..faa5239813ce 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -77,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); + skb = genlmsg_new(size, GFP_KERNEL); if (!skb) return -ENOMEM; -- cgit v1.2.3 From 17c157c889f4b07258af6bfec9e4e9dcf3c00178 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 14 Nov 2006 19:46:02 -0800 Subject: [GENL]: Add genlmsg_put_reply() to simplify building reply headers By modyfing genlmsg_put() to take a genl_family and by adding genlmsg_put_reply() the process of constructing the netlink and generic netlink headers is simplified. Signed-off-by: Thomas Graf Acked-by: Paul Moore Signed-off-by: David S. Miller --- kernel/taskstats.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index faa5239813ce..d3d28919d4b4 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -85,13 +85,9 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, int seq = get_cpu_var(taskstats_seqnum)++; put_cpu_var(taskstats_seqnum); - reply = genlmsg_put(skb, 0, seq, - family.id, 0, 0, - cmd, family.version); + reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); } else - reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, - family.id, 0, 0, - cmd, family.version); + reply = genlmsg_put_reply(skb, info, &family, 0, cmd); if (reply == NULL) { nlmsg_free(skb); return -EINVAL; -- cgit v1.2.3