From 0a565f7919cfb3d3df2c97d45751cbb83d858f97 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Mon, 10 Jul 2006 04:43:51 -0700 Subject: [PATCH] sched: fix bug in __migrate_task() Problem: In the function __migrate_task(), deactivate_task() followed by activate_task() is used to move the task from one run queue to another. This has two undesirable effects: 1. The task's priority is recalculated. (Nowhere else in the scheduler code is the priority recalculated for a change of CPU.) 2. The task's time stamp is set to the current time. At the very least, this makes the adjustment of the time stamp before the call to deactivate_task() redundant but I believe the problem is more serious as the time stamp now holds the time of the queue change instead of the time at which the task was woken. In addition, unless dest_rq is the same queue as "current" is on the time stamp could be inaccurate due to inter CPU drift. Solution: Replace the call to activate_task() with one to __activate_task(). Signed-off-by: Peter Williams Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 4ee400f9d56b..b47819b676fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4877,7 +4877,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); - activate_task(p, rq_dest, 0); + __activate_task(p, rq_dest); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } -- cgit v1.2.3 From 2ed6e34f88a0d896a6f889b00693cae0fadacfd0 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Mon, 10 Jul 2006 04:43:52 -0700 Subject: [PATCH] small kernel/sched.c cleanup - constify and optimize stat_nam (thanks to Michael Tokarev!) - spelling and comment fixes Signed-off-by: Andreas Mohr Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b47819b676fa..d714611f1691 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3384,7 +3384,7 @@ EXPORT_SYMBOL(schedule); #ifdef CONFIG_PREEMPT /* - * this is is the entry point to schedule() from in-kernel preemption + * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ @@ -3427,7 +3427,7 @@ need_resched: EXPORT_SYMBOL(preempt_schedule); /* - * this is is the entry point to schedule() from kernel preemption + * this is the entry point to schedule() from kernel preemption * off of irq context. * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. @@ -3439,7 +3439,7 @@ asmlinkage void __sched preempt_schedule_irq(void) struct task_struct *task = current; int saved_lock_depth; #endif - /* Catch callers which need to be fixed*/ + /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); need_resched: @@ -4650,7 +4650,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p) return list_entry(p->sibling.next,struct task_struct,sibling); } -static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; +static const char stat_nam[] = "RSDTtZX"; static void show_task(struct task_struct *p) { @@ -4658,12 +4658,9 @@ static void show_task(struct task_struct *p) unsigned long free = 0; unsigned state; - printk("%-13.13s ", p->comm); state = p->state ? __ffs(p->state) + 1 : 0; - if (state < ARRAY_SIZE(stat_nam)) - printk(stat_nam[state]); - else - printk("?"); + printk("%-13.13s %c", p->comm, + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if (BITS_PER_LONG == 32) if (state == TASK_RUNNING) printk(" running "); @@ -5776,7 +5773,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) cache = vmalloc(max_size); if (!cache) { printk("could not vmalloc %d bytes for cache!\n", 2*max_size); - return 1000000; // return 1 msec on very small boxen + return 1000000; /* return 1 msec on very small boxen */ } while (size <= max_size) { -- cgit v1.2.3 From f9829cceb686f3719215fe43c8593e5f3efe1710 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 10 Jul 2006 04:44:01 -0700 Subject: [PATCH] Minor cleanup to lockdep.c - Use printk formatting for indentation - Don't leave NTFS in the default event filter Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 44 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f32ca78c198d..42af5cc31a49 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -169,22 +169,17 @@ EXPORT_SYMBOL(lockdep_internal); */ static int class_filter(struct lock_class *class) { +#if 0 + /* Example */ if (class->name_version == 1 && - !strcmp(class->name, "&rl->lock")) + !strcmp(class->name, "lockname")) return 1; if (class->name_version == 1 && - !strcmp(class->name, "&ni->mrec_lock")) + !strcmp(class->name, "&struct->lockfield")) return 1; - if (class->name_version == 1 && - !strcmp(class->name, "mft_ni_runlist_lock")) - return 1; - if (class->name_version == 1 && - !strcmp(class->name, "mft_ni_mrec_lock")) - return 1; - if (class->name_version == 1 && - !strcmp(class->name, "&vol->lcnbmp_lock")) - return 1; - return 0; +#endif + /* Allow everything else. 0 would be filter everything else */ + return 1; } #endif @@ -408,23 +403,12 @@ static void lockdep_print_held_locks(struct task_struct *curr) print_lock(curr->held_locks + i); } } -/* - * Helper to print a nice hierarchy of lock dependencies: - */ -static void print_spaces(int nr) -{ - int i; - - for (i = 0; i < nr; i++) - printk(" "); -} static void print_lock_class_header(struct lock_class *class, int depth) { int bit; - print_spaces(depth); - printk("->"); + printk("%*s->", depth, ""); print_lock_name(class); printk(" ops: %lu", class->ops); printk(" {\n"); @@ -433,17 +417,14 @@ static void print_lock_class_header(struct lock_class *class, int depth) if (class->usage_mask & (1 << bit)) { int len = depth; - print_spaces(depth); - len += printk(" %s", usage_str[bit]); + len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(" at:\n"); print_stack_trace(class->usage_traces + bit, len); } } - print_spaces(depth); - printk(" }\n"); + printk("%*s }\n", depth, ""); - print_spaces(depth); - printk(" ... key at: "); + printk("%*s ... key at: ",depth,""); print_ip_sym((unsigned long)class->key); } @@ -463,8 +444,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth) DEBUG_LOCKS_WARN_ON(!entry->class); print_lock_dependencies(entry->class, depth + 1); - print_spaces(depth); - printk(" ... acquired at:\n"); + printk("%*s ... acquired at:\n",depth,""); print_stack_trace(&entry->trace, 2); printk("\n"); } -- cgit v1.2.3 From 55794a412fdf9af1744800e5020a4ec6b21e3cdc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 10 Jul 2006 04:44:03 -0700 Subject: [PATCH] lockdep: improve debug output Make lockdep print which lock is held, in the "kfree() of a live lock" scenario. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42af5cc31a49..c1f34addd003 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2551,7 +2551,7 @@ static inline int in_range(const void *start, const void *addr, const void *end) static void print_freed_lock_bug(struct task_struct *curr, const void *mem_from, - const void *mem_to) + const void *mem_to, struct held_lock *hlock) { if (!debug_locks_off()) return; @@ -2563,6 +2563,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, printk( "-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, curr->pid, mem_from, mem_to-1); + print_lock(hlock); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); @@ -2596,7 +2597,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) !in_range(mem_from, lock_to, mem_to)) continue; - print_freed_lock_bug(curr, mem_from, mem_to); + print_freed_lock_bug(curr, mem_from, mem_to, hlock); break; } local_irq_restore(flags); -- cgit v1.2.3 From d6d897cec29252b8d0785198cfa6ca16d30c739d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Jul 2006 04:44:04 -0700 Subject: [PATCH] lockdep: core, reduce per-lock class-cache size lockdep_map is embedded into every lock, which blows up data structure sizes all around the kernel. Reduce the class-cache to be for the default class only - that is used in 99.9% of the cases and even if we dont have a class cached, the lookup in the class-hash is lockless. This change reduces the per-lock dep_map overhead by 56 bytes on 64-bit platforms and by 28 bytes on 32-bit platforms. Signed-off-by: Ingo Molnar Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 87 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c1f34addd003..9bad17884513 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1104,7 +1104,7 @@ extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void); * itself, so actual lookup of the hash should be once per lock object. */ static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass) +look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; struct list_head *hash_head; @@ -1148,7 +1148,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) */ list_for_each_entry(class, hash_head, hash_entry) if (class->key == key) - goto out_set; + return class; + + return NULL; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +register_lock_class(struct lockdep_map *lock, unsigned int subclass) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + + class = look_up_lock_class(lock, subclass); + if (likely(class)) + return class; /* * Debug-check: all keys must be persistent! @@ -1163,6 +1182,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) return NULL; } + key = lock->key->subkeys + subclass; + hash_head = classhashentry(key); + __raw_spin_lock(&hash_lock); /* * We have to do the hash-walk again, to avoid races @@ -1209,8 +1231,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) out_unlock_set: __raw_spin_unlock(&hash_lock); -out_set: - lock->class[subclass] = class; + if (!subclass) + lock->class_cache = class; DEBUG_LOCKS_WARN_ON(class->subclass != subclass); @@ -1914,7 +1936,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, } lock->name = name; lock->key = key; - memset(lock->class, 0, sizeof(lock->class[0])*MAX_LOCKDEP_SUBCLASSES); + lock->class_cache = NULL; } EXPORT_SYMBOL_GPL(lockdep_init_map); @@ -1928,8 +1950,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, unsigned long ip) { struct task_struct *curr = current; + struct lock_class *class = NULL; struct held_lock *hlock; - struct lock_class *class; unsigned int depth, id; int chain_head = 0; u64 chain_key; @@ -1947,8 +1969,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 0; } - class = lock->class[subclass]; - /* not cached yet? */ + if (!subclass) + class = lock->class_cache; + /* + * Not cached yet or subclass? + */ if (unlikely(!class)) { class = register_lock_class(lock, subclass); if (!class) @@ -2449,48 +2474,44 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { - struct lock_class *class, *next, *entry; + struct lock_class *class, *next; struct list_head *head; unsigned long flags; int i, j; raw_local_irq_save(flags); - __raw_spin_lock(&hash_lock); /* - * Remove all classes this lock has: + * Remove all classes this lock might have: + */ + for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { + /* + * If the class exists we look it up and zap it: + */ + class = look_up_lock_class(lock, j); + if (class) + zap_class(class); + } + /* + * Debug check: in the end all mapped classes should + * be gone. */ + __raw_spin_lock(&hash_lock); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { - for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { - entry = lock->class[j]; - if (class == entry) { - zap_class(class); - lock->class[j] = NULL; - break; - } + if (unlikely(class == lock->class_cache)) { + __raw_spin_unlock(&hash_lock); + DEBUG_LOCKS_WARN_ON(1); + goto out_restore; } } } - - /* - * Debug check: in the end all mapped classes should - * be gone. - */ - for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { - entry = lock->class[j]; - if (!entry) - continue; - __raw_spin_unlock(&hash_lock); - DEBUG_LOCKS_WARN_ON(1); - raw_local_irq_restore(flags); - return; - } - __raw_spin_unlock(&hash_lock); + +out_restore: raw_local_irq_restore(flags); } -- cgit v1.2.3 From c0fc84d2e5bb4a9e3ae470812a00cccba85a48b8 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:21 -0700 Subject: [PATCH] kernel/printk.c: EXPORT_SYMBOL_UNUSED This patch marks unused exports as EXPORT_SYMBOL_UNUSED. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index bdba5d80496c..65ca0688f86f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -52,7 +52,7 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; -EXPORT_SYMBOL(console_printk); +EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */ /* * Low lever drivers may need that to know if they can schedule in @@ -773,7 +773,7 @@ int is_console_locked(void) { return console_locked; } -EXPORT_SYMBOL(is_console_locked); +EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */ /** * release_console_sem - unlock the console system -- cgit v1.2.3 From 80d6679a62fe45f440d042099d997a42e4e8c59d Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:24 -0700 Subject: [PATCH] kernel/softirq.c: EXPORT_UNUSED_SYMBOL This patch marks an unused export as EXPORT_UNUSED_SYMBOL. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 215541e26c1a..fd12f2556f0d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -311,7 +311,7 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) softirq_vec[nr].action = action; } -EXPORT_SYMBOL(open_softirq); +EXPORT_UNUSED_SYMBOL(open_softirq); /* June 2006 */ /* Tasklets */ struct tasklet_head -- cgit v1.2.3 From 06a9ec291b3aec9c7e36af0a10ad2b556bd7e84f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jul 2006 04:44:30 -0700 Subject: [PATCH] pi-futex: Validate futex type instead of oopsing Calling futex_lock_pi is called with a reference to a non PI futex and waiters exist already, lookup_pi_state() oopses due to pi_state == NULL. Check this condition and return -EINVAL to userspace. Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Cc: Jakub Jelinek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 1dc98e4dd287..cf0c8e21d1ab 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -476,6 +476,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) * the refcount and return its pi_state: */ pi_state = this->pi_state; + /* + * Userspace might have messed up non PI and PI futexes + */ + if (unlikely(!pi_state)) + return -EINVAL; + atomic_inc(&pi_state->refcount); me->pi_state = pi_state; -- cgit v1.2.3 From e154ff3d2c5ad313ef0c66e6217502361cad2799 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 10 Jul 2006 04:44:32 -0700 Subject: [PATCH] adjust clock for lost ticks A large number of lost ticks can cause an overadjustment of the clock. To compensate for this we look at the current error and the larger the error already is the more careful we are at adjusting the error. As small extra fix reset the error when the clock is set. Signed-off-by: Roman Zippel Acked-by: john stultz Cc: Uwe Bugla Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 85 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 396a3c024c2c..2a87430a58d4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -891,6 +891,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + clock->error = 0; ntp_clear(); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -1008,52 +1009,52 @@ static int __init timekeeping_init_device(void) device_initcall(timekeeping_init_device); /* - * If the error is already larger, we look ahead another tick, + * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) { - int adj; + s64 tick_error, i; + u32 look_ahead, adj; + s32 error2, mult; /* - * As soon as the machine is synchronized to the external time - * source this should be the common case. + * Use the current error value to determine how much to look ahead. + * The larger the error the slower we adjust for it to avoid problems + * with losing too many ticks, otherwise we would overadjust and + * produce an even larger error. The smaller the adjustment the + * faster we try to adjust for it, as lost ticks can do less harm + * here. This is tuned so that an error of about 1 msec is adusted + * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error >>= 2; - if (likely(sign > 0 ? error <= *interval : error >= *interval)) - return sign; + error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = abs(error2); + for (look_ahead = 0; error2 > 0; look_ahead++) + error2 >>= 2; /* - * An extra look ahead dampens the effect of the current error, - * which can grow quite large with continously late updates, as - * it would dominate the adjustment value and can lead to - * oscillation. + * Now calculate the error in (1 << look_ahead) ticks, but first + * remove the single look ahead already included in the error. */ - error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); - error -= clock->xtime_interval >> 1; - - adj = 0; - while (1) { - error >>= 1; - if (sign > 0 ? error <= *interval : error >= *interval) - break; - adj++; + tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error -= clock->xtime_interval >> 1; + error = ((error - tick_error) >> look_ahead) + tick_error; + + /* Finally calculate the adjustment shift value. */ + i = *interval; + mult = 1; + if (error < 0) { + error = -error; + *interval = -*interval; + *offset = -*offset; + mult = -1; } - - /* - * Add the current adjustments to the error and take the offset - * into account, the latter can cause the error to be hardly - * reduced at the next tick. Check the error again if there's - * room for another adjustment, thus further reducing the error - * which otherwise had to be corrected at the next update. - */ - error = (error << 1) - *interval + *offset; - if (sign > 0 ? error > *interval : error < *interval) - adj++; + for (adj = 0; error > i; adj++) + error >>= 1; *interval <<= adj; *offset <<= adj; - return sign << adj; + return mult << adj; } /* @@ -1068,11 +1069,19 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); if (error > interval) { - adj = clocksource_bigadjust(1, error, &interval, &offset); + error >>= 2; + if (likely(error <= interval)) + adj = 1; + else + adj = clocksource_bigadjust(error, &interval, &offset); } else if (error < -interval) { - interval = -interval; - offset = -offset; - adj = clocksource_bigadjust(-1, error, &interval, &offset); + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else + adj = clocksource_bigadjust(error, &interval, &offset); } else return; @@ -1129,7 +1138,7 @@ static void update_wall_time(void) clocksource_adjust(clock, offset); /* store full nanoseconds into xtime */ - xtime.tv_nsec = clock->xtime_nsec >> clock->shift; + xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; /* check to see if there is a new clocksource to use */ -- cgit v1.2.3 From 95018f7c94cbe4e78fc014b6ce52004714c06e2a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 10 Jul 2006 04:45:00 -0700 Subject: [PATCH] swsusp: do not use memcpy for snapshotting memory swsusp should not use memcpy for snapshotting memory, because on some architectures memcpy may increase preempt_count (i386 does this when CONFIG_X86_USE_3DNOW is set). Then, as a result, wrong value of preempt_count is stored in the image. Replace memcpy in copy_data_pages with an open-coded loop. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 24c96f354231..75d4886e648e 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -227,11 +227,17 @@ static void copy_data_pages(struct pbe *pblist) for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { if (saveable(zone, &zone_pfn)) { struct page *page; + long *src, *dst; + int n; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); BUG_ON(!pbe); pbe->orig_address = (unsigned long)page_address(page); - /* copy_page is not usable for copying task structs. */ - memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); + /* copy_page and memcpy are not usable for copying task structs. */ + dst = (long *)pbe->address; + src = (long *)pbe->orig_address; + for (n = PAGE_SIZE / sizeof(long); n; n--) + *dst++ = *src++; pbe = pbe->next; } } -- cgit v1.2.3 From 712f403af6682c942d8ff8bfbd54eed03643a796 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Jul 2006 04:45:00 -0700 Subject: [PATCH] swsusp warning fix kernel/power/swap.c: In function 'swsusp_write': kernel/power/swap.c:275: warning: 'start' may be used uninitialized in this function gcc isn't smart enough, so help it. Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 044b8e0c1025..a57c661c7e8a 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -263,7 +263,6 @@ int swsusp_write(void) struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; - unsigned long start; int error; if ((error = swsusp_swap_check())) { @@ -281,16 +280,17 @@ int swsusp_write(void) } error = get_swap_writer(&handle); if (!error) { - start = handle.cur_swap; + unsigned long start = handle.cur_swap; error = swap_write_page(&handle, header); - } - if (!error) - error = save_image(&handle, &snapshot, header->pages - 1); - if (!error) { - flush_swap_writer(&handle); - printk("S"); - error = mark_swapfiles(swp_entry(root_swap, start)); - printk("|\n"); + if (!error) + error = save_image(&handle, &snapshot, + header->pages - 1); + if (!error) { + flush_swap_writer(&handle); + printk("S"); + error = mark_swapfiles(swp_entry(root_swap, start)); + printk("|\n"); + } } if (error) free_all_swap_pages(root_swap, handle.bitmap); -- cgit v1.2.3 From aeceb15738958fe59cd9fe537f40317b1a3bc731 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 10 Jul 2006 04:45:01 -0700 Subject: [PATCH] swsusp: fix panic when signature can't be read Do not panic a machine when swsusp signature can't be read. Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a57c661c7e8a..f1dd146bd64d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -311,8 +311,10 @@ static atomic_t io_done = ATOMIC_INIT(0); static int end_io(struct bio *bio, unsigned int num, int err) { - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - panic("I/O error reading memory image"); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_ERR "I/O error reading swsusp image.\n"); + return -EIO; + } atomic_set(&io_done, 0); return 0; } -- cgit v1.2.3 From 21d71f513b6221f482ed6ad45e05f073ae67f319 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Jul 2006 04:45:32 -0700 Subject: [PATCH] uninline init_waitqueue_head() allyesconfig vmlinux size delta: text data bss dec filename 20736884 6073834 3075176 29885894 vmlinux.before 20721009 6073966 3075176 29870151 vmlinux.after ~18 bytes per callsite, 15K of text size (~0.1%) saved. (as an added bonus this also removes a lockdep annotation.) Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/wait.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/wait.c b/kernel/wait.c index a1d57aeb7f75..59a82f63275d 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,9 +10,13 @@ #include #include -struct lock_class_key waitqueue_lock_key; +void init_waitqueue_head(wait_queue_head_t *q) +{ + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->task_list); +} -EXPORT_SYMBOL(waitqueue_lock_key); +EXPORT_SYMBOL(init_waitqueue_head); void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { -- cgit v1.2.3 From c59923a15c12d2b3597af913bf234a0ef264a38b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Jul 2006 04:45:40 -0700 Subject: [PATCH] remove the tasklist_lock export As announced half a year ago this patch will remove the tasklist_lock export. The previous two patches got rid of the remaining modular users. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 56e4e07e45f7..926e5a68ea9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,9 +61,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; - __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ - -EXPORT_SYMBOL(tasklist_lock); +__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ int nr_processes(void) { -- cgit v1.2.3 From 2c16e9c888985761511bd1905b00fb271169c3c0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 10 Jul 2006 04:45:42 -0700 Subject: [PATCH] lockdep: disable lock debugging when kernel state becomes untrusted Disable lockdep debugging in two situations where the integrity of the kernel no longer is guaranteed: when oopsing and when hitting a tainting-condition. The goal is to not get weird lockdep traces that don't make sense or are otherwise undebuggable, to not waste time. Lockdep assumes that the previous state it knows about is valid to operate, which is why lockdep turns itself off after the first violation it reports, after that point it can no longer make that assumption. A kernel oops means that the integrity of the kernel compromised; in addition anything lockdep would report is of lesser importance than the oops. All the tainting conditions are of similar integrity-violating nature and also make debugging/diagnosing more difficult. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index ab13f0f668b5..d8a0bca21233 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -172,6 +172,7 @@ const char *print_tainted(void) void add_taint(unsigned flag) { + debug_locks_off(); /* can't trust the integrity of the kernel anymore */ tainted |= flag; } EXPORT_SYMBOL(add_taint); @@ -256,6 +257,7 @@ int oops_may_print(void) */ void oops_enter(void) { + debug_locks_off(); /* can't trust the integrity of the kernel anymore */ do_oops_enter_exit(); } -- cgit v1.2.3 From abf75a5033d4da7b8a7e92321d74021d1fcfb502 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 12 Jul 2006 13:12:00 +0200 Subject: [PATCH] Fix prctl privilege escalation and suid_dumpable (CVE-2006-2451) Based on a patch from Ernie Petrides During security research, Red Hat discovered a behavioral flaw in core dump handling. A local user could create a program that would cause a core file to be dumped into a directory they would not normally have permissions to write to. This could lead to a denial of service (disk consumption), or allow the local user to gain root privileges. The prctl() system call should never allow to set "dumpable" to the value 2. Especially not for non-privileged users. This can be split into three cases: 1) running as root -- then core dumps will already be done as root, and so prctl(PR_SET_DUMPABLE, 2) is not useful 2) running as non-root w/setuid-to-root -- this is the debatable case 3) running as non-root w/setuid-to-non-root -- then you definitely do NOT want "dumpable" to get set to 2 because you have the privilege escalation vulnerability With case #2, the only potential usefulness is for a program that has designed to run with higher privilege (than the user invoking it) that wants to be able to create root-owned root-validated core dumps. This might be useful as a debugging aid, but would only be safe if the program had done a chdir() to a safe directory. There is no benefit to a production setuid-to-root utility, because it shouldn't be dumping core in the first place. If this is true, then the same debugging aid could also be accomplished with the "suid_dumpable" sysctl. Signed-off-by: Marcel Holtmann Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index dbb3b9c7ea64..e236f98f7ec5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1983,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = current->mm->dumpable; break; case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 2) { + if (arg2 < 0 || arg2 > 1) { error = -EINVAL; break; } -- cgit v1.2.3 From 26865e9c26d2d336f385b821b531ce2b31008e20 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 30 Jun 2006 02:15:43 -0700 Subject: [PATCH] remove kernel/power/pm.c:pm_unregister_all() Remove the deprecated and no longer used pm_unregister_all(). Signed-off-by: Adrian Bunk Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- kernel/power/pm.c | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'kernel') diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 84063ac8fcfc..c50d15266c10 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type, return dev; } -static void __pm_unregister(struct pm_dev *dev) -{ - if (dev) { - list_del(&dev->entry); - kfree(dev); - } -} - -/** - * pm_unregister_all - unregister all devices with matching callback - * @callback: callback function pointer - * - * Unregister every device that would call the callback passed. This - * is primarily meant as a helper function for loadable modules. It - * enables a module to give up all its managed devices without keeping - * its own private list. - */ - -void pm_unregister_all(pm_callback callback) -{ - struct list_head *entry; - - if (!callback) - return; - - mutex_lock(&pm_devs_lock); - entry = pm_devs.next; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - entry = entry->next; - if (dev->callback == callback) - __pm_unregister(dev); - } - mutex_unlock(&pm_devs_lock); -} - /** * pm_send - send request to a single device * @dev: device to send to @@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data) } EXPORT_SYMBOL(pm_register); -EXPORT_SYMBOL(pm_unregister_all); EXPORT_SYMBOL(pm_send_all); EXPORT_SYMBOL(pm_active); -- cgit v1.2.3 From cd6ef2ada54aa4788d5a3dee3cffaad41383a52a Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 30 Jun 2006 02:15:42 -0700 Subject: [PATCH] The scheduled unexport of insert_resource Implement the scheduled unexport of insert_resource. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- kernel/resource.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 129cf046e561..0dd3a857579e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -404,8 +404,6 @@ int insert_resource(struct resource *parent, struct resource *new) return result; } -EXPORT_SYMBOL(insert_resource); - /* * Given an existing resource, change its start and size to match the * arguments. Returns -EBUSY if it can't fit. Existing children of -- cgit v1.2.3 From 098c5eea03de4707019a205140296893252b4130 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 14 Jul 2006 00:24:04 -0700 Subject: [PATCH] null-terminate over-long /proc/kallsyms symbols Got a customer bug report (https://bugzilla.novell.com/190296) about kernel symbols longer than 127 characters which end up in a string buffer that is not NULL terminated, leading to garbage in /proc/kallsyms. Using strlcpy prevents this from happening, even though such symbols still won't come out right. A better fix would be to not use a fixed-size buffer, but it's probably not worth the trouble. (Modversion'ed symbols even have a length limit of 60.) [bunk@stusta.de: build fix] Signed-off-by: Andreas Gruenbacher Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 4 ++-- kernel/module.c | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 39277dd6bf90..ab16a5a4cfe9 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter) static int get_ksymbol_mod(struct kallsym_iter *iter) { iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, - &iter->type, iter->name); + &iter->value, &iter->type, + iter->name, sizeof(iter->name)); if (iter->owner == NULL) return 0; diff --git a/kernel/module.c b/kernel/module.c index 35e1b1f859d7..2a19cd47c046 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2019,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr, return NULL; } -struct module *module_get_kallsym(unsigned int symnum, - unsigned long *value, - char *type, - char namebuf[128]) +struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, size_t namelen) { struct module *mod; @@ -2031,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum, if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; - strncpy(namebuf, - mod->strtab + mod->symtab[symnum].st_name, - 127); + strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, + namelen); mutex_unlock(&module_mutex); return mod; } -- cgit v1.2.3 From 52e92e5788139921352213fa6faf6e30ff1f2f5a Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 14 Jul 2006 00:24:05 -0700 Subject: [PATCH] remove kernel/kthread.c:kthread_stop_sem() Remove the now-unneeded kthread_stop_sem(). Signed-off-by: Adrian Bunk Acked-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 24be714b04c7..4f9c60ef95e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -215,23 +215,6 @@ EXPORT_SYMBOL(kthread_bind); * was never called. */ int kthread_stop(struct task_struct *k) -{ - return kthread_stop_sem(k, NULL); -} -EXPORT_SYMBOL(kthread_stop); - -/** - * kthread_stop_sem - stop a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * @s: semaphore that @k waits on while idle. - * - * Does essentially the same thing as kthread_stop() above, but wakes - * @k by calling up(@s). - * - * Returns the result of threadfn(), or %-EINTR if wake_up_process() - * was never called. - */ -int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; @@ -246,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) /* Now set kthread_should_stop() to true, and wake it up. */ kthread_stop_info.k = k; - if (s) - up(s); - else - wake_up_process(k); + wake_up_process(k); put_task_struct(k); /* Once it dies, reset stop ptr, gather result and we're done. */ @@ -260,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) return ret; } -EXPORT_SYMBOL(kthread_stop_sem); +EXPORT_SYMBOL(kthread_stop); static __init int helper_init(void) { -- cgit v1.2.3 From a0009652af385a42f0e0604136f772ead406c78d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 14 Jul 2006 00:24:06 -0700 Subject: [PATCH] del_timer_sync(): add cpu_relax() Relax the CPU in the del_timer_sync() busywait loop. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 2a87430a58d4..acfa557e685b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; + cpu_relax(); } } -- cgit v1.2.3 From 60198f9992db1e36d5b4cc1526ff29550f7d002c Mon Sep 17 00:00:00 2001 From: Luca Tettamanti Date: Fri, 14 Jul 2006 00:24:13 -0700 Subject: [PATCH] Add try_to_freeze() to rt-test kthreads When CONFIG_RT_MUTEX_TESTER is enabled kernel refuses to suspend the machine because it's unable to freeze the rt-test-* threads. Add try_to_freeze() after schedule() so that the threads will be freezed correctly; I've tested the patch and it lets the notebook suspends and resumes nicely. Signed-off-by: Luca Tettamanti Cc: Ingo Molnar Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rtmutex-tester.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 494dac872a13..948bd8f643e2 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -275,6 +275,7 @@ static int test_func(void *data) /* Wait for the next command to be executed */ schedule(); + try_to_freeze(); if (signal_pending(current)) flush_signals(current); -- cgit v1.2.3 From 6bc02d8412b422388f86b09ae40d762c0bc05290 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 14 Jul 2006 00:24:15 -0700 Subject: [PATCH] unexport open_softirq Christoph Hellwig: open_softirq just enables a softirq. The softirq array is statically allocated so to add a new one you would have to patch the kernel. So there's no point to keep this export at all as any user would have to patch the enum in include/linux/interrupt.h anyway. Signed-off-by: Adrian Bunk Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index fd12f2556f0d..0f08a84ae307 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -311,8 +311,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) softirq_vec[nr].action = action; } -EXPORT_UNUSED_SYMBOL(open_softirq); /* June 2006 */ - /* Tasklets */ struct tasklet_head { -- cgit v1.2.3 From 3e143475c22036847f898d7e76ba337c1d7dbf6f Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 14 Jul 2006 00:24:17 -0700 Subject: [PATCH] improve timekeeping resume robustness Resolve problems seen w/ APM suspend. Due to resume initialization ordering, its possible we could get a timer interrupt before the timekeeping resume() function is called. This patch ensures we don't do any timekeeping accounting before we're fully resumed. (akpm: fixes the machine-freezes-on-APM-resume bug) Signed-off-by: John Stultz Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index acfa557e685b..05809c2e2fd6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -969,6 +969,7 @@ void __init timekeeping_init(void) } +static int timekeeping_suspended; /* * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused @@ -984,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock, flags); /* restart the last cycle value */ clock->cycle_last = clocksource_read(clock); + clock->error = 0; + timekeeping_suspended = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); return 0; } @@ -991,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev) /* sysfs resume/suspend bits for timekeeping */ static struct sysdev_class timekeeping_sysclass = { .resume = timekeeping_resume, + .suspend = timekeeping_suspend, set_kset_name("timekeeping"), }; @@ -1101,13 +1115,16 @@ static void update_wall_time(void) { cycle_t offset; - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + return; #ifdef CONFIG_GENERIC_TIME offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; #else offset = clock->cycle_interval; #endif + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. -- cgit v1.2.3 From a4afee02a5dd4f20c08fca26e9b610e72d0bcbf0 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 14 Jul 2006 00:24:18 -0700 Subject: [PATCH] Fix sighand->siglock usage in kernel/acct.c IRQs must be disabled before taking ->siglock. Noticed by lockdep. Signed-off-by: OGAWA Hirofumi Cc: Arjan van de Ven Cc: Ingo Molnar Cc: "Eric W. Biederman" Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index f18e0b8df3e1..2a7c933651c7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -488,7 +488,7 @@ static void do_acct_process(struct file *file) old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); - spin_lock(¤t->sighand->siglock); + spin_lock_irq(¤t->sighand->siglock); ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; @@ -496,7 +496,7 @@ static void do_acct_process(struct file *file) ac.ac_minflt = encode_comp_t(pacct->ac_minflt); ac.ac_majflt = encode_comp_t(pacct->ac_majflt); ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock(¤t->sighand->siglock); + spin_unlock_irq(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_swaps = encode_comp_t(0); -- cgit v1.2.3 From 3a5f5e488ceee9e08df3dff3f01b12fafc9e7e68 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Jul 2006 00:24:27 -0700 Subject: [PATCH] lockdep: core, fix rq-lock handling on __ARCH_WANT_UNLOCKED_CTXSW On platforms that have __ARCH_WANT_UNLOCKED_CTXSW set and want to implement lock validator support there's a bug in rq->lock handling: in this case we dont 'carry over' the runqueue lock into another task - but still we did a spinlock_release() of it. Fix this by making the spinlock_release() in context_switch() dependent on !__ARCH_WANT_UNLOCKED_CTXSW. (Reported by Ralf Baechle on MIPS, which has __ARCH_WANT_UNLOCKED_CTXSW. This fixes a lockdep-internal BUG message on such platforms.) Signed-off-by: Ingo Molnar Cc: Ralf Baechle Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d714611f1691..e9a0b61f12ab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1788,7 +1788,15 @@ context_switch(struct rq *rq, struct task_struct *prev, WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#endif /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); -- cgit v1.2.3 From ca74e92b4698276b6696f15a801759f50944f387 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:36 -0700 Subject: [PATCH] per-task-delay-accounting: setup Initialization code related to collection of per-task "delay" statistics which measure how long it had to wait for cpu, sync block io, swapping etc. The collection of statistics and the interface are in other patches. This patch sets up the data structures and allows the statistics collection to be disabled through a kernel boot parameter. Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/delayacct.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/exit.c | 2 ++ kernel/fork.c | 2 ++ 4 files changed, 92 insertions(+) create mode 100644 kernel/delayacct.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 47dbcd570cd8..87bb34cc8938 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/delayacct.c b/kernel/delayacct.c new file mode 100644 index 000000000000..fbf7f2284952 --- /dev/null +++ b/kernel/delayacct.c @@ -0,0 +1,87 @@ +/* delayacct.c - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include + +int delayacct_on __read_mostly; /* Delay accounting turned on/off */ +kmem_cache_t *delayacct_cache; + +static int __init delayacct_setup_enable(char *str) +{ + delayacct_on = 1; + return 1; +} +__setup("delayacct", delayacct_setup_enable); + +void delayacct_init(void) +{ + delayacct_cache = kmem_cache_create("delayacct_cache", + sizeof(struct task_delay_info), + 0, + SLAB_PANIC, + NULL, NULL); + delayacct_tsk_init(&init_task); +} + +void __delayacct_tsk_init(struct task_struct *tsk) +{ + tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); + if (tsk->delays) + spin_lock_init(&tsk->delays->lock); +} + +void __delayacct_tsk_exit(struct task_struct *tsk) +{ + kmem_cache_free(delayacct_cache, tsk->delays); + tsk->delays = NULL; +} + +/* + * Start accounting for a delay statistic using + * its starting timestamp (@start) + */ + +static inline void delayacct_start(struct timespec *start) +{ + do_posix_clock_monotonic_gettime(start); +} + +/* + * Finish delay accounting for a statistic using + * its timestamps (@start, @end), accumalator (@total) and @count + */ + +static void delayacct_end(struct timespec *start, struct timespec *end, + u64 *total, u32 *count) +{ + struct timespec ts; + s64 ns; + + do_posix_clock_monotonic_gettime(end); + ts = timespec_sub(*end, *start); + ns = timespec_to_ns(&ts); + if (ns < 0) + return; + + spin_lock(¤t->delays->lock); + *total += ns; + (*count)++; + spin_unlock(¤t->delays->lock); +} + diff --git a/kernel/exit.c b/kernel/exit.c index 6664c084783d..3c2cf91defa7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -900,6 +901,7 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); + delayacct_tsk_exit(tsk); exit_mm(tsk); if (group_dead) diff --git a/kernel/fork.c b/kernel/fork.c index 926e5a68ea9e..451cfd35bf22 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1000,6 +1001,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_put_domain; p->did_exec = 0; + delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); p->pid = pid; retval = -EFAULT; -- cgit v1.2.3 From 0ff922452df86f3e9a2c6f705c4588ec62d096a7 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:37 -0700 Subject: [PATCH] per-task-delay-accounting: sync block I/O and swapin delay collection Unlike earlier iterations of the delay accounting patches, now delays are only collected for the actual I/O waits rather than try and cover the delays seen in I/O submission paths. Account separately for block I/O delays incurred as a result of swapin page faults whose frequency can be affected by the task/process' rss limit. Hence swapin delays can act as feedback for rss limit changes independent of I/O priority changes. Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 19 +++++++++++++++++++ kernel/sched.c | 5 +++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index fbf7f2284952..3546b0800f9f 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -85,3 +85,22 @@ static void delayacct_end(struct timespec *start, struct timespec *end, spin_unlock(¤t->delays->lock); } +void __delayacct_blkio_start(void) +{ + delayacct_start(¤t->delays->blkio_start); +} + +void __delayacct_blkio_end(void) +{ + if (current->delays->flags & DELAYACCT_PF_SWAPIN) + /* Swapin block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->swapin_delay, + ¤t->delays->swapin_count); + else /* Other block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->blkio_delay, + ¤t->delays->blkio_count); +} diff --git a/kernel/sched.c b/kernel/sched.c index e9a0b61f12ab..9d42cbfc4f8b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -4534,9 +4535,11 @@ void __sched io_schedule(void) { struct rq *rq = &__raw_get_cpu_var(runqueues); + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); schedule(); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); } EXPORT_SYMBOL(io_schedule); @@ -4545,9 +4548,11 @@ long __sched io_schedule_timeout(long timeout) struct rq *rq = &__raw_get_cpu_var(runqueues); long ret; + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); ret = schedule_timeout(timeout); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); return ret; } -- cgit v1.2.3 From 52f17b6c2bd443e7806a161e9d10a983650db01d Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 14 Jul 2006 00:24:38 -0700 Subject: [PATCH] per-task-delay-accounting: cpu delay collection via schedstats Make the task-related schedstats functions callable by delay accounting even if schedstats collection isn't turned on. This removes the dependency of delay accounting on schedstats. Signed-off-by: Chandra Seetharaman Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 71 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9d42cbfc4f8b..b44b9a43b0fc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -502,9 +502,36 @@ struct file_operations proc_schedstat_operations = { .release = single_release, }; +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta_jiffies; + rq->rq_sched_info.pcnt++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) + rq->rq_sched_info.cpu_time += delta_jiffies; +} # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) #else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{} # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) #endif @@ -524,7 +551,7 @@ static inline struct rq *this_rq_lock(void) return rq; } -#ifdef CONFIG_SCHEDSTATS +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* * Called when a process is dequeued from the active array and given * the cpu. We should note that with the exception of interactive @@ -552,21 +579,16 @@ static inline void sched_info_dequeued(struct task_struct *t) */ static void sched_info_arrive(struct task_struct *t) { - unsigned long now = jiffies, diff = 0; - struct rq *rq = task_rq(t); + unsigned long now = jiffies, delta_jiffies = 0; if (t->sched_info.last_queued) - diff = now - t->sched_info.last_queued; + delta_jiffies = now - t->sched_info.last_queued; sched_info_dequeued(t); - t->sched_info.run_delay += diff; + t->sched_info.run_delay += delta_jiffies; t->sched_info.last_arrival = now; t->sched_info.pcnt++; - if (!rq) - return; - - rq->rq_sched_info.run_delay += diff; - rq->rq_sched_info.pcnt++; + rq_sched_info_arrive(task_rq(t), delta_jiffies); } /* @@ -586,8 +608,9 @@ static void sched_info_arrive(struct task_struct *t) */ static inline void sched_info_queued(struct task_struct *t) { - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; } /* @@ -596,13 +619,10 @@ static inline void sched_info_queued(struct task_struct *t) */ static inline void sched_info_depart(struct task_struct *t) { - struct rq *rq = task_rq(t); - unsigned long diff = jiffies - t->sched_info.last_arrival; - - t->sched_info.cpu_time += diff; + unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; - if (rq) - rq->rq_sched_info.cpu_time += diff; + t->sched_info.cpu_time += delta_jiffies; + rq_sched_info_depart(task_rq(t), delta_jiffies); } /* @@ -611,7 +631,7 @@ static inline void sched_info_depart(struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct task_struct *prev, struct task_struct *next) { struct rq *rq = task_rq(prev); @@ -626,10 +646,16 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) if (next != rq->idle) sched_info_arrive(next); } +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); +} #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* * Adding/removing a task to/from a priority array: @@ -1531,8 +1557,9 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) INIT_LIST_HEAD(&p->run_list); p->array = NULL; -#ifdef CONFIG_SCHEDSTATS - memset(&p->sched_info, 0, sizeof(p->sched_info)); +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; -- cgit v1.2.3 From c757249af152c59fd74b85e52e8c090acb33d9c0 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:40 -0700 Subject: [PATCH] per-task-delay-accounting: taskstats interface Create a "taskstats" interface based on generic netlink (NETLINK_GENERIC family), for getting statistics of tasks and thread groups during their lifetime and when they exit. The interface is intended for use by multiple accounting packages though it is being created in the context of delay accounting. This patch creates the interface without populating the fields of the data that is sent to the user in response to a command or upon the exit of a task. Each accounting package interested in using taskstats has to provide an additional patch to add its stats to the common structure. [akpm@osdl.org: cleanups, Kconfig fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/exit.c | 7 ++ kernel/taskstats.c | 336 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 344 insertions(+) create mode 100644 kernel/taskstats.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 87bb34cc8938..d62ec66c1af2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o +obj-$(CONFIG_TASKSTATS) += taskstats.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/exit.c b/kernel/exit.c index 3c2cf91defa7..9852ed8c2988 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -844,6 +845,7 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; + struct taskstats *tidstats, *tgidstats; int group_dead; profile_task_exit(tsk); @@ -882,6 +884,8 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); + taskstats_exit_alloc(&tidstats, &tgidstats); + acct_update_integrals(tsk); if (tsk->mm) { update_hiwater_rss(tsk->mm); @@ -901,7 +905,10 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); + taskstats_exit_send(tsk, tidstats, tgidstats); + taskstats_exit_free(tidstats, tgidstats); delayacct_tsk_exit(tsk); + exit_mm(tsk); if (group_dead) diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 000000000000..82ec9137d908 --- /dev/null +++ b/kernel/taskstats.c @@ -0,0 +1,336 @@ +/* + * taskstats.c - Export per-task statistics to userland + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include + +static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; +static int family_registered; +kmem_cache_t *taskstats_cache; +static DEFINE_MUTEX(taskstats_exit_mutex); + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, +}; + +static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] +__read_mostly = { + [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, +}; + + +static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, + void **replyp, size_t size) +{ + struct sk_buff *skb; + void *reply; + + /* + * If new attributes are added, please revisit this allocation + */ + skb = nlmsg_new(size); + if (!skb) + return -ENOMEM; + + if (!info) { + int seq = get_cpu_var(taskstats_seqnum)++; + put_cpu_var(taskstats_seqnum); + + reply = genlmsg_put(skb, 0, seq, + family.id, 0, 0, + cmd, family.version); + } else + reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, + family.id, 0, 0, + cmd, family.version); + if (reply == NULL) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + *replyp = reply; + return 0; +} + +static int send_reply(struct sk_buff *skb, pid_t pid, int event) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *reply; + int rc; + + reply = genlmsg_data(genlhdr); + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + if (event == TASKSTATS_MSG_MULTICAST) + return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); + return genlmsg_unicast(skb, pid); +} + +static int fill_pid(pid_t pid, struct task_struct *pidtsk, + struct taskstats *stats) +{ + int rc; + struct task_struct *tsk = pidtsk; + + if (!pidtsk) { + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(tsk); + read_unlock(&tasklist_lock); + } else + get_task_struct(tsk); + + /* + * Each accounting subsystem adds calls to its functions to + * fill in relevant parts of struct taskstsats as follows + * + * rc = per-task-foo(stats, tsk); + * if (rc) + * goto err; + */ + +err: + put_task_struct(tsk); + return rc; + +} + +static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, + struct taskstats *stats) +{ + int rc; + struct task_struct *tsk, *first; + + first = tgidtsk; + read_lock(&tasklist_lock); + if (!first) { + first = find_task_by_pid(tgid); + if (!first) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + } + tsk = first; + do { + /* + * Each accounting subsystem adds calls its functions to + * fill in relevant parts of struct taskstsats as follows + * + * rc = per-task-foo(stats, tsk); + * if (rc) + * break; + */ + + } while_each_thread(first, tsk); + read_unlock(&tasklist_lock); + + /* + * Accounting subsytems can also add calls here if they don't + * wish to aggregate statistics for per-tgid stats + */ + + return rc; +} + +static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + struct sk_buff *rep_skb; + struct taskstats stats; + void *reply; + size_t size; + struct nlattr *na; + + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + memset(&stats, 0, sizeof(stats)); + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + return rc; + + if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { + u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + rc = fill_pid(pid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { + u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + rc = fill_tgid(tgid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else { + rc = -EINVAL; + goto err; + } + + nla_nest_end(rep_skb, na); + + return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); + +nla_put_failure: + return genlmsg_cancel(rep_skb, reply); +err: + nlmsg_free(rep_skb); + return rc; +} + +/* Send pid data out on exit */ +void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, + struct taskstats *tgidstats) +{ + int rc; + struct sk_buff *rep_skb; + void *reply; + size_t size; + int is_thread_group; + struct nlattr *na; + + if (!family_registered || !tidstats) + return; + + mutex_lock(&taskstats_exit_mutex); + + is_thread_group = !thread_group_empty(tsk); + rc = 0; + + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + if (is_thread_group) + size = 2 * size; /* PID + STATS + TGID + STATS */ + + rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + goto ret; + + rc = fill_pid(tsk->pid, tsk, tidstats); + if (rc < 0) + goto err_skb; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tidstats); + nla_nest_end(rep_skb, na); + + if (!is_thread_group || !tgidstats) { + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + goto ret; + } + + rc = fill_tgid(tsk->pid, tsk, tgidstats); + /* + * If fill_tgid() failed then one probable reason could be that the + * thread group leader has exited. fill_tgid() will fail, send out + * the pid statistics collected earlier. + */ + if (rc < 0) { + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + goto ret; + } + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tgidstats); + nla_nest_end(rep_skb, na); + + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + goto ret; + +nla_put_failure: + genlmsg_cancel(rep_skb, reply); + goto ret; +err_skb: + nlmsg_free(rep_skb); +ret: + mutex_unlock(&taskstats_exit_mutex); + return; +} + +static struct genl_ops taskstats_ops = { + .cmd = TASKSTATS_CMD_GET, + .doit = taskstats_send_stats, + .policy = taskstats_cmd_get_policy, +}; + +/* Needed early in initialization */ +void __init taskstats_init_early(void) +{ + taskstats_cache = kmem_cache_create("taskstats_cache", + sizeof(struct taskstats), + 0, SLAB_PANIC, NULL, NULL); +} + +static int __init taskstats_init(void) +{ + int rc; + + rc = genl_register_family(&family); + if (rc) + return rc; + + rc = genl_register_ops(&family, &taskstats_ops); + if (rc < 0) + goto err; + + family_registered = 1; + return 0; +err: + genl_unregister_family(&family); + return rc; +} + +/* + * late initcall ensures initialization of statistics collection + * mechanisms precedes initialization of the taskstats interface + */ +late_initcall(taskstats_init); -- cgit v1.2.3 From 6f44993fe1d7b2b097f6ac60cd5835c6f5ca0874 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:41 -0700 Subject: [PATCH] per-task-delay-accounting: delay accounting usage of taskstats interface Usage of taskstats interface by delay accounting. Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/taskstats.c | 16 +++++++++----- 2 files changed, 72 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 3546b0800f9f..1be274a462ca 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -41,6 +41,10 @@ void delayacct_init(void) void __delayacct_tsk_init(struct task_struct *tsk) { + spin_lock_init(&tsk->delays_lock); + /* No need to acquire tsk->delays_lock for allocation here unless + __delayacct_tsk_init called after tsk is attached to tasklist + */ tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); if (tsk->delays) spin_lock_init(&tsk->delays->lock); @@ -48,8 +52,11 @@ void __delayacct_tsk_init(struct task_struct *tsk) void __delayacct_tsk_exit(struct task_struct *tsk) { - kmem_cache_free(delayacct_cache, tsk->delays); + struct task_delay_info *delays = tsk->delays; + spin_lock(&tsk->delays_lock); tsk->delays = NULL; + spin_unlock(&tsk->delays_lock); + kmem_cache_free(delayacct_cache, delays); } /* @@ -104,3 +111,56 @@ void __delayacct_blkio_end(void) ¤t->delays->blkio_delay, ¤t->delays->blkio_count); } + +int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +{ + s64 tmp; + struct timespec ts; + unsigned long t1,t2,t3; + + spin_lock(&tsk->delays_lock); + + /* Though tsk->delays accessed later, early exit avoids + * unnecessary returning of other data + */ + if (!tsk->delays) + goto done; + + tmp = (s64)d->cpu_run_real_total; + cputime_to_timespec(tsk->utime + tsk->stime, &ts); + tmp += timespec_to_ns(&ts); + d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; + + /* + * No locking available for sched_info (and too expensive to add one) + * Mitigate by taking snapshot of values + */ + t1 = tsk->sched_info.pcnt; + t2 = tsk->sched_info.run_delay; + t3 = tsk->sched_info.cpu_time; + + d->cpu_count += t1; + + jiffies_to_timespec(t2, &ts); + tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); + d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; + + tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; + d->cpu_run_virtual_total = + (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; + + /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ + + spin_lock(&tsk->delays->lock); + tmp = d->blkio_delay_total + tsk->delays->blkio_delay; + d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; + tmp = d->swapin_delay_total + tsk->delays->swapin_delay; + d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; + d->blkio_count += tsk->delays->blkio_count; + d->swapin_count += tsk->delays->swapin_count; + spin_unlock(&tsk->delays->lock); + +done: + spin_unlock(&tsk->delays_lock); + return 0; +} diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 82ec9137d908..ea9506de3b85 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -18,13 +18,13 @@ #include #include +#include #include #include static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; kmem_cache_t *taskstats_cache; -static DEFINE_MUTEX(taskstats_exit_mutex); static struct genl_family family = { .id = GENL_ID_GENERATE, @@ -120,7 +120,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, * goto err; */ -err: + rc = delayacct_add_tsk(stats, tsk); + stats->version = TASKSTATS_VERSION; + + /* Define err: label here if needed */ put_task_struct(tsk); return rc; @@ -152,8 +155,14 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, * break; */ + rc = delayacct_add_tsk(stats, tsk); + if (rc) + break; + } while_each_thread(first, tsk); read_unlock(&tasklist_lock); + stats->version = TASKSTATS_VERSION; + /* * Accounting subsytems can also add calls here if they don't @@ -233,8 +242,6 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, if (!family_registered || !tidstats) return; - mutex_lock(&taskstats_exit_mutex); - is_thread_group = !thread_group_empty(tsk); rc = 0; @@ -292,7 +299,6 @@ nla_put_failure: err_skb: nlmsg_free(rep_skb); ret: - mutex_unlock(&taskstats_exit_mutex); return; } -- cgit v1.2.3 From 25890454667b3295f67b3372352be90705f8667c Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:43 -0700 Subject: [PATCH] per-task-delay-accounting: /proc export of aggregated block I/O delays Export I/O delays seen by a task through /proc//stats for use in top etc. Note that delays for I/O done for swapping in pages (swapin I/O) is clubbed together with all other I/O here (this is not the case in the netlink interface where the swapin I/O is kept distinct) [akpm@osdl.org: printk warning fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 1be274a462ca..f05392d64267 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -164,3 +164,15 @@ done: spin_unlock(&tsk->delays_lock); return 0; } + +__u64 __delayacct_blkio_ticks(struct task_struct *tsk) +{ + __u64 ret; + + spin_lock(&tsk->delays->lock); + ret = nsec_to_clock_t(tsk->delays->blkio_delay + + tsk->delays->swapin_delay); + spin_unlock(&tsk->delays->lock); + return ret; +} + -- cgit v1.2.3 From ad4ecbcba72855a2b5319b96e2a3a65ed1ca3bfd Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:44 -0700 Subject: [PATCH] delay accounting taskstats interface send tgid once Send per-tgid data only once during exit of a thread group instead of once with each member thread exit. Currently, when a thread exits, besides its per-tid data, the per-tgid data of its thread group is also sent out, if its thread group is non-empty. The per-tgid data sent consists of the sum of per-tid stats for all *remaining* threads of the thread group. This patch modifies this sending in two ways: - the per-tgid data is sent only when the last thread of a thread group exits. This cuts down heavily on the overhead of sending/receiving per-tgid data, especially when other exploiters of the taskstats interface aren't interested in per-tgid stats - the semantics of the per-tgid data sent are changed. Instead of being the sum of per-tid data for remaining threads, the value now sent is the true total accumalated statistics for all threads that are/were part of the thread group. The patch also addresses a minor issue where failure of one accounting subsystem to fill in the taskstats structure was causing the send of taskstats to not be sent at all. The patch has been tested for stability and run cerberus for over 4 hours on an SMP. [akpm@osdl.org: bugfixes] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++--- kernel/fork.c | 4 +++ kernel/taskstats.c | 98 ++++++++++++++++++++++++++++++++++++------------------ 3 files changed, 74 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9852ed8c2988..67c1e9a4f812 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -845,7 +845,7 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; - struct taskstats *tidstats, *tgidstats; + struct taskstats *tidstats; int group_dead; profile_task_exit(tsk); @@ -884,7 +884,7 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); - taskstats_exit_alloc(&tidstats, &tgidstats); + taskstats_exit_alloc(&tidstats); acct_update_integrals(tsk); if (tsk->mm) { @@ -905,8 +905,8 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); - taskstats_exit_send(tsk, tidstats, tgidstats); - taskstats_exit_free(tidstats, tgidstats); + taskstats_exit_send(tsk, tidstats, group_dead); + taskstats_exit_free(tidstats); delayacct_tsk_exit(tsk); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 451cfd35bf22..1b0f7b1e0881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -819,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); + taskstats_tgid_alloc(current->signal); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); @@ -863,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); + taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); @@ -884,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); + taskstats_tgid_free(sig); kmem_cache_free(signal_cachep, sig); } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ea9506de3b85..4a0a5022b299 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -132,46 +132,79 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, struct taskstats *stats) { - int rc; struct task_struct *tsk, *first; + unsigned long flags; + /* + * Add additional stats from live tasks except zombie thread group + * leaders who are already counted with the dead tasks + */ first = tgidtsk; - read_lock(&tasklist_lock); if (!first) { + read_lock(&tasklist_lock); first = find_task_by_pid(tgid); if (!first) { read_unlock(&tasklist_lock); return -ESRCH; } - } + get_task_struct(first); + read_unlock(&tasklist_lock); + } else + get_task_struct(first); + + /* Start with stats from dead tasks */ + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + spin_unlock_irqrestore(&first->signal->stats_lock, flags); + tsk = first; + read_lock(&tasklist_lock); do { + if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + continue; /* - * Each accounting subsystem adds calls its functions to + * Accounting subsystem can call its functions here to * fill in relevant parts of struct taskstsats as follows * - * rc = per-task-foo(stats, tsk); - * if (rc) - * break; + * per-task-foo(stats, tsk); */ - - rc = delayacct_add_tsk(stats, tsk); - if (rc) - break; + delayacct_add_tsk(stats, tsk); } while_each_thread(first, tsk); read_unlock(&tasklist_lock); stats->version = TASKSTATS_VERSION; - /* - * Accounting subsytems can also add calls here if they don't - * wish to aggregate statistics for per-tgid stats + * Accounting subsytems can also add calls here to modify + * fields of taskstats. */ - return rc; + return 0; +} + + +static void fill_tgid_exit(struct task_struct *tsk) +{ + unsigned long flags; + + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + if (!tsk->signal->stats) + goto ret; + + /* + * Each accounting subsystem calls its functions here to + * accumalate its per-task stats for tsk, into the per-tgid structure + * + * per-task-foo(tsk->signal->stats, tsk); + */ + delayacct_add_tsk(tsk->signal->stats, tsk); +ret: + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + return; } + static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) { int rc = 0; @@ -230,7 +263,7 @@ err: /* Send pid data out on exit */ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - struct taskstats *tgidstats) + int group_dead) { int rc; struct sk_buff *rep_skb; @@ -238,13 +271,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, size_t size; int is_thread_group; struct nlattr *na; + unsigned long flags; if (!family_registered || !tidstats) return; - is_thread_group = !thread_group_empty(tsk); - rc = 0; + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + is_thread_group = tsk->signal->stats ? 1 : 0; + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + rc = 0; /* * Size includes space for nested attributes */ @@ -268,30 +304,28 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, *tidstats); nla_nest_end(rep_skb, na); - if (!is_thread_group || !tgidstats) { - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; - } + if (!is_thread_group) + goto send; - rc = fill_tgid(tsk->pid, tsk, tgidstats); /* - * If fill_tgid() failed then one probable reason could be that the - * thread group leader has exited. fill_tgid() will fail, send out - * the pid statistics collected earlier. + * tsk has/had a thread group so fill the tsk->signal->stats structure + * Doesn't matter if tsk is the leader or the last group member leaving */ - if (rc < 0) { - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; - } + + fill_tgid_exit(tsk); + if (!group_dead) + goto send; na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + /* No locking needed for tsk->signal->stats since group is dead */ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - *tgidstats); + *tsk->signal->stats); nla_nest_end(rep_skb, na); +send: send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; + return; nla_put_failure: genlmsg_cancel(rep_skb, reply); -- cgit v1.2.3 From f9fd8914c1acca0d98b69d831b128d5b52f03c51 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:47 -0700 Subject: [PATCH] per-task delay accounting taskstats interface: control exit data through cpumasks On systems with a large number of cpus, with even a modest rate of tasks exiting per cpu, the volume of taskstats data sent on thread exit can overflow a userspace listener's buffers. One approach to avoiding overflow is to allow listeners to get data for a limited and specific set of cpus. By scaling the number of listeners and/or the cpus they monitor, userspace can handle the statistical data overload more gracefully. In this patch, each listener registers to listen to a specific set of cpus by specifying a cpumask. The interest is recorded per-cpu. When a task exits on a cpu, its taskstats data is unicast to each listener interested in that cpu. Thanks to Andrew Morton for pointing out the various scalability and general concerns of previous attempts and for suggesting this design. [akpm@osdl.org: build fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Signed-off-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 +- kernel/taskstats.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 192 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 67c1e9a4f812..dba194a8d416 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -847,6 +847,7 @@ fastcall NORET_TYPE void do_exit(long code) struct task_struct *tsk = current; struct taskstats *tidstats; int group_dead; + unsigned int mycpu; profile_task_exit(tsk); @@ -884,7 +885,7 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); - taskstats_exit_alloc(&tidstats); + taskstats_exit_alloc(&tidstats, &mycpu); acct_update_integrals(tsk); if (tsk->mm) { @@ -905,7 +906,7 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); - taskstats_exit_send(tsk, tidstats, group_dead); + taskstats_exit_send(tsk, tidstats, group_dead, mycpu); taskstats_exit_free(tidstats); delayacct_tsk_exit(tsk); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4a0a5022b299..abb59e323544 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -19,9 +19,17 @@ #include #include #include +#include +#include #include #include +/* + * Maximum length of a cpumask that can be specified in + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute + */ +#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) + static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; kmem_cache_t *taskstats_cache; @@ -37,8 +45,25 @@ static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] __read_mostly = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, + [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; + +struct listener { + struct list_head list; + pid_t pid; }; +struct listener_list { + struct rw_semaphore sem; + struct list_head list; +}; +static DEFINE_PER_CPU(struct listener_list, listener_array); + +enum actions { + REGISTER, + DEREGISTER, + CPU_DONT_CARE +}; static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, void **replyp, size_t size) @@ -74,25 +99,68 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, return 0; } -static int send_reply(struct sk_buff *skb, pid_t pid, int event) +/* + * Send taskstats data in @skb to listener with nl_pid @pid + */ +static int send_reply(struct sk_buff *skb, pid_t pid) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); - void *reply; + void *reply = genlmsg_data(genlhdr); int rc; - reply = genlmsg_data(genlhdr); - rc = genlmsg_end(skb, reply); if (rc < 0) { nlmsg_free(skb); return rc; } - if (event == TASKSTATS_MSG_MULTICAST) - return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); return genlmsg_unicast(skb, pid); } +/* + * Send taskstats data in @skb to listeners registered for @cpu's exit data + */ +static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + struct listener_list *listeners; + struct listener *s, *tmp; + struct sk_buff *skb_next, *skb_cur = skb; + void *reply = genlmsg_data(genlhdr); + int rc, ret; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + rc = 0; + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + skb_next = NULL; + if (!list_is_last(&s->list, &listeners->list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); + if (!skb_next) { + nlmsg_free(skb_cur); + rc = -ENOMEM; + break; + } + } + ret = genlmsg_unicast(skb_cur, s->pid); + if (ret == -ECONNREFUSED) { + list_del(&s->list); + kfree(s); + rc = ret; + } + skb_cur = skb_next; + } + up_write(&listeners->sem); + + return rc; +} + static int fill_pid(pid_t pid, struct task_struct *pidtsk, struct taskstats *stats) { @@ -204,8 +272,73 @@ ret: return; } +static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +{ + struct listener_list *listeners; + struct listener *s, *tmp; + unsigned int cpu; + cpumask_t mask = *maskp; -static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) + if (!cpus_subset(mask, cpu_possible_map)) + return -EINVAL; + + if (isadd == REGISTER) { + for_each_cpu_mask(cpu, mask) { + s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, + cpu_to_node(cpu)); + if (!s) + goto cleanup; + s->pid = pid; + INIT_LIST_HEAD(&s->list); + + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_add(&s->list, &listeners->list); + up_write(&listeners->sem); + } + return 0; + } + + /* Deregister or cleanup */ +cleanup: + for_each_cpu_mask(cpu, mask) { + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (s->pid == pid) { + list_del(&s->list); + kfree(s); + break; + } + } + up_write(&listeners->sem); + } + return 0; +} + +static int parse(struct nlattr *na, cpumask_t *mask) +{ + char *data; + int len; + int ret; + + if (na == NULL) + return 1; + len = nla_len(na); + if (len > TASKSTATS_CPUMASK_MAXLEN) + return -E2BIG; + if (len < 1) + return -EINVAL; + data = kmalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + nla_strlcpy(data, na, len); + ret = cpulist_parse(data, *mask); + kfree(data); + return ret; +} + +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { int rc = 0; struct sk_buff *rep_skb; @@ -213,6 +346,19 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) void *reply; size_t size; struct nlattr *na; + cpumask_t mask; + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, REGISTER); + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, DEREGISTER); /* * Size includes space for nested attributes @@ -252,7 +398,7 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) nla_nest_end(rep_skb, na); - return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); + return send_reply(rep_skb, info->snd_pid); nla_put_failure: return genlmsg_cancel(rep_skb, reply); @@ -261,9 +407,35 @@ err: return rc; } +void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) +{ + struct listener_list *listeners; + struct taskstats *tmp; + /* + * This is the cpu on which the task is exiting currently and will + * be the one for which the exit event is sent, even if the cpu + * on which this function is running changes later. + */ + *mycpu = raw_smp_processor_id(); + + *ptidstats = NULL; + tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + if (!tmp) + return; + + listeners = &per_cpu(listener_array, *mycpu); + down_read(&listeners->sem); + if (!list_empty(&listeners->list)) { + *ptidstats = tmp; + tmp = NULL; + } + up_read(&listeners->sem); + kfree(tmp); +} + /* Send pid data out on exit */ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - int group_dead) + int group_dead, unsigned int mycpu) { int rc; struct sk_buff *rep_skb; @@ -324,7 +496,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, nla_nest_end(rep_skb, na); send: - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + send_cpu_listeners(rep_skb, mycpu); return; nla_put_failure: @@ -338,16 +510,22 @@ ret: static struct genl_ops taskstats_ops = { .cmd = TASKSTATS_CMD_GET, - .doit = taskstats_send_stats, + .doit = taskstats_user_cmd, .policy = taskstats_cmd_get_policy, }; /* Needed early in initialization */ void __init taskstats_init_early(void) { + unsigned int i; + taskstats_cache = kmem_cache_create("taskstats_cache", sizeof(struct taskstats), 0, SLAB_PANIC, NULL, NULL); + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); + init_rwsem(&(per_cpu(listener_array, i).sem)); + } } static int __init taskstats_init(void) -- cgit v1.2.3 From bb129994c3bff9c5e8df91f05d7e9b6402fbd83f Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:47 -0700 Subject: [PATCH] Remove down_write() from taskstats code invoked on the exit() path In send_cpu_listeners(), which is called on the exit path, a down_write() was protecting operations like skb_clone() and genlmsg_unicast() that do GFP_KERNEL allocations. If the oom-killer decides to kill tasks to satisfy the allocations,the exit of those tasks could block on the same semphore. The down_write() was only needed to allow removal of invalid listeners from the listener list. The patch converts the down_write to a down_read and defers the removal to a separate critical region. This ensures that even if the oom-killer is called, no other task's exit is blocked as it can still acquire another down_read. Thanks to Andrew Morton & Herbert Xu for pointing out the oom related pitfalls, and to Chandra Seetharaman for suggesting this fix instead of using something more complex like RCU. Signed-off-by: Chandra Seetharaman Signed-off-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index abb59e323544..f45179ce028e 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -51,6 +51,7 @@ __read_mostly = { struct listener { struct list_head list; pid_t pid; + char valid; }; struct listener_list { @@ -127,7 +128,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); - int rc, ret; + int rc, ret, delcount = 0; rc = genlmsg_end(skb, reply); if (rc < 0) { @@ -137,7 +138,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) rc = 0; listeners = &per_cpu(listener_array, cpu); - down_write(&listeners->sem); + down_read(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { @@ -150,14 +151,26 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) } ret = genlmsg_unicast(skb_cur, s->pid); if (ret == -ECONNREFUSED) { - list_del(&s->list); - kfree(s); + s->valid = 0; + delcount++; rc = ret; } skb_cur = skb_next; } - up_write(&listeners->sem); + up_read(&listeners->sem); + + if (!delcount) + return rc; + /* Delete invalidated entries */ + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (!s->valid) { + list_del(&s->list); + kfree(s); + } + } + up_write(&listeners->sem); return rc; } @@ -290,6 +303,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) goto cleanup; s->pid = pid; INIT_LIST_HEAD(&s->list); + s->valid = 1; listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); -- cgit v1.2.3 From aa95387774039096c11803c04011f1aa42d85758 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Jul 2006 12:12:16 -0700 Subject: cpu hotplug: simplify and hopefully fix locking The CPU hotplug locking was quite messy, with a recursive lock to handle the fact that both the actual up/down sequence wanted to protect itself from being re-entered, but the callbacks that it called also tended to want to protect themselves from CPU events. This splits the lock into two (one to serialize the whole hotplug sequence, the other to protect against the CPU present bitmaps changing). The latter still allows recursive usage because some subsystems (ondemand policy for cpufreq at least) had already gotten too used to the lax locking, but the locking mistakes are hopefully now less fundamental, and we now warn about recursive lock usage when we see it, in the hope that it can be fixed. Signed-off-by: Linus Torvalds --- kernel/cpu.c | 75 +++++++++++++++++++++++++++--------------------------------- 1 file changed, 34 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 70fbf2e83766..f230f9ae01c2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -16,56 +16,48 @@ #include /* This protects CPUs going up and down... */ -static DEFINE_MUTEX(cpucontrol); +static DEFINE_MUTEX(cpu_add_remove_lock); +static DEFINE_MUTEX(cpu_bitmask_lock); static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); #ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *lock_cpu_hotplug_owner; -static int lock_cpu_hotplug_depth; -static int __lock_cpu_hotplug(int interruptible) -{ - int ret = 0; - - if (lock_cpu_hotplug_owner != current) { - if (interruptible) - ret = mutex_lock_interruptible(&cpucontrol); - else - mutex_lock(&cpucontrol); - } - - /* - * Set only if we succeed in locking - */ - if (!ret) { - lock_cpu_hotplug_depth++; - lock_cpu_hotplug_owner = current; - } - - return ret; -} +/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ +static struct task_struct *recursive; +static int recursive_depth; void lock_cpu_hotplug(void) { - __lock_cpu_hotplug(0); + struct task_struct *tsk = current; + + if (tsk == recursive) { + static int warnings = 10; + if (warnings) { + printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); + WARN_ON(1); + warnings--; + } + recursive_depth++; + return; + } + mutex_lock(&cpu_bitmask_lock); + recursive = tsk; } EXPORT_SYMBOL_GPL(lock_cpu_hotplug); void unlock_cpu_hotplug(void) { - if (--lock_cpu_hotplug_depth == 0) { - lock_cpu_hotplug_owner = NULL; - mutex_unlock(&cpucontrol); + WARN_ON(recursive != current); + if (recursive_depth) { + recursive_depth--; + return; } + mutex_unlock(&cpu_bitmask_lock); + recursive = NULL; } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); -int lock_cpu_hotplug_interruptible(void) -{ - return __lock_cpu_hotplug(1); -} -EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ @@ -122,9 +114,7 @@ int cpu_down(unsigned int cpu) struct task_struct *p; cpumask_t old_allowed, tmp; - if ((err = lock_cpu_hotplug_interruptible()) != 0) - return err; - + mutex_lock(&cpu_add_remove_lock); if (num_online_cpus() == 1) { err = -EBUSY; goto out; @@ -150,7 +140,10 @@ int cpu_down(unsigned int cpu) cpu_clear(cpu, tmp); set_cpus_allowed(current, tmp); + mutex_lock(&cpu_bitmask_lock); p = __stop_machine_run(take_cpu_down, NULL, cpu); + mutex_unlock(&cpu_bitmask_lock); + if (IS_ERR(p)) { /* CPU didn't die: tell everyone. Can't complain. */ if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, @@ -187,7 +180,7 @@ out_thread: out_allowed: set_cpus_allowed(current, old_allowed); out: - unlock_cpu_hotplug(); + mutex_unlock(&cpu_add_remove_lock); return err; } #endif /*CONFIG_HOTPLUG_CPU*/ @@ -197,9 +190,7 @@ int __devinit cpu_up(unsigned int cpu) int ret; void *hcpu = (void *)(long)cpu; - if ((ret = lock_cpu_hotplug_interruptible()) != 0) - return ret; - + mutex_lock(&cpu_add_remove_lock); if (cpu_online(cpu) || !cpu_present(cpu)) { ret = -EINVAL; goto out; @@ -214,7 +205,9 @@ int __devinit cpu_up(unsigned int cpu) } /* Arch-specific enabling code. */ + mutex_lock(&cpu_bitmask_lock); ret = __cpu_up(cpu); + mutex_unlock(&cpu_bitmask_lock); if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); @@ -227,6 +220,6 @@ out_notify: blocking_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); out: - unlock_cpu_hotplug(); + mutex_unlock(&cpu_add_remove_lock); return ret; } -- cgit v1.2.3 From abb5a5cc6bba1516403146c5b79036fe843beb70 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 23 Jul 2006 11:36:08 -0700 Subject: [PATCH] Cpuset: fix ABBA deadlock with cpu hotplug lock Fix ABBA deadlock between lock_cpu_hotplug() and the cpuset callback_mutex lock. It only happens on cpu_exclusive cpusets, due to the dynamic sched domain code trying to take the cpu hotplug lock inside the cpuset callback_mutex lock. This bug has apparently been here for several months, but didn't get hit until the right customer load on a large system. This fix appears right from inspection, but it will take a few more days running it on that customers workload to be confident we nailed it. We don't have any other reproducible test case. The cpu_hotplug_lock() tends to cover large runs of code. The other places that hold both that lock and the cpuset callback mutex lock always nest the cpuset lock inside the hotplug lock. This place tries to do the reverse, risking an ABBA deadlock. This is in the cpuset_rmdir() code, where we: * take the callback_mutex lock * mark the cpuset CS_REMOVED * call update_cpu_domains for cpu_exclusive cpusets * in that call, take the cpu_hotplug lock if the cpuset is marked for removal. Thanks to Jack Steiner for identifying this deadlock. The fix is to tear down the dynamic sched domain before we grab the cpuset callback_mutex lock. This way, the two locks are serialized, with the hotplug lock taken and released before trying for the cpuset lock. I suspect that this bug was introduced when I changed the cpuset locking from one lock to two. The dynamic sched domain dependency on cpu_exclusive cpusets and its hotplug hooks were added to this code earlier, when cpusets had only a single lock. It may well have been fine then. Signed-off-by: Paul Jackson Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c232dc077438..1a649f2bb9bb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -762,6 +762,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * * Call with manage_mutex held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. + * Must not be called holding callback_mutex, because we must + * not call lock_cpu_hotplug() while holding callback_mutex. */ static void update_cpu_domains(struct cpuset *cur) @@ -781,7 +783,7 @@ static void update_cpu_domains(struct cpuset *cur) if (is_cpu_exclusive(c)) cpus_andnot(pspan, pspan, c->cpus_allowed); } - if (is_removed(cur) || !is_cpu_exclusive(cur)) { + if (!is_cpu_exclusive(cur)) { cpus_or(pspan, pspan, cur->cpus_allowed); if (cpus_equal(pspan, cur->cpus_allowed)) return; @@ -1917,6 +1919,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); } +/* + * Locking note on the strange update_flag() call below: + * + * If the cpuset being removed is marked cpu_exclusive, then simulate + * turning cpu_exclusive off, which will call update_cpu_domains(). + * The lock_cpu_hotplug() call in update_cpu_domains() must not be + * made while holding callback_mutex. Elsewhere the kernel nests + * callback_mutex inside lock_cpu_hotplug() calls. So the reverse + * nesting would risk an ABBA deadlock. + */ + static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cpuset *cs = dentry->d_fsdata; @@ -1936,11 +1949,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_unlock(&manage_mutex); return -EBUSY; } + if (is_cpu_exclusive(cs)) { + int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0"); + if (retval < 0) { + mutex_unlock(&manage_mutex); + return retval; + } + } parent = cs->parent; mutex_lock(&callback_mutex); set_bit(CS_REMOVED, &cs->flags); - if (is_cpu_exclusive(cs)) - update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); -- cgit v1.2.3