From 0a565f7919cfb3d3df2c97d45751cbb83d858f97 Mon Sep 17 00:00:00 2001
From: Peter Williams <pwil3058@bigpond.net.au>
Date: Mon, 10 Jul 2006 04:43:51 -0700
Subject: [PATCH] sched: fix bug in __migrate_task()

Problem:

In the function __migrate_task(), deactivate_task() followed by
activate_task() is used to move the task from one run queue to
another.  This has two undesirable effects:

1. The task's priority is recalculated. (Nowhere else in the
scheduler code is the priority recalculated for a change of CPU.)

2. The task's time stamp is set to the current time.  At the very least,
this makes the adjustment of the time stamp before the call to
deactivate_task() redundant but I believe the problem is more serious
as the time stamp now holds the time of the queue change instead of
the time at which the task was woken.  In addition, unless dest_rq is
the same queue as "current" is on the time stamp could be inaccurate
due to inter CPU drift.

Solution:

Replace the call to activate_task() with one to __activate_task().

Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4ee400f9d56b..b47819b676fa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4877,7 +4877,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 		p->timestamp = p->timestamp - rq_src->timestamp_last_tick
 				+ rq_dest->timestamp_last_tick;
 		deactivate_task(p, rq_src);
-		activate_task(p, rq_dest, 0);
+		__activate_task(p, rq_dest);
 		if (TASK_PREEMPTS_CURR(p, rq_dest))
 			resched_task(rq_dest->curr);
 	}
-- 
cgit v1.2.3


From 2ed6e34f88a0d896a6f889b00693cae0fadacfd0 Mon Sep 17 00:00:00 2001
From: Andreas Mohr <andi@rhlx01.fht-esslingen.de>
Date: Mon, 10 Jul 2006 04:43:52 -0700
Subject: [PATCH] small kernel/sched.c cleanup

- constify and optimize stat_nam (thanks to Michael Tokarev!)
- spelling and comment fixes

Signed-off-by: Andreas Mohr <andi@lisas.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b47819b676fa..d714611f1691 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3384,7 +3384,7 @@ EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_PREEMPT
 /*
- * this is is the entry point to schedule() from in-kernel preemption
+ * this is the entry point to schedule() from in-kernel preemption
  * off of preempt_enable.  Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
@@ -3427,7 +3427,7 @@ need_resched:
 EXPORT_SYMBOL(preempt_schedule);
 
 /*
- * this is is the entry point to schedule() from kernel preemption
+ * this is the entry point to schedule() from kernel preemption
  * off of irq context.
  * Note, that this is called and return with irqs disabled. This will
  * protect us against recursive calling from irq.
@@ -3439,7 +3439,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
 	struct task_struct *task = current;
 	int saved_lock_depth;
 #endif
-	/* Catch callers which need to be fixed*/
+	/* Catch callers which need to be fixed */
 	BUG_ON(ti->preempt_count || !irqs_disabled());
 
 need_resched:
@@ -4650,7 +4650,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
 	return list_entry(p->sibling.next,struct task_struct,sibling);
 }
 
-static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
+static const char stat_nam[] = "RSDTtZX";
 
 static void show_task(struct task_struct *p)
 {
@@ -4658,12 +4658,9 @@ static void show_task(struct task_struct *p)
 	unsigned long free = 0;
 	unsigned state;
 
-	printk("%-13.13s ", p->comm);
 	state = p->state ? __ffs(p->state) + 1 : 0;
-	if (state < ARRAY_SIZE(stat_nam))
-		printk(stat_nam[state]);
-	else
-		printk("?");
+	printk("%-13.13s %c", p->comm,
+		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if (BITS_PER_LONG == 32)
 	if (state == TASK_RUNNING)
 		printk(" running ");
@@ -5776,7 +5773,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
 	cache = vmalloc(max_size);
 	if (!cache) {
 		printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
-		return 1000000; // return 1 msec on very small boxen
+		return 1000000; /* return 1 msec on very small boxen */
 	}
 
 	while (size <= max_size) {
-- 
cgit v1.2.3


From f9829cceb686f3719215fe43c8593e5f3efe1710 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 10 Jul 2006 04:44:01 -0700
Subject: [PATCH] Minor cleanup to lockdep.c

- Use printk formatting for indentation
- Don't leave NTFS in the default event filter

Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 44 ++++++++++++--------------------------------
 1 file changed, 12 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f32ca78c198d..42af5cc31a49 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -169,22 +169,17 @@ EXPORT_SYMBOL(lockdep_internal);
  */
 static int class_filter(struct lock_class *class)
 {
+#if 0
+	/* Example */
 	if (class->name_version == 1 &&
-			!strcmp(class->name, "&rl->lock"))
+			!strcmp(class->name, "lockname"))
 		return 1;
 	if (class->name_version == 1 &&
-			!strcmp(class->name, "&ni->mrec_lock"))
+			!strcmp(class->name, "&struct->lockfield"))
 		return 1;
-	if (class->name_version == 1 &&
-			!strcmp(class->name, "mft_ni_runlist_lock"))
-		return 1;
-	if (class->name_version == 1 &&
-			!strcmp(class->name, "mft_ni_mrec_lock"))
-		return 1;
-	if (class->name_version == 1 &&
-			!strcmp(class->name, "&vol->lcnbmp_lock"))
-		return 1;
-	return 0;
+#endif
+	/* Allow everything else. 0 would be filter everything else */
+	return 1;
 }
 #endif
 
@@ -408,23 +403,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
 		print_lock(curr->held_locks + i);
 	}
 }
-/*
- * Helper to print a nice hierarchy of lock dependencies:
- */
-static void print_spaces(int nr)
-{
-	int i;
-
-	for (i = 0; i < nr; i++)
-		printk("  ");
-}
 
 static void print_lock_class_header(struct lock_class *class, int depth)
 {
 	int bit;
 
-	print_spaces(depth);
-	printk("->");
+	printk("%*s->", depth, "");
 	print_lock_name(class);
 	printk(" ops: %lu", class->ops);
 	printk(" {\n");
@@ -433,17 +417,14 @@ static void print_lock_class_header(struct lock_class *class, int depth)
 		if (class->usage_mask & (1 << bit)) {
 			int len = depth;
 
-			print_spaces(depth);
-			len += printk("   %s", usage_str[bit]);
+			len += printk("%*s   %s", depth, "", usage_str[bit]);
 			len += printk(" at:\n");
 			print_stack_trace(class->usage_traces + bit, len);
 		}
 	}
-	print_spaces(depth);
-	printk(" }\n");
+	printk("%*s }\n", depth, "");
 
-	print_spaces(depth);
-	printk(" ... key      at: ");
+	printk("%*s ... key      at: ",depth,"");
 	print_ip_sym((unsigned long)class->key);
 }
 
@@ -463,8 +444,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
 		DEBUG_LOCKS_WARN_ON(!entry->class);
 		print_lock_dependencies(entry->class, depth + 1);
 
-		print_spaces(depth);
-		printk(" ... acquired at:\n");
+		printk("%*s ... acquired at:\n",depth,"");
 		print_stack_trace(&entry->trace, 2);
 		printk("\n");
 	}
-- 
cgit v1.2.3


From 55794a412fdf9af1744800e5020a4ec6b21e3cdc Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@Linux.intel.com>
Date: Mon, 10 Jul 2006 04:44:03 -0700
Subject: [PATCH] lockdep: improve debug output

Make lockdep print which lock is held, in the "kfree() of a live lock"
scenario.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42af5cc31a49..c1f34addd003 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2551,7 +2551,7 @@ static inline int in_range(const void *start, const void *addr, const void *end)
 
 static void
 print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
-		     const void *mem_to)
+		     const void *mem_to, struct held_lock *hlock)
 {
 	if (!debug_locks_off())
 		return;
@@ -2563,6 +2563,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
 	printk(  "-------------------------\n");
 	printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
 		curr->comm, curr->pid, mem_from, mem_to-1);
+	print_lock(hlock);
 	lockdep_print_held_locks(curr);
 
 	printk("\nstack backtrace:\n");
@@ -2596,7 +2597,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
 					!in_range(mem_from, lock_to, mem_to))
 			continue;
 
-		print_freed_lock_bug(curr, mem_from, mem_to);
+		print_freed_lock_bug(curr, mem_from, mem_to, hlock);
 		break;
 	}
 	local_irq_restore(flags);
-- 
cgit v1.2.3


From d6d897cec29252b8d0785198cfa6ca16d30c739d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 10 Jul 2006 04:44:04 -0700
Subject: [PATCH] lockdep: core, reduce per-lock class-cache size

lockdep_map is embedded into every lock, which blows up data structure
sizes all around the kernel.  Reduce the class-cache to be for the default
class only - that is used in 99.9% of the cases and even if we dont have a
class cached, the lookup in the class-hash is lockless.

This change reduces the per-lock dep_map overhead by 56 bytes on 64-bit
platforms and by 28 bytes on 32-bit platforms.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 87 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 54 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c1f34addd003..9bad17884513 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1104,7 +1104,7 @@ extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void);
  * itself, so actual lookup of the hash should be once per lock object.
  */
 static inline struct lock_class *
-register_lock_class(struct lockdep_map *lock, unsigned int subclass)
+look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 {
 	struct lockdep_subclass_key *key;
 	struct list_head *hash_head;
@@ -1148,7 +1148,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	 */
 	list_for_each_entry(class, hash_head, hash_entry)
 		if (class->key == key)
-			goto out_set;
+			return class;
+
+	return NULL;
+}
+
+/*
+ * Register a lock's class in the hash-table, if the class is not present
+ * yet. Otherwise we look it up. We cache the result in the lock object
+ * itself, so actual lookup of the hash should be once per lock object.
+ */
+static inline struct lock_class *
+register_lock_class(struct lockdep_map *lock, unsigned int subclass)
+{
+	struct lockdep_subclass_key *key;
+	struct list_head *hash_head;
+	struct lock_class *class;
+
+	class = look_up_lock_class(lock, subclass);
+	if (likely(class))
+		return class;
 
 	/*
 	 * Debug-check: all keys must be persistent!
@@ -1163,6 +1182,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
 		return NULL;
 	}
 
+	key = lock->key->subkeys + subclass;
+	hash_head = classhashentry(key);
+
 	__raw_spin_lock(&hash_lock);
 	/*
 	 * We have to do the hash-walk again, to avoid races
@@ -1209,8 +1231,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
 out_unlock_set:
 	__raw_spin_unlock(&hash_lock);
 
-out_set:
-	lock->class[subclass] = class;
+	if (!subclass)
+		lock->class_cache = class;
 
 	DEBUG_LOCKS_WARN_ON(class->subclass != subclass);
 
@@ -1914,7 +1936,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	}
 	lock->name = name;
 	lock->key = key;
-	memset(lock->class, 0, sizeof(lock->class[0])*MAX_LOCKDEP_SUBCLASSES);
+	lock->class_cache = NULL;
 }
 
 EXPORT_SYMBOL_GPL(lockdep_init_map);
@@ -1928,8 +1950,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			  unsigned long ip)
 {
 	struct task_struct *curr = current;
+	struct lock_class *class = NULL;
 	struct held_lock *hlock;
-	struct lock_class *class;
 	unsigned int depth, id;
 	int chain_head = 0;
 	u64 chain_key;
@@ -1947,8 +1969,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 		return 0;
 	}
 
-	class = lock->class[subclass];
-	/* not cached yet? */
+	if (!subclass)
+		class = lock->class_cache;
+	/*
+	 * Not cached yet or subclass?
+	 */
 	if (unlikely(!class)) {
 		class = register_lock_class(lock, subclass);
 		if (!class)
@@ -2449,48 +2474,44 @@ void lockdep_free_key_range(void *start, unsigned long size)
 
 void lockdep_reset_lock(struct lockdep_map *lock)
 {
-	struct lock_class *class, *next, *entry;
+	struct lock_class *class, *next;
 	struct list_head *head;
 	unsigned long flags;
 	int i, j;
 
 	raw_local_irq_save(flags);
-	__raw_spin_lock(&hash_lock);
 
 	/*
-	 * Remove all classes this lock has:
+	 * Remove all classes this lock might have:
+	 */
+	for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
+		/*
+		 * If the class exists we look it up and zap it:
+		 */
+		class = look_up_lock_class(lock, j);
+		if (class)
+			zap_class(class);
+	}
+	/*
+	 * Debug check: in the end all mapped classes should
+	 * be gone.
 	 */
+	__raw_spin_lock(&hash_lock);
 	for (i = 0; i < CLASSHASH_SIZE; i++) {
 		head = classhash_table + i;
 		if (list_empty(head))
 			continue;
 		list_for_each_entry_safe(class, next, head, hash_entry) {
-			for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
-				entry = lock->class[j];
-				if (class == entry) {
-					zap_class(class);
-					lock->class[j] = NULL;
-					break;
-				}
+			if (unlikely(class == lock->class_cache)) {
+				__raw_spin_unlock(&hash_lock);
+				DEBUG_LOCKS_WARN_ON(1);
+				goto out_restore;
 			}
 		}
 	}
-
-	/*
-	 * Debug check: in the end all mapped classes should
-	 * be gone.
-	 */
-	for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
-		entry = lock->class[j];
-		if (!entry)
-			continue;
-		__raw_spin_unlock(&hash_lock);
-		DEBUG_LOCKS_WARN_ON(1);
-		raw_local_irq_restore(flags);
-		return;
-	}
-
 	__raw_spin_unlock(&hash_lock);
+
+out_restore:
 	raw_local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3


From c0fc84d2e5bb4a9e3ae470812a00cccba85a48b8 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 10 Jul 2006 04:44:21 -0700
Subject: [PATCH] kernel/printk.c: EXPORT_SYMBOL_UNUSED

This patch marks unused exports as EXPORT_SYMBOL_UNUSED.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index bdba5d80496c..65ca0688f86f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -52,7 +52,7 @@ int console_printk[4] = {
 	DEFAULT_CONSOLE_LOGLEVEL,	/* default_console_loglevel */
 };
 
-EXPORT_SYMBOL(console_printk);
+EXPORT_UNUSED_SYMBOL(console_printk);  /*  June 2006  */
 
 /*
  * Low lever drivers may need that to know if they can schedule in
@@ -773,7 +773,7 @@ int is_console_locked(void)
 {
 	return console_locked;
 }
-EXPORT_SYMBOL(is_console_locked);
+EXPORT_UNUSED_SYMBOL(is_console_locked);  /*  June 2006  */
 
 /**
  * release_console_sem - unlock the console system
-- 
cgit v1.2.3


From 80d6679a62fe45f440d042099d997a42e4e8c59d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 10 Jul 2006 04:44:24 -0700
Subject: [PATCH] kernel/softirq.c: EXPORT_UNUSED_SYMBOL

This patch marks an unused export as EXPORT_UNUSED_SYMBOL.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/softirq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 215541e26c1a..fd12f2556f0d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -311,7 +311,7 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
 	softirq_vec[nr].action = action;
 }
 
-EXPORT_SYMBOL(open_softirq);
+EXPORT_UNUSED_SYMBOL(open_softirq);  /*  June 2006  */
 
 /* Tasklets */
 struct tasklet_head
-- 
cgit v1.2.3


From 06a9ec291b3aec9c7e36af0a10ad2b556bd7e84f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 10 Jul 2006 04:44:30 -0700
Subject: [PATCH] pi-futex: Validate futex type instead of oopsing

Calling futex_lock_pi is called with a reference to a non PI futex and
waiters exist already, lookup_pi_state() oopses due to pi_state == NULL.
Check this condition and return -EINVAL to userspace.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 1dc98e4dd287..cf0c8e21d1ab 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -476,6 +476,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 			 * the refcount and return its pi_state:
 			 */
 			pi_state = this->pi_state;
+			/*
+			 * Userspace might have messed up non PI and PI futexes
+			 */
+			if (unlikely(!pi_state))
+				return -EINVAL;
+
 			atomic_inc(&pi_state->refcount);
 			me->pi_state = pi_state;
 
-- 
cgit v1.2.3


From e154ff3d2c5ad313ef0c66e6217502361cad2799 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Mon, 10 Jul 2006 04:44:32 -0700
Subject: [PATCH] adjust clock for lost ticks

A large number of lost ticks can cause an overadjustment of the clock.  To
compensate for this we look at the current error and the larger the error
already is the more careful we are at adjusting the error.  As small extra
fix reset the error when the clock is set.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Acked-by: john stultz <johnstul@us.ibm.com>
Cc: Uwe Bugla <uwe.bugla@gmx.de>
Cc: James Bottomley <James.Bottomley@SteelEye.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 85 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 47 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 396a3c024c2c..2a87430a58d4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -891,6 +891,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
+	clock->error = 0;
 	ntp_clear();
 
 	write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -1008,52 +1009,52 @@ static int __init timekeeping_init_device(void)
 device_initcall(timekeeping_init_device);
 
 /*
- * If the error is already larger, we look ahead another tick,
+ * If the error is already larger, we look ahead even further
  * to compensate for late or lost adjustments.
  */
-static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset)
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
 {
-	int adj;
+	s64 tick_error, i;
+	u32 look_ahead, adj;
+	s32 error2, mult;
 
 	/*
-	 * As soon as the machine is synchronized to the external time
-	 * source this should be the common case.
+	 * Use the current error value to determine how much to look ahead.
+	 * The larger the error the slower we adjust for it to avoid problems
+	 * with losing too many ticks, otherwise we would overadjust and
+	 * produce an even larger error.  The smaller the adjustment the
+	 * faster we try to adjust for it, as lost ticks can do less harm
+	 * here.  This is tuned so that an error of about 1 msec is adusted
+	 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
 	 */
-	error >>= 2;
-	if (likely(sign > 0 ? error <= *interval : error >= *interval))
-		return sign;
+	error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
+	error2 = abs(error2);
+	for (look_ahead = 0; error2 > 0; look_ahead++)
+		error2 >>= 2;
 
 	/*
-	 * An extra look ahead dampens the effect of the current error,
-	 * which can grow quite large with continously late updates, as
-	 * it would dominate the adjustment value and can lead to
-	 * oscillation.
+	 * Now calculate the error in (1 << look_ahead) ticks, but first
+	 * remove the single look ahead already included in the error.
 	 */
-	error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
-	error -= clock->xtime_interval >> 1;
-
-	adj = 0;
-	while (1) {
-		error >>= 1;
-		if (sign > 0 ? error <= *interval : error >= *interval)
-			break;
-		adj++;
+	tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
+	tick_error -= clock->xtime_interval >> 1;
+	error = ((error - tick_error) >> look_ahead) + tick_error;
+
+	/* Finally calculate the adjustment shift value.  */
+	i = *interval;
+	mult = 1;
+	if (error < 0) {
+		error = -error;
+		*interval = -*interval;
+		*offset = -*offset;
+		mult = -1;
 	}
-
-	/*
-	 * Add the current adjustments to the error and take the offset
-	 * into account, the latter can cause the error to be hardly
-	 * reduced at the next tick. Check the error again if there's
-	 * room for another adjustment, thus further reducing the error
-	 * which otherwise had to be corrected at the next update.
-	 */
-	error = (error << 1) - *interval + *offset;
-	if (sign > 0 ? error > *interval : error < *interval)
-		adj++;
+	for (adj = 0; error > i; adj++)
+		error >>= 1;
 
 	*interval <<= adj;
 	*offset <<= adj;
-	return sign << adj;
+	return mult << adj;
 }
 
 /*
@@ -1068,11 +1069,19 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
 
 	error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
 	if (error > interval) {
-		adj = clocksource_bigadjust(1, error, &interval, &offset);
+		error >>= 2;
+		if (likely(error <= interval))
+			adj = 1;
+		else
+			adj = clocksource_bigadjust(error, &interval, &offset);
 	} else if (error < -interval) {
-		interval = -interval;
-		offset = -offset;
-		adj = clocksource_bigadjust(-1, error, &interval, &offset);
+		error >>= 2;
+		if (likely(error >= -interval)) {
+			adj = -1;
+			interval = -interval;
+			offset = -offset;
+		} else
+			adj = clocksource_bigadjust(error, &interval, &offset);
 	} else
 		return;
 
@@ -1129,7 +1138,7 @@ static void update_wall_time(void)
 	clocksource_adjust(clock, offset);
 
 	/* store full nanoseconds into xtime */
-	xtime.tv_nsec = clock->xtime_nsec >> clock->shift;
+	xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
 	clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
 
 	/* check to see if there is a new clocksource to use */
-- 
cgit v1.2.3


From 95018f7c94cbe4e78fc014b6ce52004714c06e2a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 10 Jul 2006 04:45:00 -0700
Subject: [PATCH] swsusp: do not use memcpy for snapshotting memory

swsusp should not use memcpy for snapshotting memory, because on some
architectures memcpy may increase preempt_count (i386 does this when
CONFIG_X86_USE_3DNOW is set).  Then, as a result, wrong value of preempt_count
is stored in the image.

Replace memcpy in copy_data_pages with an open-coded loop.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 24c96f354231..75d4886e648e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -227,11 +227,17 @@ static void copy_data_pages(struct pbe *pblist)
 		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
 			if (saveable(zone, &zone_pfn)) {
 				struct page *page;
+				long *src, *dst;
+				int n;
+
 				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
 				BUG_ON(!pbe);
 				pbe->orig_address = (unsigned long)page_address(page);
-				/* copy_page is not usable for copying task structs. */
-				memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
+				/* copy_page and memcpy are not usable for copying task structs. */
+				dst = (long *)pbe->address;
+				src = (long *)pbe->orig_address;
+				for (n = PAGE_SIZE / sizeof(long); n; n--)
+					*dst++ = *src++;
 				pbe = pbe->next;
 			}
 		}
-- 
cgit v1.2.3


From 712f403af6682c942d8ff8bfbd54eed03643a796 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 10 Jul 2006 04:45:00 -0700
Subject: [PATCH] swsusp warning fix

kernel/power/swap.c: In function 'swsusp_write':
kernel/power/swap.c:275: warning: 'start' may be used uninitialized in this function

gcc isn't smart enough, so help it.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 044b8e0c1025..a57c661c7e8a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -263,7 +263,6 @@ int swsusp_write(void)
 	struct swap_map_handle handle;
 	struct snapshot_handle snapshot;
 	struct swsusp_info *header;
-	unsigned long start;
 	int error;
 
 	if ((error = swsusp_swap_check())) {
@@ -281,16 +280,17 @@ int swsusp_write(void)
 	}
 	error = get_swap_writer(&handle);
 	if (!error) {
-		start = handle.cur_swap;
+		unsigned long start = handle.cur_swap;
 		error = swap_write_page(&handle, header);
-	}
-	if (!error)
-		error = save_image(&handle, &snapshot, header->pages - 1);
-	if (!error) {
-		flush_swap_writer(&handle);
-		printk("S");
-		error = mark_swapfiles(swp_entry(root_swap, start));
-		printk("|\n");
+		if (!error)
+			error = save_image(&handle, &snapshot,
+					header->pages - 1);
+		if (!error) {
+			flush_swap_writer(&handle);
+			printk("S");
+			error = mark_swapfiles(swp_entry(root_swap, start));
+			printk("|\n");
+		}
 	}
 	if (error)
 		free_all_swap_pages(root_swap, handle.bitmap);
-- 
cgit v1.2.3


From aeceb15738958fe59cd9fe537f40317b1a3bc731 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@osdl.org>
Date: Mon, 10 Jul 2006 04:45:01 -0700
Subject: [PATCH] swsusp: fix panic when signature can't be read

Do not panic a machine when swsusp signature can't be read.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a57c661c7e8a..f1dd146bd64d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -311,8 +311,10 @@ static atomic_t io_done = ATOMIC_INIT(0);
 
 static int end_io(struct bio *bio, unsigned int num, int err)
 {
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-		panic("I/O error reading memory image");
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		printk(KERN_ERR "I/O error reading swsusp image.\n");
+		return -EIO;
+	}
 	atomic_set(&io_done, 0);
 	return 0;
 }
-- 
cgit v1.2.3


From 21d71f513b6221f482ed6ad45e05f073ae67f319 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 10 Jul 2006 04:45:32 -0700
Subject: [PATCH] uninline init_waitqueue_head()

allyesconfig vmlinux size delta:

  text            data    bss     dec          filename
  20736884        6073834 3075176 29885894     vmlinux.before
  20721009        6073966 3075176 29870151     vmlinux.after

~18 bytes per callsite, 15K of text size (~0.1%) saved.

(as an added bonus this also removes a lockdep annotation.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/wait.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/wait.c b/kernel/wait.c
index a1d57aeb7f75..59a82f63275d 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,9 +10,13 @@
 #include <linux/wait.h>
 #include <linux/hash.h>
 
-struct lock_class_key waitqueue_lock_key;
+void init_waitqueue_head(wait_queue_head_t *q)
+{
+	spin_lock_init(&q->lock);
+	INIT_LIST_HEAD(&q->task_list);
+}
 
-EXPORT_SYMBOL(waitqueue_lock_key);
+EXPORT_SYMBOL(init_waitqueue_head);
 
 void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
 {
-- 
cgit v1.2.3


From c59923a15c12d2b3597af913bf234a0ef264a38b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Jul 2006 04:45:40 -0700
Subject: [PATCH] remove the tasklist_lock export

As announced half a year ago this patch will remove the tasklist_lock
export.  The previous two patches got rid of the remaining modular users.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 56e4e07e45f7..926e5a68ea9e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,9 +61,7 @@ int max_threads;		/* tunable limit on nr_threads */
 
 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
- __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
-
-EXPORT_SYMBOL(tasklist_lock);
+__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
 int nr_processes(void)
 {
-- 
cgit v1.2.3


From 2c16e9c888985761511bd1905b00fb271169c3c0 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 10 Jul 2006 04:45:42 -0700
Subject: [PATCH] lockdep: disable lock debugging when kernel state becomes
 untrusted

Disable lockdep debugging in two situations where the integrity of the
kernel no longer is guaranteed: when oopsing and when hitting a
tainting-condition.  The goal is to not get weird lockdep traces that don't
make sense or are otherwise undebuggable, to not waste time.

Lockdep assumes that the previous state it knows about is valid to operate,
which is why lockdep turns itself off after the first violation it reports,
after that point it can no longer make that assumption.

A kernel oops means that the integrity of the kernel compromised; in
addition anything lockdep would report is of lesser importance than the
oops.

All the tainting conditions are of similar integrity-violating nature and
also make debugging/diagnosing more difficult.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/panic.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index ab13f0f668b5..d8a0bca21233 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -172,6 +172,7 @@ const char *print_tainted(void)
 
 void add_taint(unsigned flag)
 {
+	debug_locks_off(); /* can't trust the integrity of the kernel anymore */
 	tainted |= flag;
 }
 EXPORT_SYMBOL(add_taint);
@@ -256,6 +257,7 @@ int oops_may_print(void)
  */
 void oops_enter(void)
 {
+	debug_locks_off(); /* can't trust the integrity of the kernel anymore */
 	do_oops_enter_exit();
 }
 
-- 
cgit v1.2.3


From abf75a5033d4da7b8a7e92321d74021d1fcfb502 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Wed, 12 Jul 2006 13:12:00 +0200
Subject: [PATCH] Fix prctl privilege escalation and suid_dumpable
 (CVE-2006-2451)

Based on a patch from Ernie Petrides

During security research, Red Hat discovered a behavioral flaw in core
dump handling. A local user could create a program that would cause a
core file to be dumped into a directory they would not normally have
permissions to write to. This could lead to a denial of service (disk
consumption), or allow the local user to gain root privileges.

The prctl() system call should never allow to set "dumpable" to the
value 2. Especially not for non-privileged users.

This can be split into three cases:

  1) running as root -- then core dumps will already be done as root,
     and so prctl(PR_SET_DUMPABLE, 2) is not useful

  2) running as non-root w/setuid-to-root -- this is the debatable case

  3) running as non-root w/setuid-to-non-root -- then you definitely
     do NOT want "dumpable" to get set to 2 because you have the
     privilege escalation vulnerability

With case #2, the only potential usefulness is for a program that has
designed to run with higher privilege (than the user invoking it) that
wants to be able to create root-owned root-validated core dumps. This
might be useful as a debugging aid, but would only be safe if the program
had done a chdir() to a safe directory.

There is no benefit to a production setuid-to-root utility, because it
shouldn't be dumping core in the first place. If this is true, then the
same debugging aid could also be accomplished with the "suid_dumpable"
sysctl.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index dbb3b9c7ea64..e236f98f7ec5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1983,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			error = current->mm->dumpable;
 			break;
 		case PR_SET_DUMPABLE:
-			if (arg2 < 0 || arg2 > 2) {
+			if (arg2 < 0 || arg2 > 1) {
 				error = -EINVAL;
 				break;
 			}
-- 
cgit v1.2.3


From 26865e9c26d2d336f385b821b531ce2b31008e20 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 30 Jun 2006 02:15:43 -0700
Subject: [PATCH] remove kernel/power/pm.c:pm_unregister_all()

Remove the deprecated and no longer used pm_unregister_all().

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/power/pm.c | 37 -------------------------------------
 1 file changed, 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 84063ac8fcfc..c50d15266c10 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type,
 	return dev;
 }
 
-static void __pm_unregister(struct pm_dev *dev)
-{
-	if (dev) {
-		list_del(&dev->entry);
-		kfree(dev);
-	}
-}
-
-/**
- *	pm_unregister_all - unregister all devices with matching callback
- *	@callback: callback function pointer
- *
- *	Unregister every device that would call the callback passed. This
- *	is primarily meant as a helper function for loadable modules. It
- *	enables a module to give up all its managed devices without keeping
- *	its own private list.
- */
- 
-void pm_unregister_all(pm_callback callback)
-{
-	struct list_head *entry;
-
-	if (!callback)
-		return;
-
-	mutex_lock(&pm_devs_lock);
-	entry = pm_devs.next;
-	while (entry != &pm_devs) {
-		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
-		entry = entry->next;
-		if (dev->callback == callback)
-			__pm_unregister(dev);
-	}
-	mutex_unlock(&pm_devs_lock);
-}
-
 /**
  *	pm_send - send request to a single device
  *	@dev: device to send to
@@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data)
 }
 
 EXPORT_SYMBOL(pm_register);
-EXPORT_SYMBOL(pm_unregister_all);
 EXPORT_SYMBOL(pm_send_all);
 EXPORT_SYMBOL(pm_active);
 
-- 
cgit v1.2.3


From cd6ef2ada54aa4788d5a3dee3cffaad41383a52a Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 30 Jun 2006 02:15:42 -0700
Subject: [PATCH] The scheduled unexport of insert_resource

Implement the scheduled unexport of insert_resource.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/resource.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 129cf046e561..0dd3a857579e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -404,8 +404,6 @@ int insert_resource(struct resource *parent, struct resource *new)
 	return result;
 }
 
-EXPORT_SYMBOL(insert_resource);
-
 /*
  * Given an existing resource, change its start and size to match the
  * arguments.  Returns -EBUSY if it can't fit.  Existing children of
-- 
cgit v1.2.3


From 098c5eea03de4707019a205140296893252b4130 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Fri, 14 Jul 2006 00:24:04 -0700
Subject: [PATCH] null-terminate over-long /proc/kallsyms symbols

Got a customer bug report (https://bugzilla.novell.com/190296) about kernel
symbols longer than 127 characters which end up in a string buffer that is
not NULL terminated, leading to garbage in /proc/kallsyms.  Using strlcpy
prevents this from happening, even though such symbols still won't come out
right.

A better fix would be to not use a fixed-size buffer, but it's probably not
worth the trouble.  (Modversion'ed symbols even have a length limit of 60.)

[bunk@stusta.de: build fix]
Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kallsyms.c |  4 ++--
 kernel/module.c   | 11 ++++-------
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 39277dd6bf90..ab16a5a4cfe9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter)
 static int get_ksymbol_mod(struct kallsym_iter *iter)
 {
 	iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
-					 &iter->value,
-					 &iter->type, iter->name);
+					 &iter->value, &iter->type,
+					 iter->name, sizeof(iter->name));
 	if (iter->owner == NULL)
 		return 0;
 
diff --git a/kernel/module.c b/kernel/module.c
index 35e1b1f859d7..2a19cd47c046 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2019,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr,
 	return NULL;
 }
 
-struct module *module_get_kallsym(unsigned int symnum,
-				  unsigned long *value,
-				  char *type,
-				  char namebuf[128])
+struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
+				char *type, char *name, size_t namelen)
 {
 	struct module *mod;
 
@@ -2031,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum,
 		if (symnum < mod->num_symtab) {
 			*value = mod->symtab[symnum].st_value;
 			*type = mod->symtab[symnum].st_info;
-			strncpy(namebuf,
-				mod->strtab + mod->symtab[symnum].st_name,
-				127);
+			strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
+				namelen);
 			mutex_unlock(&module_mutex);
 			return mod;
 		}
-- 
cgit v1.2.3


From 52e92e5788139921352213fa6faf6e30ff1f2f5a Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 14 Jul 2006 00:24:05 -0700
Subject: [PATCH] remove kernel/kthread.c:kthread_stop_sem()

Remove the now-unneeded kthread_stop_sem().

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kthread.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 24be714b04c7..4f9c60ef95e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -215,23 +215,6 @@ EXPORT_SYMBOL(kthread_bind);
  * was never called.
  */
 int kthread_stop(struct task_struct *k)
-{
-	return kthread_stop_sem(k, NULL);
-}
-EXPORT_SYMBOL(kthread_stop);
-
-/**
- * kthread_stop_sem - stop a thread created by kthread_create().
- * @k: thread created by kthread_create().
- * @s: semaphore that @k waits on while idle.
- *
- * Does essentially the same thing as kthread_stop() above, but wakes
- * @k by calling up(@s).
- *
- * Returns the result of threadfn(), or %-EINTR if wake_up_process()
- * was never called.
- */
-int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 {
 	int ret;
 
@@ -246,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 
 	/* Now set kthread_should_stop() to true, and wake it up. */
 	kthread_stop_info.k = k;
-	if (s)
-		up(s);
-	else
-		wake_up_process(k);
+	wake_up_process(k);
 	put_task_struct(k);
 
 	/* Once it dies, reset stop ptr, gather result and we're done. */
@@ -260,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 
 	return ret;
 }
-EXPORT_SYMBOL(kthread_stop_sem);
+EXPORT_SYMBOL(kthread_stop);
 
 static __init int helper_init(void)
 {
-- 
cgit v1.2.3


From a0009652af385a42f0e0604136f772ead406c78d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 14 Jul 2006 00:24:06 -0700
Subject: [PATCH] del_timer_sync(): add cpu_relax()

Relax the CPU in the del_timer_sync() busywait loop.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 2a87430a58d4..acfa557e685b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer)
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
 			return ret;
+		cpu_relax();
 	}
 }
 
-- 
cgit v1.2.3


From 60198f9992db1e36d5b4cc1526ff29550f7d002c Mon Sep 17 00:00:00 2001
From: Luca Tettamanti <kronos.it@gmail.com>
Date: Fri, 14 Jul 2006 00:24:13 -0700
Subject: [PATCH] Add try_to_freeze() to rt-test kthreads

When CONFIG_RT_MUTEX_TESTER is enabled kernel refuses to suspend the
machine because it's unable to freeze the rt-test-* threads.

Add try_to_freeze() after schedule() so that the threads will be freezed
correctly; I've tested the patch and it lets the notebook suspends and
resumes nicely.

Signed-off-by: Luca Tettamanti <kronos.it@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rtmutex-tester.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 494dac872a13..948bd8f643e2 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -275,6 +275,7 @@ static int test_func(void *data)
 
 		/* Wait for the next command to be executed */
 		schedule();
+		try_to_freeze();
 
 		if (signal_pending(current))
 			flush_signals(current);
-- 
cgit v1.2.3


From 6bc02d8412b422388f86b09ae40d762c0bc05290 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 14 Jul 2006 00:24:15 -0700
Subject: [PATCH] unexport open_softirq

Christoph Hellwig:
open_softirq just enables a softirq.  The softirq array is statically
allocated so to add a new one you would have to patch the kernel.  So
there's no point to keep this export at all as any user would have to
patch the enum in include/linux/interrupt.h anyway.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/softirq.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index fd12f2556f0d..0f08a84ae307 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -311,8 +311,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
 	softirq_vec[nr].action = action;
 }
 
-EXPORT_UNUSED_SYMBOL(open_softirq);  /*  June 2006  */
-
 /* Tasklets */
 struct tasklet_head
 {
-- 
cgit v1.2.3


From 3e143475c22036847f898d7e76ba337c1d7dbf6f Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Fri, 14 Jul 2006 00:24:17 -0700
Subject: [PATCH] improve timekeeping resume robustness

Resolve problems seen w/ APM suspend.

Due to resume initialization ordering, its possible we could get a timer
interrupt before the timekeeping resume() function is called.  This patch
ensures we don't do any timekeeping accounting before we're fully resumed.

(akpm: fixes the machine-freezes-on-APM-resume bug)

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index acfa557e685b..05809c2e2fd6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -969,6 +969,7 @@ void __init timekeeping_init(void)
 }
 
 
+static int timekeeping_suspended;
 /*
  * timekeeping_resume - Resumes the generic timekeeping subsystem.
  * @dev:	unused
@@ -984,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev)
 	write_seqlock_irqsave(&xtime_lock, flags);
 	/* restart the last cycle value */
 	clock->cycle_last = clocksource_read(clock);
+	clock->error = 0;
+	timekeeping_suspended = 0;
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+	return 0;
+}
+
+static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	timekeeping_suspended = 1;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 	return 0;
 }
@@ -991,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev)
 /* sysfs resume/suspend bits for timekeeping */
 static struct sysdev_class timekeeping_sysclass = {
 	.resume		= timekeeping_resume,
+	.suspend	= timekeeping_suspend,
 	set_kset_name("timekeeping"),
 };
 
@@ -1101,13 +1115,16 @@ static void update_wall_time(void)
 {
 	cycle_t offset;
 
-	clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+	/* Make sure we're fully resumed: */
+	if (unlikely(timekeeping_suspended))
+		return;
 
 #ifdef CONFIG_GENERIC_TIME
 	offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
 #else
 	offset = clock->cycle_interval;
 #endif
+	clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
 
 	/* normally this loop will run just once, however in the
 	 * case of lost or late ticks, it will accumulate correctly.
-- 
cgit v1.2.3


From a4afee02a5dd4f20c08fca26e9b610e72d0bcbf0 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Fri, 14 Jul 2006 00:24:18 -0700
Subject: [PATCH] Fix sighand->siglock usage in kernel/acct.c

IRQs must be disabled before taking ->siglock.

Noticed by lockdep.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index f18e0b8df3e1..2a7c933651c7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -488,7 +488,7 @@ static void do_acct_process(struct file *file)
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
 
-	spin_lock(&current->sighand->siglock);
+	spin_lock_irq(&current->sighand->siglock);
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
 	ac.ac_flag = pacct->ac_flag;
@@ -496,7 +496,7 @@ static void do_acct_process(struct file *file)
 	ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
 	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
 	ac.ac_exitcode = pacct->ac_exitcode;
-	spin_unlock(&current->sighand->siglock);
+	spin_unlock_irq(&current->sighand->siglock);
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 	ac.ac_swaps = encode_comp_t(0);
-- 
cgit v1.2.3


From 3a5f5e488ceee9e08df3dff3f01b12fafc9e7e68 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 14 Jul 2006 00:24:27 -0700
Subject: [PATCH] lockdep: core, fix rq-lock handling on
 __ARCH_WANT_UNLOCKED_CTXSW

On platforms that have __ARCH_WANT_UNLOCKED_CTXSW set and want to implement
lock validator support there's a bug in rq->lock handling: in this case we
dont 'carry over' the runqueue lock into another task - but still we did a
spinlock_release() of it.  Fix this by making the spinlock_release() in
context_switch() dependent on !__ARCH_WANT_UNLOCKED_CTXSW.

(Reported by Ralf Baechle on MIPS, which has __ARCH_WANT_UNLOCKED_CTXSW.
This fixes a lockdep-internal BUG message on such platforms.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d714611f1691..e9a0b61f12ab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1788,7 +1788,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		WARN_ON(rq->prev_mm);
 		rq->prev_mm = oldmm;
 	}
+	/*
+	 * Since the runqueue lock will be released by the next
+	 * task (which is an invalid locking op but in the case
+	 * of the scheduler it's an obvious special-case), so we
+	 * do an early lockdep release here:
+	 */
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#endif
 
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
-- 
cgit v1.2.3


From ca74e92b4698276b6696f15a801759f50944f387 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:36 -0700
Subject: [PATCH] per-task-delay-accounting: setup

Initialization code related to collection of per-task "delay" statistics which
measure how long it had to wait for cpu, sync block io, swapping etc.  The
collection of statistics and the interface are in other patches.  This patch
sets up the data structures and allows the statistics collection to be
disabled through a kernel boot parameter.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/Makefile    |  1 +
 kernel/delayacct.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/exit.c      |  2 ++
 kernel/fork.c      |  2 ++
 4 files changed, 92 insertions(+)
 create mode 100644 kernel/delayacct.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 47dbcd570cd8..87bb34cc8938 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 000000000000..fbf7f2284952
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,87 @@
+/* delayacct.c - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sysctl.h>
+#include <linux/delayacct.h>
+
+int delayacct_on __read_mostly;	/* Delay accounting turned on/off */
+kmem_cache_t *delayacct_cache;
+
+static int __init delayacct_setup_enable(char *str)
+{
+	delayacct_on = 1;
+	return 1;
+}
+__setup("delayacct", delayacct_setup_enable);
+
+void delayacct_init(void)
+{
+	delayacct_cache = kmem_cache_create("delayacct_cache",
+					sizeof(struct task_delay_info),
+					0,
+					SLAB_PANIC,
+					NULL, NULL);
+	delayacct_tsk_init(&init_task);
+}
+
+void __delayacct_tsk_init(struct task_struct *tsk)
+{
+	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+	if (tsk->delays)
+		spin_lock_init(&tsk->delays->lock);
+}
+
+void __delayacct_tsk_exit(struct task_struct *tsk)
+{
+	kmem_cache_free(delayacct_cache, tsk->delays);
+	tsk->delays = NULL;
+}
+
+/*
+ * Start accounting for a delay statistic using
+ * its starting timestamp (@start)
+ */
+
+static inline void delayacct_start(struct timespec *start)
+{
+	do_posix_clock_monotonic_gettime(start);
+}
+
+/*
+ * Finish delay accounting for a statistic using
+ * its timestamps (@start, @end), accumalator (@total) and @count
+ */
+
+static void delayacct_end(struct timespec *start, struct timespec *end,
+				u64 *total, u32 *count)
+{
+	struct timespec ts;
+	s64 ns;
+
+	do_posix_clock_monotonic_gettime(end);
+	ts = timespec_sub(*end, *start);
+	ns = timespec_to_ns(&ts);
+	if (ns < 0)
+		return;
+
+	spin_lock(&current->delays->lock);
+	*total += ns;
+	(*count)++;
+	spin_unlock(&current->delays->lock);
+}
+
diff --git a/kernel/exit.c b/kernel/exit.c
index 6664c084783d..3c2cf91defa7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,6 +25,7 @@
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/mempolicy.h>
+#include <linux/delayacct.h>
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
@@ -900,6 +901,7 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
+	delayacct_tsk_exit(tsk);
 	exit_mm(tsk);
 
 	if (group_dead)
diff --git a/kernel/fork.c b/kernel/fork.c
index 926e5a68ea9e..451cfd35bf22 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -43,6 +43,7 @@
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
+#include <linux/delayacct.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1000,6 +1001,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_put_domain;
 
 	p->did_exec = 0;
+	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
 	copy_flags(clone_flags, p);
 	p->pid = pid;
 	retval = -EFAULT;
-- 
cgit v1.2.3


From 0ff922452df86f3e9a2c6f705c4588ec62d096a7 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:37 -0700
Subject: [PATCH] per-task-delay-accounting: sync block I/O and swapin delay
 collection

Unlike earlier iterations of the delay accounting patches, now delays are only
collected for the actual I/O waits rather than try and cover the delays seen
in I/O submission paths.

Account separately for block I/O delays incurred as a result of swapin page
faults whose frequency can be affected by the task/process' rss limit.  Hence
swapin delays can act as feedback for rss limit changes independent of I/O
priority changes.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/delayacct.c | 19 +++++++++++++++++++
 kernel/sched.c     |  5 +++++
 2 files changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index fbf7f2284952..3546b0800f9f 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -85,3 +85,22 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
 	spin_unlock(&current->delays->lock);
 }
 
+void __delayacct_blkio_start(void)
+{
+	delayacct_start(&current->delays->blkio_start);
+}
+
+void __delayacct_blkio_end(void)
+{
+	if (current->delays->flags & DELAYACCT_PF_SWAPIN)
+		/* Swapin block I/O */
+		delayacct_end(&current->delays->blkio_start,
+			&current->delays->blkio_end,
+			&current->delays->swapin_delay,
+			&current->delays->swapin_count);
+	else	/* Other block I/O */
+		delayacct_end(&current->delays->blkio_start,
+			&current->delays->blkio_end,
+			&current->delays->blkio_delay,
+			&current->delays->blkio_count);
+}
diff --git a/kernel/sched.c b/kernel/sched.c
index e9a0b61f12ab..9d42cbfc4f8b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -51,6 +51,7 @@
 #include <linux/times.h>
 #include <linux/acct.h>
 #include <linux/kprobes.h>
+#include <linux/delayacct.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4534,9 +4535,11 @@ void __sched io_schedule(void)
 {
 	struct rq *rq = &__raw_get_cpu_var(runqueues);
 
+	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
 	schedule();
 	atomic_dec(&rq->nr_iowait);
+	delayacct_blkio_end();
 }
 EXPORT_SYMBOL(io_schedule);
 
@@ -4545,9 +4548,11 @@ long __sched io_schedule_timeout(long timeout)
 	struct rq *rq = &__raw_get_cpu_var(runqueues);
 	long ret;
 
+	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
 	ret = schedule_timeout(timeout);
 	atomic_dec(&rq->nr_iowait);
+	delayacct_blkio_end();
 	return ret;
 }
 
-- 
cgit v1.2.3


From 52f17b6c2bd443e7806a161e9d10a983650db01d Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Fri, 14 Jul 2006 00:24:38 -0700
Subject: [PATCH] per-task-delay-accounting: cpu delay collection via
 schedstats

Make the task-related schedstats functions callable by delay accounting even
if schedstats collection isn't turned on.  This removes the dependency of
delay accounting on schedstats.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 71 ++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 49 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9d42cbfc4f8b..b44b9a43b0fc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -502,9 +502,36 @@ struct file_operations proc_schedstat_operations = {
 	.release = single_release,
 };
 
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+{
+	if (rq) {
+		rq->rq_sched_info.run_delay += delta_jiffies;
+		rq->rq_sched_info.pcnt++;
+	}
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+{
+	if (rq)
+		rq->rq_sched_info.cpu_time += delta_jiffies;
+}
 # define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
 # define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
 #else /* !CONFIG_SCHEDSTATS */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+{}
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+{}
 # define schedstat_inc(rq, field)	do { } while (0)
 # define schedstat_add(rq, field, amt)	do { } while (0)
 #endif
@@ -524,7 +551,7 @@ static inline struct rq *this_rq_lock(void)
 	return rq;
 }
 
-#ifdef CONFIG_SCHEDSTATS
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 /*
  * Called when a process is dequeued from the active array and given
  * the cpu.  We should note that with the exception of interactive
@@ -552,21 +579,16 @@ static inline void sched_info_dequeued(struct task_struct *t)
  */
 static void sched_info_arrive(struct task_struct *t)
 {
-	unsigned long now = jiffies, diff = 0;
-	struct rq *rq = task_rq(t);
+	unsigned long now = jiffies, delta_jiffies = 0;
 
 	if (t->sched_info.last_queued)
-		diff = now - t->sched_info.last_queued;
+		delta_jiffies = now - t->sched_info.last_queued;
 	sched_info_dequeued(t);
-	t->sched_info.run_delay += diff;
+	t->sched_info.run_delay += delta_jiffies;
 	t->sched_info.last_arrival = now;
 	t->sched_info.pcnt++;
 
-	if (!rq)
-		return;
-
-	rq->rq_sched_info.run_delay += diff;
-	rq->rq_sched_info.pcnt++;
+	rq_sched_info_arrive(task_rq(t), delta_jiffies);
 }
 
 /*
@@ -586,8 +608,9 @@ static void sched_info_arrive(struct task_struct *t)
  */
 static inline void sched_info_queued(struct task_struct *t)
 {
-	if (!t->sched_info.last_queued)
-		t->sched_info.last_queued = jiffies;
+	if (unlikely(sched_info_on()))
+		if (!t->sched_info.last_queued)
+			t->sched_info.last_queued = jiffies;
 }
 
 /*
@@ -596,13 +619,10 @@ static inline void sched_info_queued(struct task_struct *t)
  */
 static inline void sched_info_depart(struct task_struct *t)
 {
-	struct rq *rq = task_rq(t);
-	unsigned long diff = jiffies - t->sched_info.last_arrival;
-
-	t->sched_info.cpu_time += diff;
+	unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
 
-	if (rq)
-		rq->rq_sched_info.cpu_time += diff;
+	t->sched_info.cpu_time += delta_jiffies;
+	rq_sched_info_depart(task_rq(t), delta_jiffies);
 }
 
 /*
@@ -611,7 +631,7 @@ static inline void sched_info_depart(struct task_struct *t)
  * the idle task.)  We are only called when prev != next.
  */
 static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
 {
 	struct rq *rq = task_rq(prev);
 
@@ -626,10 +646,16 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 	if (next != rq->idle)
 		sched_info_arrive(next);
 }
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+	if (unlikely(sched_info_on()))
+		__sched_info_switch(prev, next);
+}
 #else
 #define sched_info_queued(t)		do { } while (0)
 #define sched_info_switch(t, next)	do { } while (0)
-#endif /* CONFIG_SCHEDSTATS */
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
 /*
  * Adding/removing a task to/from a priority array:
@@ -1531,8 +1557,9 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
 
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
-#ifdef CONFIG_SCHEDSTATS
-	memset(&p->sched_info, 0, sizeof(p->sched_info));
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+	if (unlikely(sched_info_on()))
+		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	p->oncpu = 0;
-- 
cgit v1.2.3


From c757249af152c59fd74b85e52e8c090acb33d9c0 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:40 -0700
Subject: [PATCH] per-task-delay-accounting: taskstats interface

Create a "taskstats" interface based on generic netlink (NETLINK_GENERIC
family), for getting statistics of tasks and thread groups during their
lifetime and when they exit.  The interface is intended for use by multiple
accounting packages though it is being created in the context of delay
accounting.

This patch creates the interface without populating the fields of the data
that is sent to the user in response to a command or upon the exit of a task.
Each accounting package interested in using taskstats has to provide an
additional patch to add its stats to the common structure.

[akpm@osdl.org: cleanups, Kconfig fix]
Signed-off-by: Shailabh Nagar <nagar@us.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/Makefile    |   1 +
 kernel/exit.c      |   7 ++
 kernel/taskstats.c | 336 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 344 insertions(+)
 create mode 100644 kernel/taskstats.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 87bb34cc8938..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index 3c2cf91defa7..9852ed8c2988 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,6 +25,7 @@
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/mempolicy.h>
+#include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
@@ -844,6 +845,7 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
+	struct taskstats *tidstats, *tgidstats;
 	int group_dead;
 
 	profile_task_exit(tsk);
@@ -882,6 +884,8 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
+	taskstats_exit_alloc(&tidstats, &tgidstats);
+
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
 		update_hiwater_rss(tsk->mm);
@@ -901,7 +905,10 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
+	taskstats_exit_send(tsk, tidstats, tgidstats);
+	taskstats_exit_free(tidstats, tgidstats);
 	delayacct_tsk_exit(tsk);
+
 	exit_mm(tsk);
 
 	if (group_dead)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000000..82ec9137d908
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,336 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *           (C) Balbir Singh,   IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static int family_registered;
+kmem_cache_t *taskstats_cache;
+static DEFINE_MUTEX(taskstats_exit_mutex);
+
+static struct genl_family family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= TASKSTATS_GENL_NAME,
+	.version	= TASKSTATS_GENL_VERSION,
+	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
+};
+
+static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
+__read_mostly = {
+	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
+	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+};
+
+
+static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
+			void **replyp, size_t size)
+{
+	struct sk_buff *skb;
+	void *reply;
+
+	/*
+	 * If new attributes are added, please revisit this allocation
+	 */
+	skb = nlmsg_new(size);
+	if (!skb)
+		return -ENOMEM;
+
+	if (!info) {
+		int seq = get_cpu_var(taskstats_seqnum)++;
+		put_cpu_var(taskstats_seqnum);
+
+		reply = genlmsg_put(skb, 0, seq,
+				family.id, 0, 0,
+				cmd, family.version);
+	} else
+		reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
+				family.id, 0, 0,
+				cmd, family.version);
+	if (reply == NULL) {
+		nlmsg_free(skb);
+		return -EINVAL;
+	}
+
+	*skbp = skb;
+	*replyp = reply;
+	return 0;
+}
+
+static int send_reply(struct sk_buff *skb, pid_t pid, int event)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	void *reply;
+	int rc;
+
+	reply = genlmsg_data(genlhdr);
+
+	rc = genlmsg_end(skb, reply);
+	if (rc < 0) {
+		nlmsg_free(skb);
+		return rc;
+	}
+
+	if (event == TASKSTATS_MSG_MULTICAST)
+		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
+	return genlmsg_unicast(skb, pid);
+}
+
+static int fill_pid(pid_t pid, struct task_struct *pidtsk,
+		struct taskstats *stats)
+{
+	int rc;
+	struct task_struct *tsk = pidtsk;
+
+	if (!pidtsk) {
+		read_lock(&tasklist_lock);
+		tsk = find_task_by_pid(pid);
+		if (!tsk) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+		get_task_struct(tsk);
+		read_unlock(&tasklist_lock);
+	} else
+		get_task_struct(tsk);
+
+	/*
+	 * Each accounting subsystem adds calls to its functions to
+	 * fill in relevant parts of struct taskstsats as follows
+	 *
+	 *	rc = per-task-foo(stats, tsk);
+	 *	if (rc)
+	 *		goto err;
+	 */
+
+err:
+	put_task_struct(tsk);
+	return rc;
+
+}
+
+static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
+		struct taskstats *stats)
+{
+	int rc;
+	struct task_struct *tsk, *first;
+
+	first = tgidtsk;
+	read_lock(&tasklist_lock);
+	if (!first) {
+		first = find_task_by_pid(tgid);
+		if (!first) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+	}
+	tsk = first;
+	do {
+		/*
+		 * Each accounting subsystem adds calls its functions to
+		 * fill in relevant parts of struct taskstsats as follows
+		 *
+		 *	rc = per-task-foo(stats, tsk);
+		 *	if (rc)
+		 *		break;
+		 */
+
+	} while_each_thread(first, tsk);
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * Accounting subsytems can also add calls here if they don't
+	 * wish to aggregate statistics for per-tgid stats
+	 */
+
+	return rc;
+}
+
+static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
+{
+	int rc = 0;
+	struct sk_buff *rep_skb;
+	struct taskstats stats;
+	void *reply;
+	size_t size;
+	struct nlattr *na;
+
+	/*
+	 * Size includes space for nested attributes
+	 */
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+	memset(&stats, 0, sizeof(stats));
+	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	if (rc < 0)
+		return rc;
+
+	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
+		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+		rc = fill_pid(pid, NULL, &stats);
+		if (rc < 0)
+			goto err;
+
+		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
+		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+				stats);
+	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
+		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+		rc = fill_tgid(tgid, NULL, &stats);
+		if (rc < 0)
+			goto err;
+
+		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+				stats);
+	} else {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	nla_nest_end(rep_skb, na);
+
+	return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST);
+
+nla_put_failure:
+	return genlmsg_cancel(rep_skb, reply);
+err:
+	nlmsg_free(rep_skb);
+	return rc;
+}
+
+/* Send pid data out on exit */
+void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
+			struct taskstats *tgidstats)
+{
+	int rc;
+	struct sk_buff *rep_skb;
+	void *reply;
+	size_t size;
+	int is_thread_group;
+	struct nlattr *na;
+
+	if (!family_registered || !tidstats)
+		return;
+
+	mutex_lock(&taskstats_exit_mutex);
+
+	is_thread_group = !thread_group_empty(tsk);
+	rc = 0;
+
+	/*
+	 * Size includes space for nested attributes
+	 */
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+	if (is_thread_group)
+		size = 2 * size;	/* PID + STATS + TGID + STATS */
+
+	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	if (rc < 0)
+		goto ret;
+
+	rc = fill_pid(tsk->pid, tsk, tidstats);
+	if (rc < 0)
+		goto err_skb;
+
+	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
+	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+			*tidstats);
+	nla_nest_end(rep_skb, na);
+
+	if (!is_thread_group || !tgidstats) {
+		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+		goto ret;
+	}
+
+	rc = fill_tgid(tsk->pid, tsk, tgidstats);
+	/*
+	 * If fill_tgid() failed then one probable reason could be that the
+	 * thread group leader has exited. fill_tgid() will fail, send out
+	 * the pid statistics collected earlier.
+	 */
+	if (rc < 0) {
+		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+		goto ret;
+	}
+
+	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+			*tgidstats);
+	nla_nest_end(rep_skb, na);
+
+	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+	goto ret;
+
+nla_put_failure:
+	genlmsg_cancel(rep_skb, reply);
+	goto ret;
+err_skb:
+	nlmsg_free(rep_skb);
+ret:
+	mutex_unlock(&taskstats_exit_mutex);
+	return;
+}
+
+static struct genl_ops taskstats_ops = {
+	.cmd		= TASKSTATS_CMD_GET,
+	.doit		= taskstats_send_stats,
+	.policy		= taskstats_cmd_get_policy,
+};
+
+/* Needed early in initialization */
+void __init taskstats_init_early(void)
+{
+	taskstats_cache = kmem_cache_create("taskstats_cache",
+						sizeof(struct taskstats),
+						0, SLAB_PANIC, NULL, NULL);
+}
+
+static int __init taskstats_init(void)
+{
+	int rc;
+
+	rc = genl_register_family(&family);
+	if (rc)
+		return rc;
+
+	rc = genl_register_ops(&family, &taskstats_ops);
+	if (rc < 0)
+		goto err;
+
+	family_registered = 1;
+	return 0;
+err:
+	genl_unregister_family(&family);
+	return rc;
+}
+
+/*
+ * late initcall ensures initialization of statistics collection
+ * mechanisms precedes initialization of the taskstats interface
+ */
+late_initcall(taskstats_init);
-- 
cgit v1.2.3


From 6f44993fe1d7b2b097f6ac60cd5835c6f5ca0874 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:41 -0700
Subject: [PATCH] per-task-delay-accounting: delay accounting usage of
 taskstats interface

Usage of taskstats interface by delay accounting.

Signed-off-by: Shailabh Nagar <nagar@us.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/delayacct.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/taskstats.c | 16 +++++++++-----
 2 files changed, 72 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 3546b0800f9f..1be274a462ca 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -41,6 +41,10 @@ void delayacct_init(void)
 
 void __delayacct_tsk_init(struct task_struct *tsk)
 {
+	spin_lock_init(&tsk->delays_lock);
+	/* No need to acquire tsk->delays_lock for allocation here unless
+	   __delayacct_tsk_init called after tsk is attached to tasklist
+	*/
 	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
 	if (tsk->delays)
 		spin_lock_init(&tsk->delays->lock);
@@ -48,8 +52,11 @@ void __delayacct_tsk_init(struct task_struct *tsk)
 
 void __delayacct_tsk_exit(struct task_struct *tsk)
 {
-	kmem_cache_free(delayacct_cache, tsk->delays);
+	struct task_delay_info *delays = tsk->delays;
+	spin_lock(&tsk->delays_lock);
 	tsk->delays = NULL;
+	spin_unlock(&tsk->delays_lock);
+	kmem_cache_free(delayacct_cache, delays);
 }
 
 /*
@@ -104,3 +111,56 @@ void __delayacct_blkio_end(void)
 			&current->delays->blkio_delay,
 			&current->delays->blkio_count);
 }
+
+int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+{
+	s64 tmp;
+	struct timespec ts;
+	unsigned long t1,t2,t3;
+
+	spin_lock(&tsk->delays_lock);
+
+	/* Though tsk->delays accessed later, early exit avoids
+	 * unnecessary returning of other data
+	 */
+	if (!tsk->delays)
+		goto done;
+
+	tmp = (s64)d->cpu_run_real_total;
+	cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+	tmp += timespec_to_ns(&ts);
+	d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+
+	/*
+	 * No locking available for sched_info (and too expensive to add one)
+	 * Mitigate by taking snapshot of values
+	 */
+	t1 = tsk->sched_info.pcnt;
+	t2 = tsk->sched_info.run_delay;
+	t3 = tsk->sched_info.cpu_time;
+
+	d->cpu_count += t1;
+
+	jiffies_to_timespec(t2, &ts);
+	tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
+	d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
+
+	tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
+	d->cpu_run_virtual_total =
+		(tmp < (s64)d->cpu_run_virtual_total) ?	0 : tmp;
+
+	/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
+
+	spin_lock(&tsk->delays->lock);
+	tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
+	d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
+	tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
+	d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+	d->blkio_count += tsk->delays->blkio_count;
+	d->swapin_count += tsk->delays->swapin_count;
+	spin_unlock(&tsk->delays->lock);
+
+done:
+	spin_unlock(&tsk->delays_lock);
+	return 0;
+}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 82ec9137d908..ea9506de3b85 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -18,13 +18,13 @@
 
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
 #include <net/genetlink.h>
 #include <asm/atomic.h>
 
 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered;
 kmem_cache_t *taskstats_cache;
-static DEFINE_MUTEX(taskstats_exit_mutex);
 
 static struct genl_family family = {
 	.id		= GENL_ID_GENERATE,
@@ -120,7 +120,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 	 *		goto err;
 	 */
 
-err:
+	rc = delayacct_add_tsk(stats, tsk);
+	stats->version = TASKSTATS_VERSION;
+
+	/* Define err: label here if needed */
 	put_task_struct(tsk);
 	return rc;
 
@@ -152,8 +155,14 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 		 *		break;
 		 */
 
+		rc = delayacct_add_tsk(stats, tsk);
+		if (rc)
+			break;
+
 	} while_each_thread(first, tsk);
 	read_unlock(&tasklist_lock);
+	stats->version = TASKSTATS_VERSION;
+
 
 	/*
 	 * Accounting subsytems can also add calls here if they don't
@@ -233,8 +242,6 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	if (!family_registered || !tidstats)
 		return;
 
-	mutex_lock(&taskstats_exit_mutex);
-
 	is_thread_group = !thread_group_empty(tsk);
 	rc = 0;
 
@@ -292,7 +299,6 @@ nla_put_failure:
 err_skb:
 	nlmsg_free(rep_skb);
 ret:
-	mutex_unlock(&taskstats_exit_mutex);
 	return;
 }
 
-- 
cgit v1.2.3


From 25890454667b3295f67b3372352be90705f8667c Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:43 -0700
Subject: [PATCH] per-task-delay-accounting: /proc export of aggregated block
 I/O delays

Export I/O delays seen by a task through /proc/<tgid>/stats for use in top
etc.

Note that delays for I/O done for swapping in pages (swapin I/O) is clubbed
together with all other I/O here (this is not the case in the netlink
interface where the swapin I/O is kept distinct)

[akpm@osdl.org: printk warning fix]
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/delayacct.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 1be274a462ca..f05392d64267 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -164,3 +164,15 @@ done:
 	spin_unlock(&tsk->delays_lock);
 	return 0;
 }
+
+__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
+{
+	__u64 ret;
+
+	spin_lock(&tsk->delays->lock);
+	ret = nsec_to_clock_t(tsk->delays->blkio_delay +
+				tsk->delays->swapin_delay);
+	spin_unlock(&tsk->delays->lock);
+	return ret;
+}
+
-- 
cgit v1.2.3


From ad4ecbcba72855a2b5319b96e2a3a65ed1ca3bfd Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:44 -0700
Subject: [PATCH] delay accounting taskstats interface send tgid once

Send per-tgid data only once during exit of a thread group instead of once
with each member thread exit.

Currently, when a thread exits, besides its per-tid data, the per-tgid data
of its thread group is also sent out, if its thread group is non-empty.
The per-tgid data sent consists of the sum of per-tid stats for all
*remaining* threads of the thread group.

This patch modifies this sending in two ways:

- the per-tgid data is sent only when the last thread of a thread group
  exits.  This cuts down heavily on the overhead of sending/receiving
  per-tgid data, especially when other exploiters of the taskstats
  interface aren't interested in per-tgid stats

- the semantics of the per-tgid data sent are changed.  Instead of being
  the sum of per-tid data for remaining threads, the value now sent is the
  true total accumalated statistics for all threads that are/were part of
  the thread group.

The patch also addresses a minor issue where failure of one accounting
subsystem to fill in the taskstats structure was causing the send of
taskstats to not be sent at all.

The patch has been tested for stability and run cerberus for over 4 hours
on an SMP.

[akpm@osdl.org: bugfixes]
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c      |  8 ++---
 kernel/fork.c      |  4 +++
 kernel/taskstats.c | 98 ++++++++++++++++++++++++++++++++++++------------------
 3 files changed, 74 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 9852ed8c2988..67c1e9a4f812 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -845,7 +845,7 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
-	struct taskstats *tidstats, *tgidstats;
+	struct taskstats *tidstats;
 	int group_dead;
 
 	profile_task_exit(tsk);
@@ -884,7 +884,7 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	taskstats_exit_alloc(&tidstats, &tgidstats);
+	taskstats_exit_alloc(&tidstats);
 
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
@@ -905,8 +905,8 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
-	taskstats_exit_send(tsk, tidstats, tgidstats);
-	taskstats_exit_free(tidstats, tgidstats);
+	taskstats_exit_send(tsk, tidstats, group_dead);
+	taskstats_exit_free(tidstats);
 	delayacct_tsk_exit(tsk);
 
 	exit_mm(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 451cfd35bf22..1b0f7b1e0881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -44,6 +44,7 @@
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
 #include <linux/delayacct.h>
+#include <linux/taskstats_kern.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -819,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
+		taskstats_tgid_alloc(current->signal);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -863,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
 	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+	taskstats_tgid_init(sig);
 
 	task_lock(current->group_leader);
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -884,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 void __cleanup_signal(struct signal_struct *sig)
 {
 	exit_thread_group_keys(sig);
+	taskstats_tgid_free(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea9506de3b85..4a0a5022b299 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -132,46 +132,79 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 		struct taskstats *stats)
 {
-	int rc;
 	struct task_struct *tsk, *first;
+	unsigned long flags;
 
+	/*
+	 * Add additional stats from live tasks except zombie thread group
+	 * leaders who are already counted with the dead tasks
+	 */
 	first = tgidtsk;
-	read_lock(&tasklist_lock);
 	if (!first) {
+		read_lock(&tasklist_lock);
 		first = find_task_by_pid(tgid);
 		if (!first) {
 			read_unlock(&tasklist_lock);
 			return -ESRCH;
 		}
-	}
+		get_task_struct(first);
+		read_unlock(&tasklist_lock);
+	} else
+		get_task_struct(first);
+
+	/* Start with stats from dead tasks */
+	spin_lock_irqsave(&first->signal->stats_lock, flags);
+	if (first->signal->stats)
+		memcpy(stats, first->signal->stats, sizeof(*stats));
+	spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+
 	tsk = first;
+	read_lock(&tasklist_lock);
 	do {
+		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
+			continue;
 		/*
-		 * Each accounting subsystem adds calls its functions to
+		 * Accounting subsystem can call its functions here to
 		 * fill in relevant parts of struct taskstsats as follows
 		 *
-		 *	rc = per-task-foo(stats, tsk);
-		 *	if (rc)
-		 *		break;
+		 *	per-task-foo(stats, tsk);
 		 */
-
-		rc = delayacct_add_tsk(stats, tsk);
-		if (rc)
-			break;
+		delayacct_add_tsk(stats, tsk);
 
 	} while_each_thread(first, tsk);
 	read_unlock(&tasklist_lock);
 	stats->version = TASKSTATS_VERSION;
 
-
 	/*
-	 * Accounting subsytems can also add calls here if they don't
-	 * wish to aggregate statistics for per-tgid stats
+	 * Accounting subsytems can also add calls here to modify
+	 * fields of taskstats.
 	 */
 
-	return rc;
+	return 0;
+}
+
+
+static void fill_tgid_exit(struct task_struct *tsk)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	if (!tsk->signal->stats)
+		goto ret;
+
+	/*
+	 * Each accounting subsystem calls its functions here to
+	 * accumalate its per-task stats for tsk, into the per-tgid structure
+	 *
+	 *	per-task-foo(tsk->signal->stats, tsk);
+	 */
+	delayacct_add_tsk(tsk->signal->stats, tsk);
+ret:
+	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+	return;
 }
 
+
 static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
@@ -230,7 +263,7 @@ err:
 
 /* Send pid data out on exit */
 void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
-			struct taskstats *tgidstats)
+			int group_dead)
 {
 	int rc;
 	struct sk_buff *rep_skb;
@@ -238,13 +271,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	size_t size;
 	int is_thread_group;
 	struct nlattr *na;
+	unsigned long flags;
 
 	if (!family_registered || !tidstats)
 		return;
 
-	is_thread_group = !thread_group_empty(tsk);
-	rc = 0;
+	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	is_thread_group = tsk->signal->stats ? 1 : 0;
+	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
 
+	rc = 0;
 	/*
 	 * Size includes space for nested attributes
 	 */
@@ -268,30 +304,28 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 			*tidstats);
 	nla_nest_end(rep_skb, na);
 
-	if (!is_thread_group || !tgidstats) {
-		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-		goto ret;
-	}
+	if (!is_thread_group)
+		goto send;
 
-	rc = fill_tgid(tsk->pid, tsk, tgidstats);
 	/*
-	 * If fill_tgid() failed then one probable reason could be that the
-	 * thread group leader has exited. fill_tgid() will fail, send out
-	 * the pid statistics collected earlier.
+	 * tsk has/had a thread group so fill the tsk->signal->stats structure
+	 * Doesn't matter if tsk is the leader or the last group member leaving
 	 */
-	if (rc < 0) {
-		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-		goto ret;
-	}
+
+	fill_tgid_exit(tsk);
+	if (!group_dead)
+		goto send;
 
 	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
 	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+	/* No locking needed for tsk->signal->stats since group is dead */
 	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-			*tgidstats);
+			*tsk->signal->stats);
 	nla_nest_end(rep_skb, na);
 
+send:
 	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-	goto ret;
+	return;
 
 nla_put_failure:
 	genlmsg_cancel(rep_skb, reply);
-- 
cgit v1.2.3


From f9fd8914c1acca0d98b69d831b128d5b52f03c51 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:47 -0700
Subject: [PATCH] per-task delay accounting taskstats interface: control exit
 data through cpumasks

On systems with a large number of cpus, with even a modest rate of tasks
exiting per cpu, the volume of taskstats data sent on thread exit can
overflow a userspace listener's buffers.

One approach to avoiding overflow is to allow listeners to get data for a
limited and specific set of cpus.  By scaling the number of listeners
and/or the cpus they monitor, userspace can handle the statistical data
overload more gracefully.

In this patch, each listener registers to listen to a specific set of cpus
by specifying a cpumask.  The interest is recorded per-cpu.  When a task
exits on a cpu, its taskstats data is unicast to each listener interested
in that cpu.

Thanks to Andrew Morton for pointing out the various scalability and
general concerns of previous attempts and for suggesting this design.

[akpm@osdl.org: build fix]
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c      |   5 +-
 kernel/taskstats.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 192 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 67c1e9a4f812..dba194a8d416 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -847,6 +847,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	struct task_struct *tsk = current;
 	struct taskstats *tidstats;
 	int group_dead;
+	unsigned int mycpu;
 
 	profile_task_exit(tsk);
 
@@ -884,7 +885,7 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	taskstats_exit_alloc(&tidstats);
+	taskstats_exit_alloc(&tidstats, &mycpu);
 
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
@@ -905,7 +906,7 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
-	taskstats_exit_send(tsk, tidstats, group_dead);
+	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
 	taskstats_exit_free(tidstats);
 	delayacct_tsk_exit(tsk);
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a0a5022b299..abb59e323544 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -19,9 +19,17 @@
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
 #include <net/genetlink.h>
 #include <asm/atomic.h>
 
+/*
+ * Maximum length of a cpumask that can be specified in
+ * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
+ */
+#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)
+
 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered;
 kmem_cache_t *taskstats_cache;
@@ -37,8 +45,25 @@ static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
 __read_mostly = {
 	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
 	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
+	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
+
+struct listener {
+	struct list_head list;
+	pid_t pid;
 };
 
+struct listener_list {
+	struct rw_semaphore sem;
+	struct list_head list;
+};
+static DEFINE_PER_CPU(struct listener_list, listener_array);
+
+enum actions {
+	REGISTER,
+	DEREGISTER,
+	CPU_DONT_CARE
+};
 
 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 			void **replyp, size_t size)
@@ -74,25 +99,68 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	return 0;
 }
 
-static int send_reply(struct sk_buff *skb, pid_t pid, int event)
+/*
+ * Send taskstats data in @skb to listener with nl_pid @pid
+ */
+static int send_reply(struct sk_buff *skb, pid_t pid)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
-	void *reply;
+	void *reply = genlmsg_data(genlhdr);
 	int rc;
 
-	reply = genlmsg_data(genlhdr);
-
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
 		nlmsg_free(skb);
 		return rc;
 	}
 
-	if (event == TASKSTATS_MSG_MULTICAST)
-		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
 	return genlmsg_unicast(skb, pid);
 }
 
+/*
+ * Send taskstats data in @skb to listeners registered for @cpu's exit data
+ */
+static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	struct sk_buff *skb_next, *skb_cur = skb;
+	void *reply = genlmsg_data(genlhdr);
+	int rc, ret;
+
+	rc = genlmsg_end(skb, reply);
+	if (rc < 0) {
+		nlmsg_free(skb);
+		return rc;
+	}
+
+	rc = 0;
+	listeners = &per_cpu(listener_array, cpu);
+	down_write(&listeners->sem);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+		skb_next = NULL;
+		if (!list_is_last(&s->list, &listeners->list)) {
+			skb_next = skb_clone(skb_cur, GFP_KERNEL);
+			if (!skb_next) {
+				nlmsg_free(skb_cur);
+				rc = -ENOMEM;
+				break;
+			}
+		}
+		ret = genlmsg_unicast(skb_cur, s->pid);
+		if (ret == -ECONNREFUSED) {
+			list_del(&s->list);
+			kfree(s);
+			rc = ret;
+		}
+		skb_cur = skb_next;
+	}
+	up_write(&listeners->sem);
+
+	return rc;
+}
+
 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 		struct taskstats *stats)
 {
@@ -204,8 +272,73 @@ ret:
 	return;
 }
 
+static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+{
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	unsigned int cpu;
+	cpumask_t mask = *maskp;
 
-static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
+	if (!cpus_subset(mask, cpu_possible_map))
+		return -EINVAL;
+
+	if (isadd == REGISTER) {
+		for_each_cpu_mask(cpu, mask) {
+			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
+					 cpu_to_node(cpu));
+			if (!s)
+				goto cleanup;
+			s->pid = pid;
+			INIT_LIST_HEAD(&s->list);
+
+			listeners = &per_cpu(listener_array, cpu);
+			down_write(&listeners->sem);
+			list_add(&s->list, &listeners->list);
+			up_write(&listeners->sem);
+		}
+		return 0;
+	}
+
+	/* Deregister or cleanup */
+cleanup:
+	for_each_cpu_mask(cpu, mask) {
+		listeners = &per_cpu(listener_array, cpu);
+		down_write(&listeners->sem);
+		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+			if (s->pid == pid) {
+				list_del(&s->list);
+				kfree(s);
+				break;
+			}
+		}
+		up_write(&listeners->sem);
+	}
+	return 0;
+}
+
+static int parse(struct nlattr *na, cpumask_t *mask)
+{
+	char *data;
+	int len;
+	int ret;
+
+	if (na == NULL)
+		return 1;
+	len = nla_len(na);
+	if (len > TASKSTATS_CPUMASK_MAXLEN)
+		return -E2BIG;
+	if (len < 1)
+		return -EINVAL;
+	data = kmalloc(len, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	nla_strlcpy(data, na, len);
+	ret = cpulist_parse(data, *mask);
+	kfree(data);
+	return ret;
+}
+
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
 	struct sk_buff *rep_skb;
@@ -213,6 +346,19 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 	void *reply;
 	size_t size;
 	struct nlattr *na;
+	cpumask_t mask;
+
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
+	if (rc < 0)
+		return rc;
+	if (rc == 0)
+		return add_del_listener(info->snd_pid, &mask, REGISTER);
+
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
+	if (rc < 0)
+		return rc;
+	if (rc == 0)
+		return add_del_listener(info->snd_pid, &mask, DEREGISTER);
 
 	/*
 	 * Size includes space for nested attributes
@@ -252,7 +398,7 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 
 	nla_nest_end(rep_skb, na);
 
-	return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST);
+	return send_reply(rep_skb, info->snd_pid);
 
 nla_put_failure:
 	return genlmsg_cancel(rep_skb, reply);
@@ -261,9 +407,35 @@ err:
 	return rc;
 }
 
+void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
+{
+	struct listener_list *listeners;
+	struct taskstats *tmp;
+	/*
+	 * This is the cpu on which the task is exiting currently and will
+	 * be the one for which the exit event is sent, even if the cpu
+	 * on which this function is running changes later.
+	 */
+	*mycpu = raw_smp_processor_id();
+
+	*ptidstats = NULL;
+	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	if (!tmp)
+		return;
+
+	listeners = &per_cpu(listener_array, *mycpu);
+	down_read(&listeners->sem);
+	if (!list_empty(&listeners->list)) {
+		*ptidstats = tmp;
+		tmp = NULL;
+	}
+	up_read(&listeners->sem);
+	kfree(tmp);
+}
+
 /* Send pid data out on exit */
 void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
-			int group_dead)
+			int group_dead, unsigned int mycpu)
 {
 	int rc;
 	struct sk_buff *rep_skb;
@@ -324,7 +496,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	nla_nest_end(rep_skb, na);
 
 send:
-	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+	send_cpu_listeners(rep_skb, mycpu);
 	return;
 
 nla_put_failure:
@@ -338,16 +510,22 @@ ret:
 
 static struct genl_ops taskstats_ops = {
 	.cmd		= TASKSTATS_CMD_GET,
-	.doit		= taskstats_send_stats,
+	.doit		= taskstats_user_cmd,
 	.policy		= taskstats_cmd_get_policy,
 };
 
 /* Needed early in initialization */
 void __init taskstats_init_early(void)
 {
+	unsigned int i;
+
 	taskstats_cache = kmem_cache_create("taskstats_cache",
 						sizeof(struct taskstats),
 						0, SLAB_PANIC, NULL, NULL);
+	for_each_possible_cpu(i) {
+		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
+		init_rwsem(&(per_cpu(listener_array, i).sem));
+	}
 }
 
 static int __init taskstats_init(void)
-- 
cgit v1.2.3


From bb129994c3bff9c5e8df91f05d7e9b6402fbd83f Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:47 -0700
Subject: [PATCH] Remove down_write() from taskstats code invoked on the exit()
 path

In send_cpu_listeners(), which is called on the exit path, a down_write()
was protecting operations like skb_clone() and genlmsg_unicast() that do
GFP_KERNEL allocations.  If the oom-killer decides to kill tasks to satisfy
the allocations,the exit of those tasks could block on the same semphore.

The down_write() was only needed to allow removal of invalid listeners from
the listener list.  The patch converts the down_write to a down_read and
defers the removal to a separate critical region.  This ensures that even
if the oom-killer is called, no other task's exit is blocked as it can
still acquire another down_read.

Thanks to Andrew Morton & Herbert Xu for pointing out the oom related
pitfalls, and to Chandra Seetharaman for suggesting this fix instead of
using something more complex like RCU.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index abb59e323544..f45179ce028e 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -51,6 +51,7 @@ __read_mostly = {
 struct listener {
 	struct list_head list;
 	pid_t pid;
+	char valid;
 };
 
 struct listener_list {
@@ -127,7 +128,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
-	int rc, ret;
+	int rc, ret, delcount = 0;
 
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
@@ -137,7 +138,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 
 	rc = 0;
 	listeners = &per_cpu(listener_array, cpu);
-	down_write(&listeners->sem);
+	down_read(&listeners->sem);
 	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
 		skb_next = NULL;
 		if (!list_is_last(&s->list, &listeners->list)) {
@@ -150,14 +151,26 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 		}
 		ret = genlmsg_unicast(skb_cur, s->pid);
 		if (ret == -ECONNREFUSED) {
-			list_del(&s->list);
-			kfree(s);
+			s->valid = 0;
+			delcount++;
 			rc = ret;
 		}
 		skb_cur = skb_next;
 	}
-	up_write(&listeners->sem);
+	up_read(&listeners->sem);
+
+	if (!delcount)
+		return rc;
 
+	/* Delete invalidated entries */
+	down_write(&listeners->sem);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+		if (!s->valid) {
+			list_del(&s->list);
+			kfree(s);
+		}
+	}
+	up_write(&listeners->sem);
 	return rc;
 }
 
@@ -290,6 +303,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 				goto cleanup;
 			s->pid = pid;
 			INIT_LIST_HEAD(&s->list);
+			s->valid = 1;
 
 			listeners = &per_cpu(listener_array, cpu);
 			down_write(&listeners->sem);
-- 
cgit v1.2.3


From aa95387774039096c11803c04011f1aa42d85758 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@macmini.osdl.org>
Date: Sun, 23 Jul 2006 12:12:16 -0700
Subject: cpu hotplug: simplify and hopefully fix locking

The CPU hotplug locking was quite messy, with a recursive lock to
handle the fact that both the actual up/down sequence wanted to
protect itself from being re-entered, but the callbacks that it
called also tended to want to protect themselves from CPU events.

This splits the lock into two (one to serialize the whole hotplug
sequence, the other to protect against the CPU present bitmaps
changing). The latter still allows recursive usage because some
subsystems (ondemand policy for cpufreq at least) had already gotten
too used to the lax locking, but the locking mistakes are hopefully
now less fundamental, and we now warn about recursive lock usage
when we see it, in the hope that it can be fixed.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpu.c | 75 +++++++++++++++++++++++++++---------------------------------
 1 file changed, 34 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 70fbf2e83766..f230f9ae01c2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -16,56 +16,48 @@
 #include <linux/mutex.h>
 
 /* This protects CPUs going up and down... */
-static DEFINE_MUTEX(cpucontrol);
+static DEFINE_MUTEX(cpu_add_remove_lock);
+static DEFINE_MUTEX(cpu_bitmask_lock);
 
 static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static struct task_struct *lock_cpu_hotplug_owner;
-static int lock_cpu_hotplug_depth;
 
-static int __lock_cpu_hotplug(int interruptible)
-{
-	int ret = 0;
-
-	if (lock_cpu_hotplug_owner != current) {
-		if (interruptible)
-			ret = mutex_lock_interruptible(&cpucontrol);
-		else
-			mutex_lock(&cpucontrol);
-	}
-
-	/*
-	 * Set only if we succeed in locking
-	 */
-	if (!ret) {
-		lock_cpu_hotplug_depth++;
-		lock_cpu_hotplug_owner = current;
-	}
-
-	return ret;
-}
+/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
+static struct task_struct *recursive;
+static int recursive_depth;
 
 void lock_cpu_hotplug(void)
 {
-	__lock_cpu_hotplug(0);
+	struct task_struct *tsk = current;
+
+	if (tsk == recursive) {
+		static int warnings = 10;
+		if (warnings) {
+			printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
+			WARN_ON(1);
+			warnings--;
+		}
+		recursive_depth++;
+		return;
+	}
+	mutex_lock(&cpu_bitmask_lock);
+	recursive = tsk;
 }
 EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
 
 void unlock_cpu_hotplug(void)
 {
-	if (--lock_cpu_hotplug_depth == 0) {
-		lock_cpu_hotplug_owner = NULL;
-		mutex_unlock(&cpucontrol);
+	WARN_ON(recursive != current);
+	if (recursive_depth) {
+		recursive_depth--;
+		return;
 	}
+	mutex_unlock(&cpu_bitmask_lock);
+	recursive = NULL;
 }
 EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
 
-int lock_cpu_hotplug_interruptible(void)
-{
-	return __lock_cpu_hotplug(1);
-}
-EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
 #endif	/* CONFIG_HOTPLUG_CPU */
 
 /* Need to know about CPUs going up/down? */
@@ -122,9 +114,7 @@ int cpu_down(unsigned int cpu)
 	struct task_struct *p;
 	cpumask_t old_allowed, tmp;
 
-	if ((err = lock_cpu_hotplug_interruptible()) != 0)
-		return err;
-
+	mutex_lock(&cpu_add_remove_lock);
 	if (num_online_cpus() == 1) {
 		err = -EBUSY;
 		goto out;
@@ -150,7 +140,10 @@ int cpu_down(unsigned int cpu)
 	cpu_clear(cpu, tmp);
 	set_cpus_allowed(current, tmp);
 
+	mutex_lock(&cpu_bitmask_lock);
 	p = __stop_machine_run(take_cpu_down, NULL, cpu);
+	mutex_unlock(&cpu_bitmask_lock);
+
 	if (IS_ERR(p)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
@@ -187,7 +180,7 @@ out_thread:
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
 out:
-	unlock_cpu_hotplug();
+	mutex_unlock(&cpu_add_remove_lock);
 	return err;
 }
 #endif /*CONFIG_HOTPLUG_CPU*/
@@ -197,9 +190,7 @@ int __devinit cpu_up(unsigned int cpu)
 	int ret;
 	void *hcpu = (void *)(long)cpu;
 
-	if ((ret = lock_cpu_hotplug_interruptible()) != 0)
-		return ret;
-
+	mutex_lock(&cpu_add_remove_lock);
 	if (cpu_online(cpu) || !cpu_present(cpu)) {
 		ret = -EINVAL;
 		goto out;
@@ -214,7 +205,9 @@ int __devinit cpu_up(unsigned int cpu)
 	}
 
 	/* Arch-specific enabling code. */
+	mutex_lock(&cpu_bitmask_lock);
 	ret = __cpu_up(cpu);
+	mutex_unlock(&cpu_bitmask_lock);
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
@@ -227,6 +220,6 @@ out_notify:
 		blocking_notifier_call_chain(&cpu_chain,
 				CPU_UP_CANCELED, hcpu);
 out:
-	unlock_cpu_hotplug();
+	mutex_unlock(&cpu_add_remove_lock);
 	return ret;
 }
-- 
cgit v1.2.3


From abb5a5cc6bba1516403146c5b79036fe843beb70 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 23 Jul 2006 11:36:08 -0700
Subject: [PATCH] Cpuset: fix ABBA deadlock with cpu hotplug lock

Fix ABBA deadlock between lock_cpu_hotplug() and the cpuset
callback_mutex lock.

It only happens on cpu_exclusive cpusets, due to the dynamic
sched domain code trying to take the cpu hotplug lock inside
the cpuset callback_mutex lock.

This bug has apparently been here for several months, but didn't
get hit until the right customer load on a large system.

This fix appears right from inspection, but it will take a few
more days running it on that customers workload to be confident
we nailed it.  We don't have any other reproducible test case.

The cpu_hotplug_lock() tends to cover large runs of code.
The other places that hold both that lock and the cpuset callback
mutex lock always nest the cpuset lock inside the hotplug lock.
This place tries to do the reverse, risking an ABBA deadlock.

This is in the cpuset_rmdir() code, where we:
  * take the callback_mutex lock
  * mark the cpuset CS_REMOVED
  * call update_cpu_domains for cpu_exclusive cpusets
  * in that call, take the cpu_hotplug lock if the
    cpuset is marked for removal.

Thanks to Jack Steiner for identifying this deadlock.

The fix is to tear down the dynamic sched domain before we grab
the cpuset callback_mutex lock.  This way, the two locks are
serialized, with the hotplug lock taken and released before
trying for the cpuset lock.

I suspect that this bug was introduced when I changed the
cpuset locking from one lock to two.  The dynamic sched domain
dependency on cpu_exclusive cpusets and its hotplug hooks were
added to this code earlier, when cpusets had only a single lock.
It may well have been fine then.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c232dc077438..1a649f2bb9bb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -762,6 +762,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  *
  * Call with manage_mutex held.  May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must
+ * not call lock_cpu_hotplug() while holding callback_mutex.
  */
 
 static void update_cpu_domains(struct cpuset *cur)
@@ -781,7 +783,7 @@ static void update_cpu_domains(struct cpuset *cur)
 		if (is_cpu_exclusive(c))
 			cpus_andnot(pspan, pspan, c->cpus_allowed);
 	}
-	if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+	if (!is_cpu_exclusive(cur)) {
 		cpus_or(pspan, pspan, cur->cpus_allowed);
 		if (cpus_equal(pspan, cur->cpus_allowed))
 			return;
@@ -1917,6 +1919,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
 }
 
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed is marked cpu_exclusive, then simulate
+ * turning cpu_exclusive off, which will call update_cpu_domains().
+ * The lock_cpu_hotplug() call in update_cpu_domains() must not be
+ * made while holding callback_mutex.  Elsewhere the kernel nests
+ * callback_mutex inside lock_cpu_hotplug() calls.  So the reverse
+ * nesting would risk an ABBA deadlock.
+ */
+
 static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
 	struct cpuset *cs = dentry->d_fsdata;
@@ -1936,11 +1949,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 		mutex_unlock(&manage_mutex);
 		return -EBUSY;
 	}
+	if (is_cpu_exclusive(cs)) {
+		int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+		if (retval < 0) {
+			mutex_unlock(&manage_mutex);
+			return retval;
+		}
+	}
 	parent = cs->parent;
 	mutex_lock(&callback_mutex);
 	set_bit(CS_REMOVED, &cs->flags);
-	if (is_cpu_exclusive(cs))
-		update_cpu_domains(cs);
 	list_del(&cs->sibling);	/* delete my sibling from parent->children */
 	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
-- 
cgit v1.2.3