From b36e4758dc1b9ff1f6d97e951edba22366230d11 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sun, 27 Aug 2006 12:26:34 +0100
Subject: [ARM] Fix kernel/fork.c for lockdep on ARM

ARM has interrupts enabled over context switches (iow, has
__ARCH_WANT_INTERRUPTS_ON_CTXSW defined.)  The lockdep code in fork.c
 assumes that interrupts are always disabled.  Fix this wrong
assumption by making the initialisation of 'p->hardirqs_enabled'
depend on __ARCH_WANT_INTERRUPTS_ON_CTXSW.

Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 kernel/fork.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index f9b014e3e700..8f76adf1c6a6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1056,7 +1056,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	p->hardirqs_enabled = 1;
+#else
 	p->hardirqs_enabled = 0;
+#endif
 	p->hardirq_enable_ip = 0;
 	p->hardirq_enable_event = 0;
 	p->hardirq_disable_ip = _THIS_IP_;
-- 
cgit v1.2.3


From d33b6fba2c4350651f3f61ff2ab858a2f116e9a4 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Fri, 30 Jun 2006 02:31:24 -0700
Subject: Resources: insert identical resources above existing resources

If you have two resources which aree exactly the same size,
insert_resource() currently inserts the new one below the existing one.
This is wrong because there's no way to insert a resource of the same size
above an existing one.

I took this opportunity to rewrite the initial loop to be a for-loop
instead of a goto-loop and fix the documentation.

Signed-off-by: Matthew Wilcox <matthew@wil.cx>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/resource.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 46286434af80..9db38a1a7520 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -344,12 +344,11 @@ EXPORT_SYMBOL(allocate_resource);
  *
  * Returns 0 on success, -EBUSY if the resource can't be inserted.
  *
- * This function is equivalent of request_resource when no conflict
+ * This function is equivalent to request_resource when no conflict
  * happens. If a conflict happens, and the conflicting resources
  * entirely fit within the range of the new resource, then the new
- * resource is inserted and the conflicting resources become childs of
- * the new resource.  Otherwise the new resource becomes the child of
- * the conflicting resource
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
  */
 int insert_resource(struct resource *parent, struct resource *new)
 {
@@ -357,20 +356,21 @@ int insert_resource(struct resource *parent, struct resource *new)
 	struct resource *first, *next;
 
 	write_lock(&resource_lock);
- begin:
- 	result = 0;
-	first = __request_resource(parent, new);
-	if (!first)
-		goto out;
 
-	result = -EBUSY;
-	if (first == parent)
-		goto out;
+	for (;; parent = first) {
+	 	result = 0;
+		first = __request_resource(parent, new);
+		if (!first)
+			goto out;
 
-	/* Resource fully contained by the clashing resource? Recurse into it */
-	if (first->start <= new->start && first->end >= new->end) {
-		parent = first;
-		goto begin;
+		result = -EBUSY;
+		if (first == parent)
+			goto out;
+
+		if ((first->start > new->start) || (first->end < new->end))
+			break;
+		if ((first->start == new->start) && (first->end == new->end))
+			break;
 	}
 
 	for (next = first; ; next = next->sibling) {
-- 
cgit v1.2.3


From 0ec76a110f432e98277e464b82ace8dd66571689 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 27 Sep 2006 01:50:15 -0700
Subject: [PATCH] NOMMU: Check that access_process_vm() has a valid target

Check that access_process_vm() is accessing a valid mapping in the target
process.

This limits ptrace() accesses and accesses through /proc/<pid>/maps to only
those regions actually mapped by a program.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/ptrace.c | 54 ------------------------------------------------------
 1 file changed, 54 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9a111f70145c..8aad0331d82e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -241,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	return 0;
 }
 
-/*
- * Access another process' address space.
- * Source/target buffer must be kernel space, 
- * Do not walk the page table directly, use get_user_pages
- */
-
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
-{
-	struct mm_struct *mm;
-	struct vm_area_struct *vma;
-	struct page *page;
-	void *old_buf = buf;
-
-	mm = get_task_mm(tsk);
-	if (!mm)
-		return 0;
-
-	down_read(&mm->mmap_sem);
-	/* ignore errors, just check how much was sucessfully transfered */
-	while (len) {
-		int bytes, ret, offset;
-		void *maddr;
-
-		ret = get_user_pages(tsk, mm, addr, 1,
-				write, 1, &page, &vma);
-		if (ret <= 0)
-			break;
-
-		bytes = len;
-		offset = addr & (PAGE_SIZE-1);
-		if (bytes > PAGE_SIZE-offset)
-			bytes = PAGE_SIZE-offset;
-
-		maddr = kmap(page);
-		if (write) {
-			copy_to_user_page(vma, page, addr,
-					  maddr + offset, buf, bytes);
-			set_page_dirty_lock(page);
-		} else {
-			copy_from_user_page(vma, page, addr,
-					    buf, maddr + offset, bytes);
-		}
-		kunmap(page);
-		page_cache_release(page);
-		len -= bytes;
-		buf += bytes;
-		addr += bytes;
-	}
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	
-	return buf - old_buf;
-}
-
 int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
 {
 	int copied = 0;
-- 
cgit v1.2.3


From f269fdd1829acc5e53bf57b145003e5733133f2b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 27 Sep 2006 01:50:23 -0700
Subject: [PATCH] NOMMU: move the fallback arch_vma_name() to a sensible place

Move the fallback arch_vma_name() to a sensible place (kernel/signal.c).

Currently it's in fs/proc/task_mmu.c, a file that is dependent on both
CONFIG_PROC_FS and CONFIG_MMU being enabled, but it's used from
kernel/signal.c from where it is called unconditionally.

[akpm@osdl.org: build fix]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index bfdb5686fa3e..05853a7337e3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2577,6 +2577,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
 }
 #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
 
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
 void __init signals_init(void)
 {
 	sigqueue_cachep =
-- 
cgit v1.2.3


From 8e18e2941c53416aa219708e7dcad21fb4bd6794 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Sep 2006 01:50:46 -0700
Subject: [PATCH] inode_diet: Replace inode.u.generic_ip with inode.i_private

The following patches reduce the size of the VFS inode structure by 28 bytes
on a UP x86.  (It would be more on an x86_64 system).  This is a 10% reduction
in the inode size on a UP kernel that is configured in a production mode
(i.e., with no spinlock or other debugging functions enabled; if you want to
save memory taken up by in-core inodes, the first thing you should do is
disable the debugging options; they are responsible for a huge amount of bloat
in the VFS inode structure).

This patch:

The filesystem or device-specific pointer in the inode is inside a union,
which is pretty pointless given that all 30+ users of this field have been
using the void pointer.  Get rid of the union and rename it to i_private, with
a comment to explain who is allowed to use the void pointer.  This is just a
cleanup, but it allows us to reuse the union 'u' for something something where
the union will actually be used.

[judith@osdl.org: powerpc build fix]
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Judith Lebzelter <judith@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/relay.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 33345e73485c..85786ff2a4f9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -669,7 +669,7 @@ EXPORT_SYMBOL_GPL(relay_flush);
  */
 static int relay_file_open(struct inode *inode, struct file *filp)
 {
-	struct rchan_buf *buf = inode->u.generic_ip;
+	struct rchan_buf *buf = inode->i_private;
 	kref_get(&buf->kref);
 	filp->private_data = buf;
 
-- 
cgit v1.2.3


From ba52de123d454b57369f291348266d86f4b35070 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Sep 2006 01:50:49 -0700
Subject: [PATCH] inode-diet: Eliminate i_blksize from the inode structure

This eliminates the i_blksize field from struct inode.  Filesystems that want
to provide a per-inode st_blksize can do so by providing their own getattr
routine instead of using the generic_fillattr() function.

Note that some filesystems were providing pretty much random (and incorrect)
values for i_blksize.

[bunk@stusta.de: cleanup]
[akpm@osdl.org: generic_fillattr() fix]
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cff41511269f..1b32c2c04c15 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -289,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
-- 
cgit v1.2.3


From b89a81712f486e4f7a606987413e387605fdeaf4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 27 Sep 2006 01:51:04 -0700
Subject: [PATCH] sysctl: Allow /proc/sys without sys_sysctl

Since sys_sysctl is deprecated start allow it to be compiled out.  This
should catch any remaining user space code that cares, and paves the way
for further sysctl cleanups.

[akpm@osdl.org: If sys_sysctl() is not compiled-in, emit a warning]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 113 +++++++++++++++++++-------------------------------------
 1 file changed, 38 insertions(+), 75 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bcb3a181dbb2..8bfa7d117c54 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -137,8 +137,11 @@ extern int no_unaligned_warning;
 extern int max_lock_depth;
 #endif
 
-static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
-		       ctl_table *, void **);
+#ifdef CONFIG_SYSCTL_SYSCALL
+static int parse_table(int __user *, int, void __user *, size_t __user *,
+		void __user *, size_t, ctl_table *, void **);
+#endif
+
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 
@@ -165,7 +168,7 @@ int sysctl_legacy_va_layout;
 
 /* /proc declarations: */
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 
 static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
 static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
@@ -1166,12 +1169,13 @@ static void start_unregistering(struct ctl_table_header *p)
 
 void __init sysctl_init(void)
 {
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 	register_proc_table(root_table, proc_sys_root, &root_table_header);
 	init_irq_proc();
 #endif
 }
 
+#ifdef CONFIG_SYSCTL_SYSCALL
 int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
 	       void __user *newval, size_t newlen)
 {
@@ -1225,6 +1229,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
 	unlock_kernel();
 	return error;
 }
+#endif /* CONFIG_SYSCTL_SYSCALL */
 
 /*
  * ctl_perm does NOT grant the superuser all rights automatically, because
@@ -1251,6 +1256,7 @@ static inline int ctl_perm(ctl_table *table, int op)
 	return test_perm(table->mode, op);
 }
 
+#ifdef CONFIG_SYSCTL_SYSCALL
 static int parse_table(int __user *name, int nlen,
 		       void __user *oldval, size_t __user *oldlenp,
 		       void __user *newval, size_t newlen,
@@ -1340,6 +1346,7 @@ int do_sysctl_strategy (ctl_table *table,
 	}
 	return 0;
 }
+#endif /* CONFIG_SYSCTL_SYSCALL */
 
 /**
  * register_sysctl_table - register a sysctl hierarchy
@@ -1427,7 +1434,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
 	else
 		list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
 	spin_unlock(&sysctl_lock);
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 	register_proc_table(table, proc_sys_root, tmp);
 #endif
 	return tmp;
@@ -1445,18 +1452,31 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 	might_sleep();
 	spin_lock(&sysctl_lock);
 	start_unregistering(header);
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 	unregister_proc_table(header->ctl_table, proc_sys_root);
 #endif
 	spin_unlock(&sysctl_lock);
 	kfree(header);
 }
 
+#else /* !CONFIG_SYSCTL */
+struct ctl_table_header * register_sysctl_table(ctl_table * table,
+						int insert_at_head)
+{
+	return NULL;
+}
+
+void unregister_sysctl_table(struct ctl_table_header * table)
+{
+}
+
+#endif /* CONFIG_SYSCTL */
+
 /*
  * /proc/sys support
  */
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 
 /* Scan the sysctl entries in table and add them all into /proc */
 static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
@@ -2318,6 +2338,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
 #endif /* CONFIG_PROC_FS */
 
 
+#ifdef CONFIG_SYSCTL_SYSCALL
 /*
  * General sysctl support routines 
  */
@@ -2460,11 +2481,19 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 	return 1;
 }
 
-#else /* CONFIG_SYSCTL */
+#else /* CONFIG_SYSCTL_SYSCALL */
 
 
 asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
 {
+	static int msg_count;
+
+	if (msg_count < 5) {
+		msg_count++;
+		printk(KERN_INFO
+			"warning: process `%s' used the removed sysctl "
+			"system call\n", current->comm);
+	}
 	return -ENOSYS;
 }
 
@@ -2496,73 +2525,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 	return -ENOSYS;
 }
 
-int proc_dostring(ctl_table *table, int write, struct file *filp,
-		  void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec(ctl_table *table, int write, struct file *filp,
-		  void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
-			void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
-		    void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
-			  void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
-			  void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
-			     void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
-		    void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
-				      struct file *filp,
-				      void __user *buffer,
-				      size_t *lenp, loff_t *ppos)
-{
-    return -ENOSYS;
-}
-
-struct ctl_table_header * register_sysctl_table(ctl_table * table, 
-						int insert_at_head)
-{
-	return NULL;
-}
-
-void unregister_sysctl_table(struct ctl_table_header * table)
-{
-}
-
-#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SYSCTL_SYSCALL */
 
 /*
  * No sense putting this after each symbol definition, twice,
-- 
cgit v1.2.3


From c18258c6f0848f97e85287f6271c511a092bb784 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 27 Sep 2006 01:51:06 -0700
Subject: [PATCH] pid: Implement transfer_pid and use it to simplify de_thread

In de_thread we move pids from one process to another, a rather ugly case.
The function transfer_pid makes it clear what we are doing, and makes the
action atomic.  This is useful we ever want to atomically traverse the
process group and session lists, in a rcu safe manner.

Even if the atomic properties this change should be a win as transfer_pid
should be less code to execute than executing both attach_pid and
detach_pid, and this should make de_thread slightly smaller as only a
single function call needs to be emitted.  The only downside is that the
code might be slower to execute as the odds are against transfer_pid being
in cache.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 93e212f20671..6db82b68e2f8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -252,6 +252,15 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type)
 	free_pid(pid);
 }
 
+/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
+void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
+			   enum pid_type type)
+{
+	new->pids[type].pid = old->pids[type].pid;
+	hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
+	old->pids[type].pid = NULL;
+}
+
 struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
 {
 	struct task_struct *result = NULL;
-- 
cgit v1.2.3


From 65800ac77e080cf159d6c1207b6886e18f22bc08 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 27 Sep 2006 01:51:11 -0700
Subject: [PATCH] pid: remove temporary debug code in attach_pid

With the patches flying between Oleg and myself somehow this temporary
debug code got left in pid.c.  It was never intended to make it to the
stable kernel.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 6db82b68e2f8..8387e8c68193 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -223,9 +223,6 @@ int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
 	struct pid_link *link;
 	struct pid *pid;
 
-	WARN_ON(!task->pid); /* to be removed soon */
-	WARN_ON(!nr); /* to be removed soon */
-
 	link = &task->pids[type];
 	link->pid = pid = find_pid(nr);
 	hlist_add_head_rcu(&link->node, &pid->tasks[type]);
-- 
cgit v1.2.3


From 910067d188d56d80801b71b0ca1f73aa400c7b8c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Fri, 29 Sep 2006 01:58:36 -0700
Subject: [PATCH] remove generic__raw_read_trylock()

If the cpu has the lock held for write, is interrupted, and the interrupt
handler calls read_trylock(), it's an instant deadlock.

Now, Dave Miller has subsequently pointed out that we don't have any
situations where this can occur.  Nevertheless, we should delete
generic__raw_read_lock (and its associated EXPORT to make Arjan happy) so that
nobody thinks they can use it.

Acked-by: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/spinlock.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 9644a41e0bef..d48143eafbfd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,17 +21,6 @@
 #include <linux/debug_locks.h>
 #include <linux/module.h>
 
-/*
- * Generic declaration of the raw read_trylock() function,
- * architectures are supposed to optimize this:
- */
-int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
-{
-	__raw_read_lock(lock);
-	return 1;
-}
-EXPORT_SYMBOL(generic__raw_read_trylock);
-
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
 	preempt_disable();
-- 
cgit v1.2.3


From d8c7649e99e4b081b624aefe1e77caa30b53cb18 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Fri, 29 Sep 2006 01:58:55 -0700
Subject: [PATCH] kernel/params: driver layer error checking

Check driver layer return values in kernel/params.c

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/params.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 91aea7aa532e..63d432d0ebc0 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -547,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name,
 					    unsigned int name_skip)
 {
 	struct module_kobject *mk;
+	int ret;
 
 	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
 	BUG_ON(!mk);
@@ -554,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
 	mk->mod = THIS_MODULE;
 	kobj_set_kset_s(mk, module_subsys);
 	kobject_set_name(&mk->kobj, name);
-	kobject_register(&mk->kobj);
+	ret = kobject_register(&mk->kobj);
+	BUG_ON(ret < 0);
 
 	/* no need to keep the kobject if no parameter is exported */
 	if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
@@ -684,7 +686,14 @@ decl_subsys(module, &module_ktype, NULL);
  */
 static int __init param_sysfs_init(void)
 {
-	subsystem_register(&module_subsys);
+	int ret;
+
+	ret = subsystem_register(&module_subsys);
+	if (ret < 0) {
+		printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
+			__FILE__, __LINE__, ret);
+		return ret;
+	}
 
 	param_sysfs_builtin();
 
-- 
cgit v1.2.3


From 4c78a6639386f9e7399fbc0d0a173d4cc1a3e9bf Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Fri, 29 Sep 2006 01:59:10 -0700
Subject: [PATCH] kernel-doc for relay interface

Add relay interface support to DocBook/kernel-api.tmpl.  Fix typos etc.  in
relay.c and relayfs.txt.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/relay.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 85786ff2a4f9..1d63ecddfa70 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
  *	@buf: the buffer struct
  *	@size: total size of the buffer
  *
- *	Returns a pointer to the resulting buffer, NULL if unsuccessful. The
+ *	Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
  *	passed in size will get page aligned, if it isn't already.
  */
 static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
@@ -132,10 +132,9 @@ depopulate:
 
 /**
  *	relay_create_buf - allocate and initialize a channel buffer
- *	@alloc_size: size of the buffer to allocate
- *	@n_subbufs: number of sub-buffers in the channel
+ *	@chan: the relay channel
  *
- *	Returns channel buffer if successful, NULL otherwise
+ *	Returns channel buffer if successful, %NULL otherwise.
  */
 struct rchan_buf *relay_create_buf(struct rchan *chan)
 {
@@ -163,6 +162,7 @@ free_buf:
 
 /**
  *	relay_destroy_channel - free the channel struct
+ *	@kref: target kernel reference that contains the relay channel
  *
  *	Should only be called from kref_put().
  */
@@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
 
 /**
  *	relay_remove_buf - remove a channel buffer
+ *	@kref: target kernel reference that contains the relay buffer
  *
  *	Removes the file from the fileystem, which also frees the
  *	rchan_buf_struct and the channel buffer.  Should only be called from
@@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan)
 }
 EXPORT_SYMBOL_GPL(relay_reset);
 
-/**
+/*
  *	relay_open_buf - create a new relay channel buffer
  *
  *	Internal - used by relay_open().
@@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan,
 /**
  *	relay_open - create a new relay channel
  *	@base_filename: base name of files to create
- *	@parent: dentry of parent directory, NULL for root directory
+ *	@parent: dentry of parent directory, %NULL for root directory
  *	@subbuf_size: size of sub-buffers
  *	@n_subbufs: number of sub-buffers
  *	@cb: client callback functions
  *
- *	Returns channel pointer if successful, NULL otherwise.
+ *	Returns channel pointer if successful, %NULL otherwise.
  *
  *	Creates a channel buffer for each cpu using the sizes and
  *	attributes specified.  The created channel buffer files
@@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf);
  *	subbufs_consumed should be the number of sub-buffers newly consumed,
  *	not the total consumed.
  *
- *	NOTE: kernel clients don't need to call this function if the channel
+ *	NOTE: Kernel clients don't need to call this function if the channel
  *	mode is 'overwrite'.
  */
 void relay_subbufs_consumed(struct rchan *chan,
@@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close);
  *	relay_flush - close the channel
  *	@chan: the channel
  *
- *	Flushes all channel buffers i.e. forces buffer switch.
+ *	Flushes all channel buffers, i.e. forces buffer switch.
  */
 void relay_flush(struct rchan *chan)
 {
@@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-/**
+/*
  *	relay_file_read_consume - update the consumed count for the buffer
  */
 static void relay_file_read_consume(struct rchan_buf *buf,
@@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf,
 	}
 }
 
-/**
+/*
  *	relay_file_read_avail - boolean, are there unconsumed bytes available?
  */
 static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
@@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
 
 /**
  *	relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ *	@read_pos: file read position
+ *	@buf: relay channel buffer
  */
 static size_t relay_file_read_subbuf_avail(size_t read_pos,
 					   struct rchan_buf *buf)
@@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
 
 /**
  *	relay_file_read_start_pos - find the first available byte to read
+ *	@read_pos: file read position
+ *	@buf: relay channel buffer
  *
  *	If the read_pos is in the middle of padding, return the
  *	position of the first actually available byte, otherwise
@@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos,
 
 /**
  *	relay_file_read_end_pos - return the new read position
+ *	@read_pos: file read position
+ *	@buf: relay channel buffer
+ *	@count: number of bytes to be read
  */
 static size_t relay_file_read_end_pos(struct rchan_buf *buf,
 				      size_t read_pos,
@@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
 	return end_pos;
 }
 
-/**
+/*
  *	subbuf_read_actor - read up to one subbuf's worth of data
  */
 static int subbuf_read_actor(size_t read_start,
@@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start,
 	return ret;
 }
 
-/**
+/*
  *	subbuf_send_actor - send up to one subbuf's worth of data
  */
 static int subbuf_send_actor(size_t read_start,
@@ -933,7 +941,7 @@ typedef int (*subbuf_actor_t) (size_t read_start,
 			       read_descriptor_t *desc,
 			       read_actor_t actor);
 
-/**
+/*
  *	relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
  */
 static inline ssize_t relay_file_read_subbufs(struct file *filp,
-- 
cgit v1.2.3


From 2ff6fd8f4a40b72ff35dbb1e08eb9ed6b64b6028 Mon Sep 17 00:00:00 2001
From: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Date: Fri, 29 Sep 2006 01:59:21 -0700
Subject: [PATCH] irq: fixed coding style

Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ac1f850d4937..3c3fec55f6d1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -146,7 +146,7 @@ static void default_disable(unsigned int irq)
 	struct irq_desc *desc = irq_desc + irq;
 
 	if (!(desc->status & IRQ_DELAYED_DISABLE))
-		irq_desc[irq].chip->mask(irq);
+		desc->chip->mask(irq);
 }
 
 /*
-- 
cgit v1.2.3


From 538d9d532b0e0320c9dd326a560b5a72d73f910d Mon Sep 17 00:00:00 2001
From: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Date: Fri, 29 Sep 2006 01:59:22 -0700
Subject: [PATCH] irq: remove a extra line

Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/chip.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3c3fec55f6d1..736cb0bd498f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -40,10 +40,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
 	spin_lock_irqsave(&desc->lock, flags);
 	irq_chip_set_defaults(chip);
 	desc->chip = chip;
-	/*
-	 * For compatibility only:
-	 */
-	desc->chip = chip;
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	return 0;
-- 
cgit v1.2.3


From a49a4af759c0193d42aeaeefb4df7de8973dd713 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Fri, 29 Sep 2006 01:59:30 -0700
Subject: [PATCH] rcu: add lock annotations to
 rcu{,_bh}_torture_read_{lock,unlock}

rcu_torture_read_lock and rcu_bh_torture_read_lock acquire locks without
releasing them, and the matching functions rcu_torture_read_unlock and
rcu_bh_torture_read_unlock get called with the corresponding locks held and
release them.  Add lock annotations to these four functions so that sparse
can check callers for lock pairing, and so that sparse will not complain
about these functions since they intentionally use locks in this manner.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 4d1c3d247127..4f2c4272d59c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -192,13 +192,13 @@ static struct rcu_torture_ops *cur_ops = NULL;
  * Definitions for rcu torture testing.
  */
 
-static int rcu_torture_read_lock(void)
+static int rcu_torture_read_lock(void) __acquires(RCU)
 {
 	rcu_read_lock();
 	return 0;
 }
 
-static void rcu_torture_read_unlock(int idx)
+static void rcu_torture_read_unlock(int idx) __releases(RCU)
 {
 	rcu_read_unlock();
 }
@@ -250,13 +250,13 @@ static struct rcu_torture_ops rcu_ops = {
  * Definitions for rcu_bh torture testing.
  */
 
-static int rcu_bh_torture_read_lock(void)
+static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
 {
 	rcu_read_lock_bh();
 	return 0;
 }
 
-static void rcu_bh_torture_read_unlock(int idx)
+static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
 {
 	rcu_read_unlock_bh();
 }
-- 
cgit v1.2.3


From d10be6d1bdb0c901b78244872de3cc1c1b6c3fb2 Mon Sep 17 00:00:00 2001
From: Mark Huang <mlhuang@CS.Princeton.EDU>
Date: Fri, 29 Sep 2006 01:59:34 -0700
Subject: [PATCH] module_subsys: initialize earlier

Initialize module_subsys earlier (or at least earlier than devices) since
it could be used very early in the boot process if kmod loads a module
before the device initcalls.  Otherwise, kmod will crash in
kernel/module.c:mod_sysfs_setup() since the kset in module_subsys is not
initialized yet.

I only noticed this problem because occasionally, kmod loads the modules
for my SCSI and Ethernet adapters very early, during the boot process
itself.  I don't quite understand why it loads them sometimes and doesn't
load them other times.  Or who is telling kmod to do so.  Can someone
explain?

Signed-off-by: Mark Huang <mlhuang@cs.princeton.edu>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 63d432d0ebc0..f406655d6653 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -699,7 +699,7 @@ static int __init param_sysfs_init(void)
 
 	return 0;
 }
-__initcall(param_sysfs_init);
+subsys_initcall(param_sysfs_init);
 
 EXPORT_SYMBOL(param_set_byte);
 EXPORT_SYMBOL(param_get_byte);
-- 
cgit v1.2.3


From 89e7e374dde1015d69d2d70797ae4053b14fa9db Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Fri, 29 Sep 2006 01:59:36 -0700
Subject: [PATCH] timer: add lock annotation to lock_timer_base

lock_timer_base acquires a lock and returns with that lock held.  Add a
lock annotation to this function so that sparse can check callers for lock
pairing, and so that sparse will not complain about this function since it
intentionally uses the lock in this manner.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 1d7dd6267c2d..6c9fa80088ed 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -175,6 +175,7 @@ static inline void detach_timer(struct timer_list *timer,
  */
 static tvec_base_t *lock_timer_base(struct timer_list *timer,
 					unsigned long *flags)
+	__acquires(timer->base->lock)
 {
 	tvec_base_t *base;
 
-- 
cgit v1.2.3


From 6c5c934153513dc72e2d6464f39e8ef1f27c0a3e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 29 Sep 2006 01:59:40 -0700
Subject: [PATCH] ifdef blktrace debugging fields

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 802b1cf0e63f..bca6ce6d3ded 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -183,7 +183,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
 	tsk->btrace_seq = 0;
+#endif
 	tsk->splice_pipe = NULL;
 	return tsk;
 }
-- 
cgit v1.2.3


From db630637b2f192bea2ba1c000e9cbe4e542a03ea Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 29 Sep 2006 01:59:44 -0700
Subject: [PATCH] clean up and remove some extra spinlocks from rtmutex

Oleg brought up some interesting points about grabbing the pi_lock for some
protections.  In this discussion, I realized that there are some places
that the pi_lock is being grabbed when it really wasn't necessary.  Also
this patch does a little bit of clean up.

This patch basically does three things:

1) renames the "boost" variable to "chain_walk".  Since it is used in
   the debugging case when it isn't going to be boosted.  It better
   describes what the test is going to do if it succeeds.

2) moves get_task_struct to just before the unlocking of the wait_lock.
   This removes duplicate code, and makes it a little easier to read.  The
   owner wont go away while either the pi_lock or the wait_lock are held.

3) removes the pi_locking and owner blocked checking completely from the
   debugging case.  This is because the grabbing the lock and doing the
   check, then releasing the lock is just so full of races.  It's just as
   good to go ahead and call the pi_chain_walk function, since after
   releasing the lock the owner can then block anyway, and we would have
   missed that.  For the debug case, we really do want to do the chain walk
   to test for deadlocks anyway.

[oleg@tv-sign.ru: more of the same]
Signed-of-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Esben Nielsen <nielsen.esben@googlemail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rtmutex.c | 51 +++++++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 3e13a1e5856f..4ab17da46fd8 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -251,6 +251,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 
 	/* Grab the next task */
 	task = rt_mutex_owner(lock);
+	get_task_struct(task);
 	spin_lock_irqsave(&task->pi_lock, flags);
 
 	if (waiter == rt_mutex_top_waiter(lock)) {
@@ -269,7 +270,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 		__rt_mutex_adjust_prio(task);
 	}
 
-	get_task_struct(task);
 	spin_unlock_irqrestore(&task->pi_lock, flags);
 
 	top_waiter = rt_mutex_top_waiter(lock);
@@ -409,7 +409,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	struct task_struct *owner = rt_mutex_owner(lock);
 	struct rt_mutex_waiter *top_waiter = waiter;
 	unsigned long flags;
-	int boost = 0, res;
+	int chain_walk = 0, res;
 
 	spin_lock_irqsave(&current->pi_lock, flags);
 	__rt_mutex_adjust_prio(current);
@@ -433,25 +433,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 		plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
 
 		__rt_mutex_adjust_prio(owner);
-		if (owner->pi_blocked_on) {
-			boost = 1;
-			/* gets dropped in rt_mutex_adjust_prio_chain()! */
-			get_task_struct(owner);
-		}
-		spin_unlock_irqrestore(&owner->pi_lock, flags);
-	}
-	else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
-		spin_lock_irqsave(&owner->pi_lock, flags);
-		if (owner->pi_blocked_on) {
-			boost = 1;
-			/* gets dropped in rt_mutex_adjust_prio_chain()! */
-			get_task_struct(owner);
-		}
+		if (owner->pi_blocked_on)
+			chain_walk = 1;
 		spin_unlock_irqrestore(&owner->pi_lock, flags);
 	}
-	if (!boost)
+	else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
+		chain_walk = 1;
+
+	if (!chain_walk)
 		return 0;
 
+	/*
+	 * The owner can't disappear while holding a lock,
+	 * so the owner struct is protected by wait_lock.
+	 * Gets dropped in rt_mutex_adjust_prio_chain()!
+	 */
+	get_task_struct(owner);
+
 	spin_unlock(&lock->wait_lock);
 
 	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
@@ -532,7 +530,7 @@ static void remove_waiter(struct rt_mutex *lock,
 	int first = (waiter == rt_mutex_top_waiter(lock));
 	struct task_struct *owner = rt_mutex_owner(lock);
 	unsigned long flags;
-	int boost = 0;
+	int chain_walk = 0;
 
 	spin_lock_irqsave(&current->pi_lock, flags);
 	plist_del(&waiter->list_entry, &lock->wait_list);
@@ -554,19 +552,20 @@ static void remove_waiter(struct rt_mutex *lock,
 		}
 		__rt_mutex_adjust_prio(owner);
 
-		if (owner->pi_blocked_on) {
-			boost = 1;
-			/* gets dropped in rt_mutex_adjust_prio_chain()! */
-			get_task_struct(owner);
-		}
+		if (owner->pi_blocked_on)
+			chain_walk = 1;
+
 		spin_unlock_irqrestore(&owner->pi_lock, flags);
 	}
 
 	WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
 
-	if (!boost)
+	if (!chain_walk)
 		return;
 
+	/* gets dropped in rt_mutex_adjust_prio_chain()! */
+	get_task_struct(owner);
+
 	spin_unlock(&lock->wait_lock);
 
 	rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
@@ -592,10 +591,10 @@ void rt_mutex_adjust_pi(struct task_struct *task)
 		return;
 	}
 
-	/* gets dropped in rt_mutex_adjust_prio_chain()! */
-	get_task_struct(task);
 	spin_unlock_irqrestore(&task->pi_lock, flags);
 
+	/* gets dropped in rt_mutex_adjust_prio_chain()! */
+	get_task_struct(task);
 	rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
 }
 
-- 
cgit v1.2.3


From 2aae4a108dab8b8bc92270335f6e4b5c146b32ae Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eike-kernel@sf-tec.de>
Date: Fri, 29 Sep 2006 01:59:46 -0700
Subject: [PATCH] Fix kerneldoc comments in kernel/timer.c

Some of the kerneldoc comments in this file are ignored since the lead-in
is malformed, using either "/*" or "/***" instead of "/**".

[rdunlap@xenotime.net: kerneldoc fixes]
Signed-off-by: Rolf Eike Beer <eike-kernel@sf-tec.de>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 6c9fa80088ed..d644f4e9ca0c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -136,7 +136,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 	list_add_tail(&timer->entry, vec);
 }
 
-/***
+/**
  * init_timer - initialize a timer.
  * @timer: the timer to be initialized
  *
@@ -236,7 +236,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 
 EXPORT_SYMBOL(__mod_timer);
 
-/***
+/**
  * add_timer_on - start a timer on a particular CPU
  * @timer: the timer to be added
  * @cpu: the CPU to start it on
@@ -256,9 +256,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
 }
 
 
-/***
+/**
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
+ * @expires: new timeout in jiffies
  *
  * mod_timer is a more efficient way to update the expire field of an
  * active timer (if the timer is inactive it will be activated)
@@ -292,7 +293,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
 
 EXPORT_SYMBOL(mod_timer);
 
-/***
+/**
  * del_timer - deactive a timer.
  * @timer: the timer to be deactivated
  *
@@ -324,7 +325,10 @@ int del_timer(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer);
 
 #ifdef CONFIG_SMP
-/*
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: timer do del
+ *
  * This function tries to deactivate a timer. Upon successful (ret >= 0)
  * exit the timer is not queued and the handler is not running on any CPU.
  *
@@ -352,7 +356,7 @@ out:
 	return ret;
 }
 
-/***
+/**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
  *
@@ -402,15 +406,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
 	return index;
 }
 
-/***
+#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+
+/**
  * __run_timers - run all expired timers (if any) on this CPU.
  * @base: the timer vector to be processed.
  *
  * This function cascades all vectors and executes all expired timer
  * vectors.
  */
-#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
-
 static inline void __run_timers(tvec_base_t *base)
 {
 	struct timer_list *timer;
@@ -971,7 +975,7 @@ void __init timekeeping_init(void)
 
 
 static int timekeeping_suspended;
-/*
+/**
  * timekeeping_resume - Resumes the generic timekeeping subsystem.
  * @dev:	unused
  *
@@ -1107,7 +1111,7 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
 	clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
 }
 
-/*
+/**
  * update_wall_time - Uses the current clocksource to increment the wall time
  *
  * Called from the timer interrupt, must hold a write on xtime_lock.
@@ -1471,8 +1475,9 @@ asmlinkage long sys_gettid(void)
 	return current->pid;
 }
 
-/*
+/**
  * sys_sysinfo - fill in sysinfo struct
+ * @info: pointer to buffer to fill
  */ 
 asmlinkage long sys_sysinfo(struct sysinfo __user *info)
 {
-- 
cgit v1.2.3


From e6cab99bb478e067b1a7a120333ff326954a2412 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Fri, 29 Sep 2006 01:59:57 -0700
Subject: [PATCH] unwind: fix unused variable warning when !CONFIG_MODULES

Fix "variable defined but not used" compiler warning in unwind.c when
CONFIG_MODULES is not set.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/unwind.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/unwind.c b/kernel/unwind.c
index 3430475fcd88..2e2368607aab 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -102,7 +102,7 @@ static struct unwind_table {
 	unsigned long size;
 	struct unwind_table *link;
 	const char *name;
-} root_table, *last_table;
+} root_table;
 
 struct unwind_item {
 	enum item_location {
@@ -174,6 +174,8 @@ void __init unwind_init(void)
 
 #ifdef CONFIG_MODULES
 
+static struct unwind_table *last_table;
+
 /* Must be called with module_mutex held. */
 void *unwind_add_table(struct module *module,
                        const void *table_start,
-- 
cgit v1.2.3


From 3b9b8ab65d8eed784b9164d03807cb2bda7b5cd6 Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@sw.ru>
Date: Fri, 29 Sep 2006 02:00:05 -0700
Subject: [PATCH] Fix unserialized task->files changing

Fixed race on put_files_struct on exec with proc.  Restoring files on
current on error path may lead to proc having a pointer to already kfree-d
files_struct.

->files changing at exit.c and khtread.c are safe as exit_files() makes all
things under lock.

Found during OpenVZ stress testing.

[akpm@osdl.org: add export]
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index d891883420f7..4b6fb054b25d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -487,6 +487,18 @@ void fastcall put_files_struct(struct files_struct *files)
 
 EXPORT_SYMBOL(put_files_struct);
 
+void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
+{
+	struct files_struct *old;
+
+	old = tsk->files;
+	task_lock(tsk);
+	tsk->files = files;
+	task_unlock(tsk);
+	put_files_struct(old);
+}
+EXPORT_SYMBOL(reset_files_struct);
+
 static inline void __exit_files(struct task_struct *tsk)
 {
 	struct files_struct * files = tsk->files;
-- 
cgit v1.2.3


From f400e198b2ed26ce55b22a1412ded0896e7516ac Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Date: Fri, 29 Sep 2006 02:00:07 -0700
Subject: [PATCH] pidspace: is_init()

This is an updated version of Eric Biederman's is_init() patch.
(http://lkml.org/lkml/2006/2/6/280).  It applies cleanly to 2.6.18-rc3 and
replaces a few more instances of ->pid == 1 with is_init().

Further, is_init() checks pid and thus removes dependency on Eric's other
patches for now.

Eric's original description:

	There are a lot of places in the kernel where we test for init
	because we give it special properties.  Most  significantly init
	must not die.  This results in code all over the kernel test
	->pid == 1.

	Introduce is_init to capture this case.

	With multiple pid spaces for all of the cases affected we are
	looking for only the first process on the system, not some other
	process that has pid == 1.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: <lxc-devel@lists.sourceforge.net>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/capability.c | 2 +-
 kernel/cpuset.c     | 2 +-
 kernel/exit.c       | 2 +-
 kernel/kexec.c      | 2 +-
 kernel/ptrace.c     | 1 +
 kernel/sysctl.c     | 2 +-
 6 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index c7685ad00a97..edb845a6e84a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -133,7 +133,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
      int found = 0;
 
      do_each_thread(g, target) {
-             if (target == current || target->pid == 1)
+             if (target == current || is_init(target))
                      continue;
              found = 1;
 	     if (security_capset_check(target, effective, inheritable,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1b32c2c04c15..584bb4e6c042 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -240,7 +240,7 @@ static struct super_block *cpuset_sb;
  * A cpuset can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cpusets is empty.  Since all
  * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
  * always has either children cpusets and/or using tasks.  So we don't
  * need a special hack to ensure that top_cpuset cannot be deleted.
  *
diff --git a/kernel/exit.c b/kernel/exit.c
index 4b6fb054b25d..9961192d6055 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -219,7 +219,7 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
 		if (p == ignored_task
 				|| p->exit_state
-				|| p->real_parent->pid == 1)
+				|| is_init(p->real_parent))
 			continue;
 		if (process_group(p->real_parent) != pgrp
 			    && p->real_parent->signal->session == p->signal->session) {
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 50087ecf337e..37cad75cf494 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -40,7 +40,7 @@ struct resource crashk_res = {
 
 int kexec_should_crash(struct task_struct *p)
 {
-	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+	if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
 		return 1;
 	return 0;
 }
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8aad0331d82e..4d50e06fd745 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -440,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
 	child = find_task_by_pid(pid);
 	if (child)
 		get_task_struct(child);
+
 	read_unlock(&tasklist_lock);
 	if (!child)
 		return ERR_PTR(-ESRCH);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bfa7d117c54..9535a3839930 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1915,7 +1915,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
 		return -EPERM;
 	}
 
-	op = (current->pid == 1) ? OP_SET : OP_AND;
+	op = is_init(current) ? OP_SET : OP_AND;
 	return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
 				do_proc_dointvec_bset_conv,&op);
 }
-- 
cgit v1.2.3


From 99de055ac065e19ed69de961e97c6336a261b34e Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Fri, 29 Sep 2006 02:00:10 -0700
Subject: [PATCH] lockdep: print kernel version

Lets do the same thing we do for oopses - print out the version in the
report.  It's an extra line of output though.  We could tack it on the end
of the INFO: lines, but that screws up Ingo's pretty output.

Signed-off-by: Dave Jones <davej@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c088e5542e84..df1c3594de31 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -36,6 +36,7 @@
 #include <linux/stacktrace.h>
 #include <linux/debug_locks.h>
 #include <linux/irqflags.h>
+#include <linux/utsname.h>
 
 #include <asm/sections.h>
 
@@ -515,6 +516,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
 	return 0;
 }
 
+static void print_kernel_version(void)
+{
+	printk("%s %.*s\n", system_utsname.release,
+		(int)strcspn(system_utsname.version, " "),
+		system_utsname.version);
+}
+
 /*
  * When a circular dependency is detected, print the
  * header first:
@@ -531,6 +539,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
 
 	printk("\n=======================================================\n");
 	printk(  "[ INFO: possible circular locking dependency detected ]\n");
+	print_kernel_version();
 	printk(  "-------------------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, curr->pid);
@@ -712,6 +721,7 @@ print_bad_irq_dependency(struct task_struct *curr,
 	printk("\n======================================================\n");
 	printk(  "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
 		irqclass, irqclass);
+	print_kernel_version();
 	printk(  "------------------------------------------------------\n");
 	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, curr->pid,
@@ -793,6 +803,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
 
 	printk("\n=============================================\n");
 	printk(  "[ INFO: possible recursive locking detected ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, curr->pid);
@@ -1375,6 +1386,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
 
 	printk("\n=========================================================\n");
 	printk(  "[ INFO: possible irq lock inversion dependency detected ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------------------------------\n");
 	printk("%s/%d just changed the state of lock:\n",
 		curr->comm, curr->pid);
@@ -1469,6 +1481,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 
 	printk("\n=================================\n");
 	printk(  "[ INFO: inconsistent lock state ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------\n");
 
 	printk("inconsistent {%s} -> {%s} usage.\n",
-- 
cgit v1.2.3


From a45bce49545739a940f8bd4ca85c3b7435564893 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Fri, 29 Sep 2006 02:00:11 -0700
Subject: [PATCH] memory ordering in __kfifo primitives

Both __kfifo_put() and __kfifo_get() have header comments stating that if
there is but one concurrent reader and one concurrent writer, locking is not
necessary.  This is almost the case, but a couple of memory barriers are
needed.  Another option would be to change the header comments to remove the
bit about locking not being needed, and to change the those callers who
currently don't use locking to add the required locking.  The attachment
analyzes this approach, but the patch below seems simpler.

Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Cc: Stelian Pop <stelian@popies.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kfifo.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 64ab045c3d9d..5d1d907378a2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
 
 	len = min(len, fifo->size - fifo->in + fifo->out);
 
+	/*
+	 * Ensure that we sample the fifo->out index -before- we
+	 * start putting bytes into the kfifo.
+	 */
+
+	smp_mb();
+
 	/* first put the data starting from fifo->in to buffer end */
 	l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
 	memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
@@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
 	/* then put the rest (if any) at the beginning of the buffer */
 	memcpy(fifo->buffer, buffer + l, len - l);
 
+	/*
+	 * Ensure that we add the bytes to the kfifo -before-
+	 * we update the fifo->in index.
+	 */
+
+	smp_wmb();
+
 	fifo->in += len;
 
 	return len;
@@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
 
 	len = min(len, fifo->in - fifo->out);
 
+	/*
+	 * Ensure that we sample the fifo->in index -before- we
+	 * start removing bytes from the kfifo.
+	 */
+
+	smp_rmb();
+
 	/* first get the data from fifo->out until the end of the buffer */
 	l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
 	memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
@@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
 	/* then get the rest (if any) from the beginning of the buffer */
 	memcpy(buffer + l, fifo->buffer, len - l);
 
+	/*
+	 * Ensure that we remove the bytes from the kfifo -before-
+	 * we update the fifo->out index.
+	 */
+
+	smp_mb();
+
 	fifo->out += len;
 
 	return len;
-- 
cgit v1.2.3


From 07dccf3344010f9b9df7fe725da7e73bca2992df Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Fri, 29 Sep 2006 02:00:22 -0700
Subject: [PATCH] check return value of cpu_callback

Spawing ksoftirqd, migration, or watchdog, and calling init_timers_cpu()
may fail with small memory.  If it happens in initcalls, kernel NULL
pointer dereference happens later.  This patch makes crash happen
immediately in such cases.  It seems a bit better than getting kernel NULL
pointer dereference later.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c      | 4 +++-
 kernel/softirq.c    | 4 +++-
 kernel/softlockup.c | 3 ++-
 kernel/timer.c      | 4 +++-
 4 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5c848fd4e461..b946209d9c15 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5272,9 +5272,11 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
+	int err;
 
 	/* Start one for the boot CPU: */
-	migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3789ca98197c..bf25015dce16 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -612,7 +612,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 __init int spawn_ksoftirqd(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
-	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+
+	BUG_ON(err == NOTIFY_BAD);
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 03e6a2b0b787..50afeb813305 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -149,8 +149,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 __init void spawn_softlockup_task(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
+	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
 
-	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	BUG_ON(err == NOTIFY_BAD);
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index d644f4e9ca0c..a2cb1ecb1b28 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1694,8 +1694,10 @@ static struct notifier_block __cpuinitdata timers_nb = {
 
 void __init init_timers(void)
 {
-	timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
+	int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
 				(void *)(long)smp_processor_id());
+
+	BUG_ON(err == NOTIFY_BAD);
 	register_cpu_notifier(&timers_nb);
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
 }
-- 
cgit v1.2.3


From 1711ef3866b0360e102327389fe4b76c849bbe83 Mon Sep 17 00:00:00 2001
From: Toyo Abe <toyoa@mvista.com>
Date: Fri, 29 Sep 2006 02:00:28 -0700
Subject: [PATCH] posix-timers: Fix clock_nanosleep() doesn't return the
 remaining time in compatibility mode

The clock_nanosleep() function does not return the time remaining when the
sleep is interrupted by a signal.

This patch creates a new call out, compat_clock_nanosleep_restart(), which
handles returning the remaining time after a sleep is interrupted.  This
patch revives clock_nanosleep_restart().  It is now accessed via the new
call out.  The compat_clock_nanosleep_restart() is used for compatibility
access.

Since this is implemented in compatibility mode the normal path is
virtually unaffected - no real performance impact.

Signed-off-by: Toyo Abe <toyoa@mvista.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/compat.c           | 33 +++++++++++++++++++++++++++++++++
 kernel/hrtimer.c          | 20 ++++++++++----------
 kernel/posix-cpu-timers.c | 17 ++++++++++++-----
 kernel/posix-timers.c     | 21 +++++++++++++++++++++
 4 files changed, 76 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 126dee9530aa..75573e5d27b0 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -22,6 +22,7 @@
 #include <linux/security.h>
 #include <linux/timex.h>
 #include <linux/migrate.h>
+#include <linux/posix-timers.h>
 
 #include <asm/uaccess.h>
 
@@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock,
 	return err;
 } 
 
+static long compat_clock_nanosleep_restart(struct restart_block *restart)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec tu;
+	struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1);
+
+	restart->arg1 = (unsigned long) &tu;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = clock_nanosleep_restart(restart);
+	set_fs(oldfs);
+
+	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+	    put_compat_timespec(&tu, rmtp))
+		return -EFAULT;
+
+	if (err == -ERESTART_RESTARTBLOCK) {
+		restart->fn = compat_clock_nanosleep_restart;
+		restart->arg1 = (unsigned long) rmtp;
+	}
+	return err;
+}
+
 long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 			    struct compat_timespec __user *rqtp,
 			    struct compat_timespec __user *rmtp)
@@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 	long err;
 	mm_segment_t oldfs;
 	struct timespec in, out; 
+	struct restart_block *restart;
 
 	if (get_compat_timespec(&in, rqtp)) 
 		return -EFAULT;
@@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 				  (struct timespec __user *) &in,
 				  (struct timespec __user *) &out);
 	set_fs(oldfs);
+
 	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
 	    put_compat_timespec(&out, rmtp))
 		return -EFAULT;
+
+	if (err == -ERESTART_RESTARTBLOCK) {
+		restart = &current_thread_info()->restart_block;
+		restart->fn = compat_clock_nanosleep_restart;
+		restart->arg1 = (unsigned long) rmtp;
+	}
 	return err;	
 } 
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 21c38a7e666b..d0ba190dfeb6 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -693,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	return t->task == NULL;
 }
 
-static long __sched nanosleep_restart(struct restart_block *restart)
+long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 {
 	struct hrtimer_sleeper t;
 	struct timespec __user *rmtp;
@@ -702,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart)
 
 	restart->fn = do_no_restart_syscall;
 
-	hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
-	t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+	hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS);
+	t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
 
 	if (do_nanosleep(&t, HRTIMER_ABS))
 		return 0;
 
-	rmtp = (struct timespec __user *) restart->arg2;
+	rmtp = (struct timespec __user *) restart->arg1;
 	if (rmtp) {
 		time = ktime_sub(t.timer.expires, t.timer.base->get_time());
 		if (time.tv64 <= 0)
@@ -718,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
 			return -EFAULT;
 	}
 
-	restart->fn = nanosleep_restart;
+	restart->fn = hrtimer_nanosleep_restart;
 
 	/* The other values in restart are already filled in */
 	return -ERESTART_RESTARTBLOCK;
@@ -751,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	}
 
 	restart = &current_thread_info()->restart_block;
-	restart->fn = nanosleep_restart;
-	restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
-	restart->arg1 = t.timer.expires.tv64 >> 32;
-	restart->arg2 = (unsigned long) rmtp;
-	restart->arg3 = (unsigned long) t.timer.base->index;
+	restart->fn = hrtimer_nanosleep_restart;
+	restart->arg0 = (unsigned long) t.timer.base->index;
+	restart->arg1 = (unsigned long) rmtp;
+	restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF;
+	restart->arg3 = t.timer.expires.tv64 >> 32;
 
 	return -ERESTART_RESTARTBLOCK;
 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d38d9ec3276c..5667fef89dd1 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1393,8 +1393,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	}
 }
 
-static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
-
 int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		     struct timespec *rqtp, struct timespec __user *rmtp)
 {
@@ -1471,7 +1469,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		    copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
 			return -EFAULT;
 
-		restart_block->fn = posix_cpu_clock_nanosleep_restart;
+		restart_block->fn = posix_cpu_nsleep_restart;
 		/* Caller already set restart_block->arg1 */
 		restart_block->arg0 = which_clock;
 		restart_block->arg1 = (unsigned long) rmtp;
@@ -1484,8 +1482,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 	return error;
 }
 
-static long
-posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
+long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
 	clockid_t which_clock = restart_block->arg0;
 	struct timespec __user *rmtp;
@@ -1524,6 +1521,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags,
 {
 	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
 }
+static long process_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+	return -EINVAL;
+}
 static int thread_cpu_clock_getres(const clockid_t which_clock,
 				   struct timespec *tp)
 {
@@ -1544,6 +1545,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
 {
 	return -EINVAL;
 }
+static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+	return -EINVAL;
+}
 
 static __init int init_posix_cpu_timers(void)
 {
@@ -1553,6 +1558,7 @@ static __init int init_posix_cpu_timers(void)
 		.clock_set = do_posix_clock_nosettime,
 		.timer_create = process_cpu_timer_create,
 		.nsleep = process_cpu_nsleep,
+		.nsleep_restart = process_cpu_nsleep_restart,
 	};
 	struct k_clock thread = {
 		.clock_getres = thread_cpu_clock_getres,
@@ -1560,6 +1566,7 @@ static __init int init_posix_cpu_timers(void)
 		.clock_set = do_posix_clock_nosettime,
 		.timer_create = thread_cpu_timer_create,
 		.nsleep = thread_cpu_nsleep,
+		.nsleep_restart = thread_cpu_nsleep_restart,
 	};
 
 	register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ac6dc8744429..e5ebcc1ec3a0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
 	return CLOCK_DISPATCH(which_clock, nsleep,
 			      (which_clock, flags, &t, rmtp));
 }
+
+/*
+ * nanosleep_restart for monotonic and realtime clocks
+ */
+static int common_nsleep_restart(struct restart_block *restart_block)
+{
+	return hrtimer_nanosleep_restart(restart_block);
+}
+
+/*
+ * This will restart clock_nanosleep. This is required only by
+ * compat_clock_nanosleep_restart for now.
+ */
+long
+clock_nanosleep_restart(struct restart_block *restart_block)
+{
+	clockid_t which_clock = restart_block->arg0;
+
+	return CLOCK_DISPATCH(which_clock, nsleep_restart,
+			      (restart_block));
+}
-- 
cgit v1.2.3


From e4b765551aa6355eae60b644bed851a9477c4e2b Mon Sep 17 00:00:00 2001
From: Toyo Abe <toyoa@mvista.com>
Date: Fri, 29 Sep 2006 02:00:29 -0700
Subject: [PATCH] posix-timers: Fix the flags handling in posix_cpu_nsleep()

When a posix_cpu_nsleep() sleep is interrupted by a signal more than twice, it
incorrectly reports the sleep time remaining to the user.  Because
posix_cpu_nsleep() doesn't report back to the user when it's called from
restart function due to the wrong flags handling.

This patch, which applies after previous one, moves the nanosleep() function
from posix_cpu_nsleep() to do_cpu_nanosleep() and cleans up the flags handling
appropriately.

Signed-off-by: Toyo Abe <toyoa@mvista.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 84 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 58 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5667fef89dd1..479b16b44f79 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1393,22 +1393,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	}
 }
 
-int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-		     struct timespec *rqtp, struct timespec __user *rmtp)
+static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
+			    struct timespec *rqtp, struct itimerspec *it)
 {
-	struct restart_block *restart_block =
-	    &current_thread_info()->restart_block;
 	struct k_itimer timer;
 	int error;
 
-	/*
-	 * Diagnose required errors first.
-	 */
-	if (CPUCLOCK_PERTHREAD(which_clock) &&
-	    (CPUCLOCK_PID(which_clock) == 0 ||
-	     CPUCLOCK_PID(which_clock) == current->pid))
-		return -EINVAL;
-
 	/*
 	 * Set up a temporary timer and then wait for it to go off.
 	 */
@@ -1420,11 +1410,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 	timer.it_process = current;
 	if (!error) {
 		static struct itimerspec zero_it;
-		struct itimerspec it = { .it_value = *rqtp,
-					 .it_interval = {} };
+
+		memset(it, 0, sizeof *it);
+		it->it_value = *rqtp;
 
 		spin_lock_irq(&timer.it_lock);
-		error = posix_cpu_timer_set(&timer, flags, &it, NULL);
+		error = posix_cpu_timer_set(&timer, flags, it, NULL);
 		if (error) {
 			spin_unlock_irq(&timer.it_lock);
 			return error;
@@ -1452,33 +1443,56 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		 * We were interrupted by a signal.
 		 */
 		sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
-		posix_cpu_timer_set(&timer, 0, &zero_it, &it);
+		posix_cpu_timer_set(&timer, 0, &zero_it, it);
 		spin_unlock_irq(&timer.it_lock);
 
-		if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
+		if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
 			/*
 			 * It actually did fire already.
 			 */
 			return 0;
 		}
 
+		error = -ERESTART_RESTARTBLOCK;
+	}
+
+	return error;
+}
+
+int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+		     struct timespec *rqtp, struct timespec __user *rmtp)
+{
+	struct restart_block *restart_block =
+	    &current_thread_info()->restart_block;
+	struct itimerspec it;
+	int error;
+
+	/*
+	 * Diagnose required errors first.
+	 */
+	if (CPUCLOCK_PERTHREAD(which_clock) &&
+	    (CPUCLOCK_PID(which_clock) == 0 ||
+	     CPUCLOCK_PID(which_clock) == current->pid))
+		return -EINVAL;
+
+	error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
+
+	if (error == -ERESTART_RESTARTBLOCK) {
+
+	       	if (flags & TIMER_ABSTIME)
+			return -ERESTARTNOHAND;
 		/*
-		 * Report back to the user the time still remaining.
-		 */
-		if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
-		    copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+	 	 * Report back to the user the time still remaining.
+	 	 */
+		if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
 			return -EFAULT;
 
 		restart_block->fn = posix_cpu_nsleep_restart;
-		/* Caller already set restart_block->arg1 */
 		restart_block->arg0 = which_clock;
 		restart_block->arg1 = (unsigned long) rmtp;
 		restart_block->arg2 = rqtp->tv_sec;
 		restart_block->arg3 = rqtp->tv_nsec;
-
-		error = -ERESTART_RESTARTBLOCK;
 	}
-
 	return error;
 }
 
@@ -1487,13 +1501,31 @@ long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 	clockid_t which_clock = restart_block->arg0;
 	struct timespec __user *rmtp;
 	struct timespec t;
+	struct itimerspec it;
+	int error;
 
 	rmtp = (struct timespec __user *) restart_block->arg1;
 	t.tv_sec = restart_block->arg2;
 	t.tv_nsec = restart_block->arg3;
 
 	restart_block->fn = do_no_restart_syscall;
-	return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp);
+	error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
+
+	if (error == -ERESTART_RESTARTBLOCK) {
+		/*
+	 	 * Report back to the user the time still remaining.
+	 	 */
+		if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+			return -EFAULT;
+
+		restart_block->fn = posix_cpu_nsleep_restart;
+		restart_block->arg0 = which_clock;
+		restart_block->arg1 = (unsigned long) rmtp;
+		restart_block->arg2 = t.tv_sec;
+		restart_block->arg3 = t.tv_nsec;
+	}
+	return error;
+
 }
 
 
-- 
cgit v1.2.3


From b9ecb2bd5d3ab8904752685696cb76aac1f3fef2 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 29 Sep 2006 02:00:31 -0700
Subject: [PATCH] has_stopped_jobs() cleanup

This check has been obsolete since the introduction of TASK_TRACED.  Now
TASK_STOPPED always means job control stop.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 9961192d6055..3ec7b10eae38 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -249,17 +249,6 @@ static int has_stopped_jobs(int pgrp)
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
 		if (p->state != TASK_STOPPED)
 			continue;
-
-		/* If p is stopped by a debugger on a signal that won't
-		   stop it, then don't count p as stopped.  This isn't
-		   perfect but it's a good approximation.  */
-		if (unlikely (p->ptrace)
-		    && p->exit_code != SIGSTOP
-		    && p->exit_code != SIGTSTP
-		    && p->exit_code != SIGTTOU
-		    && p->exit_code != SIGTTIN)
-			continue;
-
 		retval = 1;
 		break;
 	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
-- 
cgit v1.2.3


From 27d91e07f9b863fa94491ffafe250580f0c2ce78 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 29 Sep 2006 02:00:31 -0700
Subject: [PATCH] __dequeue_signal() cleanup

This tightens up __dequeue_signal a little.  It also avoids doing
recalc_sigpending twice in a row, instead doing it once in dequeue_signal.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 05853a7337e3..fb5da6d19f14 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -417,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 			siginfo_t *info)
 {
-	int sig = 0;
+	int sig = next_signal(pending, mask);
 
-	sig = next_signal(pending, mask);
 	if (sig) {
 		if (current->notifier) {
 			if (sigismember(current->notifier_mask, sig)) {
@@ -432,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 
 		if (!collect_signal(sig, pending, info))
 			sig = 0;
-				
 	}
-	recalc_sigpending();
 
 	return sig;
 }
@@ -451,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 	if (!signr)
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
 					 mask, info);
+	recalc_sigpending_tsk(tsk);
  	if (signr && unlikely(sig_kernel_stop(signr))) {
  		/*
  		 * Set a marker that we have dequeued a stop signal.  Our
-- 
cgit v1.2.3


From 3171a0305d62e6627a24bff35af4f997e4988a80 Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Fri, 29 Sep 2006 02:00:32 -0700
Subject: [PATCH] simplify update_times (avoid jiffies/jiffies_64 aliasing
 problem)

Pass ticks to do_timer() and update_times(), and adjust x86_64 and s390
timer interrupt handler with this change.

Currently update_times() calculates ticks by "jiffies - wall_jiffies", but
callers of do_timer() should know how many ticks to update.  Passing ticks
get rid of this redundant calculation.  Also there are another redundancy
pointed out by Martin Schwidefsky.

This cleanup make a barrier added by
5aee405c662ca644980c184774277fc6d0769a84 needless.  So this patch removes
it.

As a bonus, this cleanup make wall_jiffies can be removed easily, since now
wall_jiffies is always synced with jiffies.  (This patch does not really
remove wall_jiffies.  It would be another cleanup patch)

Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Acked-by: Russell King <rmk@arm.linux.org.uk>
Cc: Ian Molton <spyro@f2s.com>
Cc: Mikael Starvik <starvik@axis.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Hirokazu Takata <takata.hirokazu@renesas.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
Cc: Richard Curnow <rc@rc0.org.uk>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Cc: Chris Zankel <chris@zankel.net>
Acked-by: "Luck, Tony" <tony.luck@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index a2cb1ecb1b28..4f55622b0d38 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1222,10 +1222,8 @@ static inline void calc_load(unsigned long ticks)
 	unsigned long active_tasks; /* fixed-point */
 	static int count = LOAD_FREQ;
 
-	count -= ticks;
-	if (count < 0) {
-		count += LOAD_FREQ;
-		active_tasks = count_active_tasks();
+	active_tasks = count_active_tasks();
+	for (count -= ticks; count < 0; count += LOAD_FREQ) {
 		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
 		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
 		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
@@ -1270,11 +1268,8 @@ void run_local_timers(void)
  * Called by the timer interrupt. xtime_lock must already be taken
  * by the timer IRQ!
  */
-static inline void update_times(void)
+static inline void update_times(unsigned long ticks)
 {
-	unsigned long ticks;
-
-	ticks = jiffies - wall_jiffies;
 	wall_jiffies += ticks;
 	update_wall_time();
 	calc_load(ticks);
@@ -1286,12 +1281,10 @@ static inline void update_times(void)
  * jiffies is defined in the linker script...
  */
 
-void do_timer(struct pt_regs *regs)
+void do_timer(unsigned long ticks)
 {
-	jiffies_64++;
-	/* prevent loading jiffies before storing new jiffies_64 value. */
-	barrier();
-	update_times();
+	jiffies_64 += ticks;
+	update_times(ticks);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
-- 
cgit v1.2.3


From 0b4a8a789a328af6aac613734c362cf6aad72201 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 29 Sep 2006 02:00:39 -0700
Subject: [PATCH] kexec warning fix

This fixes a couple of compiler warnings, and adds paranoia checks as well.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kexec.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 37cad75cf494..fcdd5d2bc3f4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 	image = xchg(dest_image, image);
 
 out:
-	xchg(&kexec_lock, 0); /* Release the mutex */
+	locked = xchg(&kexec_lock, 0); /* Release the mutex */
+	BUG_ON(!locked);
 	kimage_free(image);
 
 	return result;
@@ -1061,7 +1062,8 @@ void crash_kexec(struct pt_regs *regs)
 			machine_crash_shutdown(&fixed_regs);
 			machine_kexec(kexec_crash_image);
 		}
-		xchg(&kexec_lock, 0);
+		locked = xchg(&kexec_lock, 0);
+		BUG_ON(!locked);
 	}
 }
 
-- 
cgit v1.2.3


From 54306cf04c0ea0a8c432603dbc657ab62a438668 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Fri, 29 Sep 2006 02:00:42 -0700
Subject: [PATCH] exit: fix crash case

If we are going to BUG() not panic() here then we should cover the case of
the BUG being compiled out

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 3ec7b10eae38..a51b347ae67b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -963,7 +963,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
-	for (;;) ;
+	for (;;)
+		cpu_relax();	/* For when BUG is null */
 }
 
 EXPORT_SYMBOL_GPL(do_exit);
-- 
cgit v1.2.3


From 111dbe0c8a21dffa473239861be47ebc87f593b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Steinbrink?= <B.Steinbrink@gmx.de>
Date: Fri, 29 Sep 2006 02:00:46 -0700
Subject: [PATCH] Fix ____call_usermodehelper errors being silently ignored
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If ____call_usermodehelper fails, we're not interested in the child
process' exit value, but the real error, so let's stop wait_for_helper from
overwriting it in that case.

Issue discovered by Benedikt Böhm while working on a Linux-VServer usermode
helper.

Signed-off-by: Björn Steinbrink <B.Steinbrink@gmx.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kmod.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5c470c57fb57..842f8015d7fd 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -176,6 +176,8 @@ static int wait_for_helper(void *data)
 	if (pid < 0) {
 		sub_info->retval = pid;
 	} else {
+		int ret;
+
 		/*
 		 * Normally it is bogus to call wait4() from in-kernel because
 		 * wait4() wants to write the exit code to a userspace address.
@@ -185,7 +187,15 @@ static int wait_for_helper(void *data)
 		 *
 		 * Thus the __user pointer cast is valid here.
 		 */
-		sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
+		sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+		/*
+		 * If ret is 0, either ____call_usermodehelper failed and the
+		 * real error code is already in sub_info->retval or
+		 * sub_info->retval is 0 anyway, so don't mess with it then.
+		 */
+		if (ret)
+			sub_info->retval = ret;
 	}
 
 	complete(sub_info->complete);
-- 
cgit v1.2.3


From c9472e0f28cd2f0695a0ac3a0b4bd33f21714a7e Mon Sep 17 00:00:00 2001
From: Cal Peake <cp@absolutedigital.net>
Date: Fri, 29 Sep 2006 02:00:47 -0700
Subject: [PATCH] kill extraneous printk in kernel_restart()

Get rid of an extraneous printk in kernel_restart().

Signed-off-by: Cal Peake <cp@absolutedigital.net>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 3f894775488d..8647061c084a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -612,7 +612,6 @@ void kernel_restart(char *cmd)
 	} else {
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
 	}
-	printk(".\n");
 	machine_restart(cmd);
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
-- 
cgit v1.2.3


From 5fe1d75f34974046fffcca5e22fb8a7b42fded33 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:48 -0700
Subject: [PATCH] do_sched_setscheduler(): don't take tasklist_lock

Use rcu locks instead. sched_setscheduler() now takes ->siglock
before reading ->signal->rlim[].

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b946209d9c15..f9b3c6a414f1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4080,6 +4080,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
  * @p: the task in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
+ *
+ * NOTE: the task may be already dead
  */
 int sched_setscheduler(struct task_struct *p, int policy,
 		       struct sched_param *param)
@@ -4115,19 +4117,26 @@ recheck:
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
+		unsigned long rlim_rtprio;
+		unsigned long flags;
+
+		if (!lock_task_sighand(p, &flags))
+			return -ESRCH;
+		rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+		unlock_task_sighand(p, &flags);
+
 		/*
 		 * can't change policy, except between SCHED_NORMAL
 		 * and SCHED_BATCH:
 		 */
 		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
 			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
-				!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+				!rlim_rtprio)
 			return -EPERM;
 		/* can't increase priority */
 		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
 		    param->sched_priority > p->rt_priority &&
-		    param->sched_priority >
-				p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+		    param->sched_priority > rlim_rtprio)
 			return -EPERM;
 		/* can't change other user's priorities */
 		if ((current->euid != p->euid) &&
@@ -4193,14 +4202,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 		return -EINVAL;
 	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
 		return -EFAULT;
-	read_lock_irq(&tasklist_lock);
+
+	rcu_read_lock();
+	retval = -ESRCH;
 	p = find_process_by_pid(pid);
-	if (!p) {
-		read_unlock_irq(&tasklist_lock);
-		return -ESRCH;
-	}
-	retval = sched_setscheduler(p, policy, &lparam);
-	read_unlock_irq(&tasklist_lock);
+	if (p != NULL)
+		retval = sched_setscheduler(p, policy, &lparam);
+	rcu_read_unlock();
 
 	return retval;
 }
-- 
cgit v1.2.3


From 57a6f51c4281aa3119975473c70dce0480d322bd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:49 -0700
Subject: [PATCH] introduce is_rt_policy() helper

Imho, makes the code a bit easier to read.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f9b3c6a414f1..c3c718aea618 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4109,8 +4109,7 @@ recheck:
 	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
-					!= (param->sched_priority == 0))
+	if (is_rt_policy(policy) != (param->sched_priority != 0))
 		return -EINVAL;
 
 	/*
@@ -4134,7 +4133,7 @@ recheck:
 				!rlim_rtprio)
 			return -EPERM;
 		/* can't increase priority */
-		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
+		if (is_rt_policy(policy) &&
 		    param->sched_priority > p->rt_priority &&
 		    param->sched_priority > rlim_rtprio)
 			return -EPERM;
-- 
cgit v1.2.3


From 8dc3e9099e01dfdd53f27f329c268eced03dfef6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:50 -0700
Subject: [PATCH] sched_setscheduler: fix? policy checks

I am not sure this patch is correct: I can't understand what the current
code does, and I don't know what it was supposed to do.

The comment says:

		 * can't change policy, except between SCHED_NORMAL
		 * and SCHED_BATCH:

The code:

		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&

But this is equivalent to:

		if ( (is_rt_policy(policy) && has_rt_policy(p)) &&

which means something different.  We can't _decrease_ the current
->rt_priority with such a check (if rlim[RLIMIT_RTPRIO] == 0).

Probably, it was supposed to be:

		if (	!(policy == SCHED_NORMAL && p->policy == SCHED_BATCH)  &&
			!(policy == SCHED_BATCH  && p->policy == SCHED_NORMAL)

this matches the comment, but strange: it doesn't allow to _drop_ the
realtime priority when rlim[RLIMIT_RTPRIO] == 0.

I think the right check would be:

		/* can't set/change rt policy */
		if (is_rt_policy(policy) &&
				policy != p->policy &&
				!rlim_rtprio)
			return -EPERM;

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c3c718aea618..155a33da7aa7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4116,27 +4116,25 @@ recheck:
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
-		unsigned long rlim_rtprio;
-		unsigned long flags;
-
-		if (!lock_task_sighand(p, &flags))
-			return -ESRCH;
-		rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
-		unlock_task_sighand(p, &flags);
+		if (is_rt_policy(policy)) {
+			unsigned long rlim_rtprio;
+			unsigned long flags;
+
+			if (!lock_task_sighand(p, &flags))
+				return -ESRCH;
+			rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+			unlock_task_sighand(p, &flags);
+
+			/* can't set/change the rt policy */
+			if (policy != p->policy && !rlim_rtprio)
+				return -EPERM;
+
+			/* can't increase priority */
+			if (param->sched_priority > p->rt_priority &&
+			    param->sched_priority > rlim_rtprio)
+				return -EPERM;
+		}
 
-		/*
-		 * can't change policy, except between SCHED_NORMAL
-		 * and SCHED_BATCH:
-		 */
-		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
-			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
-				!rlim_rtprio)
-			return -EPERM;
-		/* can't increase priority */
-		if (is_rt_policy(policy) &&
-		    param->sched_priority > p->rt_priority &&
-		    param->sched_priority > rlim_rtprio)
-			return -EPERM;
 		/* can't change other user's priorities */
 		if ((current->euid != p->euid) &&
 		    (current->euid != p->uid))
-- 
cgit v1.2.3


From 1c573afebc6213e4372e0d8352034c23d5262e1f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:51 -0700
Subject: [PATCH] reparent_to_init(): use has_rt_policy()

Remove open-coded has_rt_policy(), no changes in kernel/exit.o

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index a51b347ae67b..4a280856acd2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -281,9 +281,7 @@ static void reparent_to_init(void)
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
 
-	if ((current->policy == SCHED_NORMAL ||
-			current->policy == SCHED_BATCH)
-				&& (task_nice(current) < 0))
+	if (!has_rt_policy(current) && (task_nice(current) < 0))
 		set_user_nice(current, 0);
 	/* cpus_allowed? */
 	/* rt_priority? */
-- 
cgit v1.2.3


From 5b160f5ecd2f1b6df2e0015dc1f319c8ef803d62 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:52 -0700
Subject: [PATCH] copy_process: cosmetic ->ioprio tweak

copy_process:
// holds tasklist_lock + ->siglock
       /*
        * inherit ioprio
        */
       p->ioprio = current->ioprio;

Why?  ->ioprio was already copied in dup_task_struct().  I guess this is
needed to ensure that the child can't escape
sys_ioprio_set(IOPRIO_WHO_{PGRP,USER}), yes?

In that case we don't need ->siglock held, and the comment should be
updated.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index bca6ce6d3ded..1c999f3e0b47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1150,7 +1150,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
-	   
 	p->parent_exec_id = p->self_exec_id;
 
 	/* ok, now we should be set up.. */
@@ -1173,6 +1172,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
+	/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
+	p->ioprio = current->ioprio;
+
 	/*
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.
@@ -1232,11 +1234,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		}
 	}
 
-	/*
-	 * inherit ioprio
-	 */
-	p->ioprio = current->ioprio;
-
 	if (likely(p->pid)) {
 		add_parent(p);
 		if (unlikely(p->ptrace & PT_PTRACED))
-- 
cgit v1.2.3


From d359b549bf3d7f42f0084918a4816ea4572e507c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:55 -0700
Subject: [PATCH] futex_find_get_task(): don't take tasklist_lock

It is ok to do find_task_by_pid() + get_task_struct() under
rcu_read_lock(), we cand drop tasklist_lock.

Note that testing of ->exit_state is racy with or without tasklist anyway.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 9d260e838cff..ca8ef11feb65 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -389,7 +389,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 {
 	struct task_struct *p;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	p = find_task_by_pid(pid);
 	if (!p)
 		goto out_unlock;
@@ -403,7 +403,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 	}
 	get_task_struct(p);
 out_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return p;
 }
-- 
cgit v1.2.3


From aaa2a97eb9c0e91d7abc66bf76811a9599fdb3ee Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:55 -0700
Subject: [PATCH] sys_get_robust_list(): don't take tasklist_lock

use rcu locks for find_task_by_pid().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index ca8ef11feb65..4b6770e9806d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1624,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 		struct task_struct *p;
 
 		ret = -ESRCH;
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 		p = find_task_by_pid(pid);
 		if (!p)
 			goto err_unlock;
@@ -1633,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 				!capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->robust_list;
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 	}
 
 	if (put_user(sizeof(*head), len_ptr))
@@ -1641,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 	return put_user(head, head_ptr);
 
 err_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return ret;
 }
-- 
cgit v1.2.3


From 29b884921634e1e01cbd276e1c9b8fc07a7e4a90 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:09 -0700
Subject: [PATCH] set EXIT_DEAD state in do_exit(), not in schedule()

schedule() checks PF_DEAD on every context switch and sets ->state = EXIT_DEAD
to ensure that the exiting task will be deactivated.  Note that this EXIT_DEAD
is in fact a "random" value, we can use any bit except normal TASK_XXX values.

It is better to set this state in do_exit() along with PF_DEAD flag and remove
that check in schedule().

We are safe wrt concurrent try_to_wake_up() (for example ptrace, tkill), it
can not change task's ->state: the 'state' argument of try_to_wake_up() can't
have EXIT_DEAD bit.  And in case when try_to_wake_up() sees a stale value of
->state == TASK_RUNNING it will do nothing.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c  | 1 +
 kernel/sched.c | 3 ---
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 4a280856acd2..3d759c98fb11 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -957,6 +957,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	preempt_disable();
 	BUG_ON(tsk->flags & PF_DEAD);
 	tsk->flags |= PF_DEAD;
+	tsk->state = EXIT_DEAD;
 
 	schedule();
 	BUG();
diff --git a/kernel/sched.c b/kernel/sched.c
index 155a33da7aa7..e1646b044b69 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3348,9 +3348,6 @@ need_resched_nonpreemptible:
 
 	spin_lock_irq(&rq->lock);
 
-	if (unlikely(prev->flags & PF_DEAD))
-		prev->state = EXIT_DEAD;
-
 	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		switch_count = &prev->nvcsw;
-- 
cgit v1.2.3


From 55a101f8f71a3d3dbda7b5c77083ffe47552f831 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:10 -0700
Subject: [PATCH] kill PF_DEAD flag

After the previous change (->flags & PF_DEAD) <=> (->state == EXIT_DEAD), we
don't need PF_DEAD any longer.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c  |  6 ++----
 kernel/sched.c | 16 ++++++++--------
 2 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 3d759c98fb11..9dd5f1336da2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -953,10 +953,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
-	/* PF_DEAD causes final put_task_struct after we schedule. */
 	preempt_disable();
-	BUG_ON(tsk->flags & PF_DEAD);
-	tsk->flags |= PF_DEAD;
+	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = EXIT_DEAD;
 
 	schedule();
@@ -972,7 +970,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
 {
 	if (comp)
 		complete(comp);
-	
+
 	do_exit(code);
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index e1646b044b69..a9405d7cc6ab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1755,27 +1755,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct mm_struct *mm = rq->prev_mm;
-	unsigned long prev_task_flags;
+	long prev_state;
 
 	rq->prev_mm = NULL;
 
 	/*
 	 * A task struct has one reference for the use as "current".
-	 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
-	 * calls schedule one last time. The schedule call will never return,
-	 * and the scheduled task must drop that reference.
-	 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+	 * If a task dies, then it sets EXIT_DEAD in tsk->state and calls
+	 * schedule one last time. The schedule call will never return, and
+	 * the scheduled task must drop that reference.
+	 * The test for EXIT_DEAD must occur while the runqueue locks are
 	 * still held, otherwise prev could be scheduled on another cpu, die
 	 * there before we look at prev->state, and then the reference would
 	 * be dropped twice.
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
-	prev_task_flags = prev->flags;
+	prev_state = prev->state;
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
-	if (unlikely(prev_task_flags & PF_DEAD)) {
+	if (unlikely(prev_state == EXIT_DEAD)) {
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
@@ -5153,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 	BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
 
 	/* Cannot have done final schedule yet: would have vanished. */
-	BUG_ON(p->flags & PF_DEAD);
+	BUG_ON(p->state == EXIT_DEAD);
 
 	get_task_struct(p);
 
-- 
cgit v1.2.3


From c394cc9fbb367f87faa2228ec2eabacd2d4701c6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:11 -0700
Subject: [PATCH] introduce TASK_DEAD state

I am not sure about this patch, I am asking Ingo to take a decision.

task_struct->state == EXIT_DEAD is a very special case, to avoid a confusion
it makes sense to introduce a new state, TASK_DEAD, while EXIT_DEAD should
live only in ->exit_state as documented in sched.h.

Note that this state is not visible to user-space, get_task_state() masks off
unsuitable states.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c  | 2 +-
 kernel/sched.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 9dd5f1336da2..2e4c13cba95a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -955,7 +955,7 @@ fastcall NORET_TYPE void do_exit(long code)
 
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
-	tsk->state = EXIT_DEAD;
+	tsk->state = TASK_DEAD;
 
 	schedule();
 	BUG();
diff --git a/kernel/sched.c b/kernel/sched.c
index a9405d7cc6ab..74f169ac0773 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1761,10 +1761,10 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 
 	/*
 	 * A task struct has one reference for the use as "current".
-	 * If a task dies, then it sets EXIT_DEAD in tsk->state and calls
+	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
 	 * schedule one last time. The schedule call will never return, and
 	 * the scheduled task must drop that reference.
-	 * The test for EXIT_DEAD must occur while the runqueue locks are
+	 * The test for TASK_DEAD must occur while the runqueue locks are
 	 * still held, otherwise prev could be scheduled on another cpu, die
 	 * there before we look at prev->state, and then the reference would
 	 * be dropped twice.
@@ -1775,7 +1775,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_lock_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
-	if (unlikely(prev_state == EXIT_DEAD)) {
+	if (unlikely(prev_state == TASK_DEAD)) {
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
@@ -5153,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 	BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
 
 	/* Cannot have done final schedule yet: would have vanished. */
-	BUG_ON(p->state == EXIT_DEAD);
+	BUG_ON(p->state == TASK_DEAD);
 
 	get_task_struct(p);
 
-- 
cgit v1.2.3


From 38837fc75acb7fa9b0e111b0241fe4fe76c5d4b3 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 29 Sep 2006 02:01:16 -0700
Subject: [PATCH] cpuset: top_cpuset tracks hotplug changes to node_online_map

Change the list of memory nodes allowed to tasks in the top (root) nodeset
to dynamically track what cpus are online, using a call to a cpuset hook
from the memory hotplug code.  Make this top cpus file read-only.

On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.

If that system does support memory hotplug, then these tasks cannot make
use of memory nodes that are added after system boot, because the memory
nodes are not allowed in the top cpuset.  This is a surprising regression
over earlier kernels that didn't have cpusets enabled.

One key motivation for this change is to remain consistent with the
behaviour for the top_cpuset's 'cpus', which is also read-only, and which
automatically tracks the cpu_online_map.

This change also has the minor benefit that it fixes a long standing,
little noticed, minor bug in cpusets.  The cpuset performance tweak to
short circuit the cpuset_zone_allowed() check on systems with just a single
cpuset (see 'number_of_cpusets', in linux/cpuset.h) meant that simply
changing the 'mems' of the top_cpuset had no affect, even though the change
(the write system call) appeared to succeed.  With the following change,
that write to the 'mems' file fails -EACCES, and the 'mems' file stubbornly
refuses to be changed via user space writes.  Thus no one should be mislead
into thinking they've changed the top_cpusets's 'mems' when in affect they
haven't.

In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'mems' file in the top (root) cpuset, making it read
only, and making it automatically track the value of node_online_map.  Thus
tasks in the top cpuset will have automatic use of hot plugged memory nodes
allowed by their cpuset.

[akpm@osdl.org: build fix]
[bunk@stusta.de: build fix]
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 584bb4e6c042..794af5024c2f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -912,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	int fudge;
 	int retval;
 
+	/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = nodelist_parse(buf, trialcs.mems_allowed);
 	if (retval < 0)
@@ -2042,9 +2046,8 @@ out:
  * (of no affect) on systems that are actively using CPU hotplug
  * but making no active use of cpusets.
  *
- * This handles CPU hotplug (cpuhp) events.  If someday Memory
- * Nodes can be hotplugged (dynamically changing node_online_map)
- * then we should handle that too, perhaps in a similar way.
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
  */
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -2063,6 +2066,25 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
 }
 #endif
 
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void cpuset_track_online_nodes()
+{
+	mutex_lock(&manage_mutex);
+	mutex_lock(&callback_mutex);
+
+	top_cpuset.mems_allowed = node_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
-- 
cgit v1.2.3


From b1aac8bb824c658ddebd296b088a8bff5029c288 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 29 Sep 2006 02:01:17 -0700
Subject: [PATCH] cpuset: hotunplug cpus and mems in all cpusets

The cpuset code handling hot unplug of CPUs or Memory Nodes was incorrect -
it could remove a CPU or Node from the top cpuset, while leaving it still
in some child cpusets.

One basic rule of cpusets is that each cpusets cpus and mems are subsets of
its parents.  The cpuset hot unplug code violated this rule.

So the cpuset hotunplug handler must walk down the tree, removing any
removed CPU or Node from all cpusets.

However, it is not allowed to make a cpusets cpus or mems become empty.
They can only transition from empty to non-empty, not back.

So if the last CPU or Node would be removed from a cpuset by the above
walk, we scan back up the cpuset hierarchy, finding the nearest ancestor
that still has something online, and copy its CPU or Memory placement.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Nathan Lynch <ntl@pobox.com>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 70 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 794af5024c2f..cc0395d7eba1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2040,6 +2040,73 @@ out:
 	return err;
 }
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets.  If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpusets
+ * parent online CPUs or nodes.  Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes.  It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
+ */
+
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+	struct cpuset *c;
+
+	/* Each of our child cpusets mems must be online */
+	list_for_each_entry(c, &cur->children, sibling) {
+		guarantee_online_cpus_mems_in_subtree(c);
+		if (!cpus_empty(c->cpus_allowed))
+			guarantee_online_cpus(c, &c->cpus_allowed);
+		if (!nodes_empty(c->mems_allowed))
+			guarantee_online_mems(c, &c->mems_allowed);
+	}
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map.  Force the top cpuset to track
+ * whats online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here.  We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
+{
+	mutex_lock(&manage_mutex);
+	mutex_lock(&callback_mutex);
+
+	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
+	top_cpuset.cpus_allowed = cpu_online_map;
+	top_cpuset.mems_allowed = node_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
 /*
  * The top_cpuset tracks what CPUs and Memory Nodes are online,
  * period.  This is necessary in order to make cpusets transparent
@@ -2050,38 +2117,24 @@ out:
  * cpu_online_map on each CPU hotplug (cpuhp) event.
  */
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int cpuset_handle_cpuhp(struct notifier_block *nb,
 				unsigned long phase, void *cpu)
 {
-	mutex_lock(&manage_mutex);
-	mutex_lock(&callback_mutex);
-
-	top_cpuset.cpus_allowed = cpu_online_map;
-
-	mutex_unlock(&callback_mutex);
-	mutex_unlock(&manage_mutex);
-
+	common_cpu_mem_hotplug_unplug();
 	return 0;
 }
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Keep top_cpuset.mems_allowed tracking node_online_map.
  * Call this routine anytime after you change node_online_map.
  * See also the previous routine cpuset_handle_cpuhp().
  */
 
-#ifdef CONFIG_MEMORY_HOTPLUG
 void cpuset_track_online_nodes()
 {
-	mutex_lock(&manage_mutex);
-	mutex_lock(&callback_mutex);
-
-	top_cpuset.mems_allowed = node_online_map;
-
-	mutex_unlock(&callback_mutex);
-	mutex_unlock(&manage_mutex);
+	common_cpu_mem_hotplug_unplug();
 }
 #endif
 
-- 
cgit v1.2.3


From 04b1db9fd7eea63c9663072feece616ea41b0a79 Mon Sep 17 00:00:00 2001
From: "Ian S. Nelson" <nelsonian@comcast.net>
Date: Fri, 29 Sep 2006 02:01:31 -0700
Subject: [PATCH] /sys/modules: allow full length section names

I've been using systemtap for some debugging and I noticed that it can't
probe a lot of modules.  Turns out it's kind of silly, the sections section
of /sys/module is limited to 32byte filenames and many of the actual
sections are a a bit longer than that.

[akpm@osdl.org: rewrite to use dymanic allocation]
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index b7fe6e840963..05625d5dc758 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -933,6 +933,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
 	return sprintf(buf, "0x%lx\n", sattr->address);
 }
 
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+	int section;
+
+	for (section = 0; section < sect_attrs->nsections; section++)
+		kfree(sect_attrs->attrs[section].name);
+	kfree(sect_attrs);
+}
+
 static void add_sect_attrs(struct module *mod, unsigned int nsect,
 		char *secstrings, Elf_Shdr *sechdrs)
 {
@@ -949,21 +958,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 			+ nloaded * sizeof(sect_attrs->attrs[0]),
 			sizeof(sect_attrs->grp.attrs[0]));
 	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
-	if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
+	sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+	if (sect_attrs == NULL)
 		return;
 
 	/* Setup section attributes. */
 	sect_attrs->grp.name = "sections";
 	sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
 
+	sect_attrs->nsections = 0;
 	sattr = &sect_attrs->attrs[0];
 	gattr = &sect_attrs->grp.attrs[0];
 	for (i = 0; i < nsect; i++) {
 		if (! (sechdrs[i].sh_flags & SHF_ALLOC))
 			continue;
 		sattr->address = sechdrs[i].sh_addr;
-		strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
-			MODULE_SECT_NAME_LEN);
+		sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+					GFP_KERNEL);
+		if (sattr->name == NULL)
+			goto out;
+		sect_attrs->nsections++;
 		sattr->mattr.show = module_sect_show;
 		sattr->mattr.store = NULL;
 		sattr->mattr.attr.name = sattr->name;
@@ -979,7 +993,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 	mod->sect_attrs = sect_attrs;
 	return;
   out:
-	kfree(sect_attrs);
+	free_sect_attrs(sect_attrs);
 }
 
 static void remove_sect_attrs(struct module *mod)
@@ -989,13 +1003,13 @@ static void remove_sect_attrs(struct module *mod)
 				   &mod->sect_attrs->grp);
 		/* We are positive that no one is using any sect attrs
 		 * at this point.  Deallocate immediately. */
-		kfree(mod->sect_attrs);
+		free_sect_attrs(mod->sect_attrs);
 		mod->sect_attrs = NULL;
 	}
 }
 
-
 #else
+
 static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
 		char *sectstrings, Elf_Shdr *sechdrs)
 {
-- 
cgit v1.2.3


From e5582ca21af82929d5cd3613321ac9233c492ebc Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 29 Sep 2006 02:01:35 -0700
Subject: [PATCH] stop_machine.c copyright

I had to look back: this code was extracted from the module.c code in 2005.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/stop_machine.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 51cacd111dbd..12458040e665 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,6 @@
+/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
+ * GPL v2 and any later version.
+ */
 #include <linux/stop_machine.h>
 #include <linux/kthread.h>
 #include <linux/sched.h>
-- 
cgit v1.2.3


From eb84a20e9e6b98dcb33023ad22241d79107a08a7 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Fri, 29 Sep 2006 02:01:41 -0700
Subject: [PATCH] audit/accounting: tty locking

Add tty locking around the audit and accounting code.

The whole current->signal-> locking is all deeply strange but it's for
someone else to sort out.  Add rather than replace the lock for acct.c

Signed-off-by: Alan Cox <alan@redhat.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c    | 6 +++++-
 kernel/auditsc.c | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 2a7c933651c7..f4330acead46 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -483,10 +483,14 @@ static void do_acct_process(struct file *file)
 	ac.ac_ppid = current->parent->tgid;
 #endif
 
-	read_lock(&tasklist_lock);	/* pin current->signal */
+	mutex_lock(&tty_mutex);
+	/* FIXME: Whoever is responsible for current->signal locking needs
+	   to use the same locking all over the kernel and document it */
+	read_lock(&tasklist_lock);
 	ac.ac_tty = current->signal->tty ?
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
+	mutex_unlock(&tty_mutex);
 
 	spin_lock_irq(&current->sighand->siglock);
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fb83c5cb8c32..105147631753 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -817,6 +817,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		audit_log_format(ab, " success=%s exit=%ld", 
 				 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
 				 context->return_code);
+
+	mutex_lock(&tty_mutex);
 	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
 		tty = tsk->signal->tty->name;
 	else
@@ -838,6 +840,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		  context->gid,
 		  context->euid, context->suid, context->fsuid,
 		  context->egid, context->sgid, context->fsgid, tty);
+
+	mutex_unlock(&tty_mutex);
+
 	audit_log_task_info(ab, tsk);
 	if (context->filterkey) {
 		audit_log_format(ab, " key=");
-- 
cgit v1.2.3


From 03cbc358aab3faf34bfeaa02206fa660127e9b3f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 Sep 2006 02:01:46 -0700
Subject: [PATCH] lockdep core: improve the lock-chain-hash

With CONFIG_DEBUG_LOCK_ALLOC turned off i was getting sporadic failures in
the locking self-test:

  ------------>
  | Locking API testsuite:
  ----------------------------------------------------------------------------
                                   | spin |wlock |rlock |mutex | wsem | rsem |
    --------------------------------------------------------------------------
                       A-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
                   A-B-B-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
               A-B-B-C-C-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
               A-B-C-A-B-C deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
           A-B-B-C-C-D-D-A deadlock:  ok  |FAILED|  ok  |  ok  |  ok  |  ok  |
           A-B-C-D-B-D-D-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
           A-B-C-D-B-C-D-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |FAILED|

after much debugging it turned out to be caused by accidental chain-hash
key collisions.  The current hash is:

 #define iterate_chain_key(key1, key2) \
	(((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \
	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \
 	(key2))

where MAX_LOCKDEP_KEYS_BITS is 11.  This hash is pretty good as it will
shift by 5 bits in every iteration, where every new ID 'mixed' into the
hash would have up to 11 bits.  But because there was a 6 bits overlap
between subsequent IDs and their high bits tended to be similar, there was
a chance for accidental chain-hash collision for a low number of locks
held.

the solution is to shift by 11 bits:

 #define iterate_chain_key(key1, key2) \
	(((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
 	(key2))

This keeps the hash perfect up to 5 locks held, but even above that the
hash is still good because 11 bits is a relative prime to the total 64
bits, so a complete match will only occur after 64 held locks (which doesnt
happen in Linux).  Even after 5 locks held, entropy of the 5 IDs mixed into
the hash is already good enough so that overlap doesnt generate a colliding
hash ID.

with this change the false positives went away.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index df1c3594de31..e596525669ed 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -122,8 +122,8 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
  * unique.
  */
 #define iterate_chain_key(key1, key2) \
-	(((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \
-	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \
+	(((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
+	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
 	(key2))
 
 void lockdep_off(void)
-- 
cgit v1.2.3


From 181b64803661209cda64e5e874ad75f373a69de8 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 29 Sep 2006 02:01:48 -0700
Subject: [PATCH] cpuset: fix obscure attach_task vs exiting race

Fix obscure race condition in kernel/cpuset.c attach_task() code.

There is basically zero chance of anyone accidentally being harmed by this
race.

It requires a special 'micro-stress' load and a special timing loop hacks
in the kernel to hit in less than an hour, and even then you'd have to hit
it hundreds or thousands of times, followed by some unusual and senseless
cpuset configuration requests, including removing the top cpuset, to cause
any visibly harm affects.

One could, with perhaps a few days or weeks of such effort, get the
reference count on the top cpuset below zero, and manage to crash the
kernel by asking to remove the top cpuset.

I found it by code inspection.

The race was introduced when 'the_top_cpuset_hack' was introduced, and one
piece of code was not updated.  An old check for a possibly null task
cpuset pointer needed to be changed to a check for a task marked
PF_EXITING.  The pointer can't be null anymore, thanks to
the_top_cpuset_hack (documented in kernel/cpuset.c).  But the task could
have gone into PF_EXITING state after it was found in the task_list scan.

If a task is PF_EXITING in this code, it is possible that its task->cpuset
pointer is pointing to the top cpuset due to the_top_cpuset_hack, rather
than because the top_cpuset was that tasks last valid cpuset.  In that
case, the wrong cpuset reference counter would be decremented.

The fix is trivial.  Instead of failing the system call if the tasks cpuset
pointer is null here, fail it if the task is in PF_EXITING state.

The code for 'the_top_cpuset_hack' that changes an exiting tasks cpuset to
the top_cpuset is done without locking, so could happen at anytime.  But it
is done during the exit handling, after the PF_EXITING flag is set.  So if
we verify that a task is still not PF_EXITING after we copy out its cpuset
pointer (into 'oldcs', below), we know that 'oldcs' is not one of these
hack references to the top_cpuset.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cc0395d7eba1..8c3c400cce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1225,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
-	if (!oldcs) {
+	/*
+	 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+	 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+	 * then fail this attach_task(), to avoid breaking top_cpuset.count.
+	 */
+	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		mutex_unlock(&callback_mutex);
 		put_task_struct(tsk);
-- 
cgit v1.2.3


From 29cbc78b90a73ad80f2f58ba2927956cf663abed Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 30 Sep 2006 01:47:55 +0200
Subject: [PATCH] x86: Clean up x86 NMI sysctls

Use prototypes in headers
Don't define panic_on_unrecovered_nmi for all architectures

Cc: dzickus@redhat.com

Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/panic.c  |  1 -
 kernel/sysctl.c | 11 ++++-------
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 6ceb664fb52a..525e365f7239 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,7 +21,6 @@
 #include <linux/debug_locks.h>
 
 int panic_on_oops;
-int panic_on_unrecovered_nmi;
 int tainted;
 static int pause_on_oops;
 static int pause_on_oops_flag;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9535a3839930..c57c4532e296 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -52,6 +52,10 @@
 extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
                      void __user *buffer, size_t *lenp, loff_t *ppos);
 
+#ifdef CONFIG_X86
+#include <asm/nmi.h>
+#endif
+
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
@@ -74,13 +78,6 @@ extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
 
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
-			void __user *, size_t *, loff_t *);
-#endif
-
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
-- 
cgit v1.2.3


From 34596dc9e59d7bece16fe5aba08116b49465da26 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 30 Sep 2006 01:47:55 +0200
Subject: [PATCH] Define vsyscall cache as blob to make clearer that user space
 shouldn't use it

Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/sys.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 8647061c084a..b88806c66244 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2083,12 +2083,12 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
 		 * padding
 		 */
 		unsigned long t0, t1;
-		get_user(t0, &cache->t0);
-		get_user(t1, &cache->t1);
+		get_user(t0, &cache->blob[0]);
+		get_user(t1, &cache->blob[1]);
 		t0++;
 		t1++;
-		put_user(t0, &cache->t0);
-		put_user(t1, &cache->t1);
+		put_user(t0, &cache->blob[0]);
+		put_user(t1, &cache->blob[1]);
 	}
 	return err ? -EFAULT : 0;
 }
-- 
cgit v1.2.3


From 0d67a46df0125e20d14f12dbd3646f1f1bf23e8c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 29 Aug 2006 19:05:56 +0100
Subject: [PATCH] BLOCK: Remove duplicate declaration of exit_io_context() [try
 #6]

Remove the duplicate declaration of exit_io_context() from linux/sched.h.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/exit.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2e4c13cba95a..c189de2927ab 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -38,6 +38,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/audit.h> /* for audit_free() */
 #include <linux/resource.h>
+#include <linux/blkdev.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
-- 
cgit v1.2.3


From 07f3f05c1e3052b8656129b2a5aca9f888241a34 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 30 Sep 2006 20:52:18 +0200
Subject: [PATCH] BLOCK: Move extern declarations out of fs/*.c into header
 files [try #6]

Create a new header file, fs/internal.h, for common definitions local to the
sources in the fs/ directory.

Move extern definitions that should be in header files from fs/*.c to
fs/internal.h or other main header files where they span directories.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/compat.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 75573e5d27b0..b4fbd838cd77 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -26,6 +26,8 @@
 
 #include <asm/uaccess.h>
 
+extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+
 int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
 {
 	return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
-- 
cgit v1.2.3


From 9361401eb7619c033e2394e4f9f6d410d6719ac7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 30 Sep 2006 20:45:40 +0200
Subject: [PATCH] BLOCK: Make it possible to disable the block layer [try #6]

Make it possible to disable the block layer.  Not all embedded devices require
it, some can make do with just JFFS2, NFS, ramfs, etc - none of which require
the block layer to be present.

This patch does the following:

 (*) Introduces CONFIG_BLOCK to disable the block layer, buffering and blockdev
     support.

 (*) Adds dependencies on CONFIG_BLOCK to any configuration item that controls
     an item that uses the block layer.  This includes:

     (*) Block I/O tracing.

     (*) Disk partition code.

     (*) All filesystems that are block based, eg: Ext3, ReiserFS, ISOFS.

     (*) The SCSI layer.  As far as I can tell, even SCSI chardevs use the
     	 block layer to do scheduling.  Some drivers that use SCSI facilities -
     	 such as USB storage - end up disabled indirectly from this.

     (*) Various block-based device drivers, such as IDE and the old CDROM
     	 drivers.

     (*) MTD blockdev handling and FTL.

     (*) JFFS - which uses set_bdev_super(), something it could avoid doing by
     	 taking a leaf out of JFFS2's book.

 (*) Makes most of the contents of linux/blkdev.h, linux/buffer_head.h and
     linux/elevator.h contingent on CONFIG_BLOCK being set.  sector_div() is,
     however, still used in places, and so is still available.

 (*) Also made contingent are the contents of linux/mpage.h, linux/genhd.h and
     parts of linux/fs.h.

 (*) Makes a number of files in fs/ contingent on CONFIG_BLOCK.

 (*) Makes mm/bounce.c (bounce buffering) contingent on CONFIG_BLOCK.

 (*) set_page_dirty() doesn't call __set_page_dirty_buffers() if CONFIG_BLOCK
     is not enabled.

 (*) fs/no-block.c is created to hold out-of-line stubs and things that are
     required when CONFIG_BLOCK is not set:

     (*) Default blockdev file operations (to give error ENODEV on opening).

 (*) Makes some /proc changes:

     (*) /proc/devices does not list any blockdevs.

     (*) /proc/diskstats and /proc/partitions are contingent on CONFIG_BLOCK.

 (*) Makes some compat ioctl handling contingent on CONFIG_BLOCK.

 (*) If CONFIG_BLOCK is not defined, makes sys_quotactl() return -ENODEV if
     given command other than Q_SYNC or if a special device is specified.

 (*) In init/do_mounts.c, no reference is made to the blockdev routines if
     CONFIG_BLOCK is not defined.  This does not prohibit NFS roots or JFFS2.

 (*) The bdflush, ioprio_set and ioprio_get syscalls can now be absent (return
     error ENOSYS by way of cond_syscall if so).

 (*) The seclvl_bd_claim() and seclvl_bd_release() security calls do nothing if
     CONFIG_BLOCK is not set, since they can't then happen.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/sys_ni.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bece67e8..7a3b2e75f040 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -134,3 +134,8 @@ cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
 cond_syscall(compat_sys_move_pages);
+
+/* block-layer dependent */
+cond_syscall(sys_bdflush);
+cond_syscall(sys_ioprio_set);
+cond_syscall(sys_ioprio_get);
-- 
cgit v1.2.3


From 5c87579e65ee4f419b2369407f82326d38b5d2d8 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 30 Sep 2006 23:27:17 -0700
Subject: [PATCH] maximum latency tracking infrastructure

Add infrastructure to track "maximum allowable latency" for power saving
policies.

The reason for adding this infrastructure is that power management in the
idle loop needs to make a tradeoff between latency and power savings
(deeper power save modes have a longer latency to running code again).  The
code that today makes this tradeoff just does a rather simple algorithm;
however this is not good enough: There are devices and use cases where a
lower latency is required than that the higher power saving states provide.
 An example would be audio playback, but another example is the ipw2100
wireless driver that right now has a very direct and ugly acpi hook to
disable some higher power states randomly when it gets certain types of
error.

The proposed solution is to have an interface where drivers can

* announce the maximum latency (in microseconds) that they can deal with
* modify this latency
* give up their constraint

and a function where the code that decides on power saving strategy can
query the current global desired maximum.

This patch has a user of each side: on the consumer side, ACPI is patched
to use this, on the producer side the ipw2100 driver is patched.

A generic maximum latency is also registered of 2 timer ticks (more and you
lose accurate time tracking after all).

While the existing users of the patch are x86 specific, the infrastructure
is not.  I'd like to ask the arch maintainers of other architectures if the
infrastructure is generic enough for their use (assuming the architecture
has such a tradeoff as concept at all), and the sound/multimedia driver
owners to look at the driver facing API to see if this is something they
can use.

[akpm@osdl.org: cleanups]
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Jesse Barnes <jesse.barnes@intel.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/Makefile  |   2 +-
 kernel/latency.c | 279 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 280 insertions(+), 1 deletion(-)
 create mode 100644 kernel/latency.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66c1af2..e210e8cf7237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o
+	    hrtimer.o rwsem.o latency.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
diff --git a/kernel/latency.c b/kernel/latency.c
new file mode 100644
index 000000000000..258f2555abbc
--- /dev/null
+++ b/kernel/latency.c
@@ -0,0 +1,279 @@
+/*
+ * latency.c: Explicit system-wide latency-expectation infrastructure
+ *
+ * The purpose of this infrastructure is to allow device drivers to set
+ * latency constraint they have and to collect and summarize these
+ * expectations globally. The cummulated result can then be used by
+ * power management and similar users to make decisions that have
+ * tradoffs with a latency component.
+ *
+ * An example user of this are the x86 C-states; each higher C state saves
+ * more power, but has a higher exit latency. For the idle loop power
+ * code to make a good decision which C-state to use, information about
+ * acceptable latencies is required.
+ *
+ * An example announcer of latency is an audio driver that knowns it
+ * will get an interrupt when the hardware has 200 usec of samples
+ * left in the DMA buffer; in that case the driver can set a latency
+ * constraint of, say, 150 usec.
+ *
+ * Multiple drivers can each announce their maximum accepted latency,
+ * to keep these appart, a string based identifier is used.
+ *
+ *
+ * (C) Copyright 2006 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/latency.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <asm/atomic.h>
+
+struct latency_info {
+	struct list_head list;
+	int usecs;
+	char *identifier;
+};
+
+/*
+ * locking rule: all modifications to current_max_latency and
+ * latency_list need to be done while holding the latency_lock.
+ * latency_lock needs to be taken _irqsave.
+ */
+static atomic_t current_max_latency;
+static DEFINE_SPINLOCK(latency_lock);
+
+static LIST_HEAD(latency_list);
+static BLOCKING_NOTIFIER_HEAD(latency_notifier);
+
+/*
+ * This function returns the maximum latency allowed, which
+ * happens to be the minimum of all maximum latencies on the
+ * list.
+ */
+static int __find_max_latency(void)
+{
+	int min = INFINITE_LATENCY;
+	struct latency_info *info;
+
+	list_for_each_entry(info, &latency_list, list) {
+		if (info->usecs < min)
+			min = info->usecs;
+	}
+	return min;
+}
+
+/**
+ * set_acceptable_latency - sets the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ * @usecs: maximum acceptable latency for this driver
+ *
+ * This function informs the kernel that this device(driver)
+ * can accept at most usecs latency. This setting is used for
+ * power management and similar tradeoffs.
+ *
+ * This function sleeps and can only be called from process
+ * context.
+ * Calling this function with an existing identifier is valid
+ * and will cause the existing latency setting to be changed.
+ */
+void set_acceptable_latency(char *identifier, int usecs)
+{
+	struct latency_info *info, *iter;
+	unsigned long flags;
+	int found_old = 0;
+
+	info = kzalloc(sizeof(struct latency_info), GFP_KERNEL);
+	if (!info)
+		return;
+	info->usecs = usecs;
+	info->identifier = kstrdup(identifier, GFP_KERNEL);
+	if (!info->identifier)
+		goto free_info;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	list_for_each_entry(iter, &latency_list, list) {
+		if (strcmp(iter->identifier, identifier)==0) {
+			found_old = 1;
+			iter->usecs = usecs;
+			break;
+		}
+	}
+	if (!found_old)
+		list_add(&info->list, &latency_list);
+
+	if (usecs < atomic_read(&current_max_latency))
+		atomic_set(&current_max_latency, usecs);
+
+	spin_unlock_irqrestore(&latency_lock, flags);
+
+	blocking_notifier_call_chain(&latency_notifier,
+		atomic_read(&current_max_latency), NULL);
+
+	/*
+	 * if we inserted the new one, we're done; otherwise there was
+	 * an existing one so we need to free the redundant data
+	 */
+	if (!found_old)
+		return;
+
+	kfree(info->identifier);
+free_info:
+	kfree(info);
+}
+EXPORT_SYMBOL_GPL(set_acceptable_latency);
+
+/**
+ * modify_acceptable_latency - changes the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ * @usecs: maximum acceptable latency for this driver
+ *
+ * This function informs the kernel that this device(driver)
+ * can accept at most usecs latency. This setting is used for
+ * power management and similar tradeoffs.
+ *
+ * This function does not sleep and can be called in any context.
+ * Trying to use a non-existing identifier silently gets ignored.
+ *
+ * Due to the atomic nature of this function, the modified latency
+ * value will only be used for future decisions; past decisions
+ * can still lead to longer latencies in the near future.
+ */
+void modify_acceptable_latency(char *identifier, int usecs)
+{
+	struct latency_info *iter;
+	unsigned long flags;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	list_for_each_entry(iter, &latency_list, list) {
+		if (strcmp(iter->identifier, identifier) == 0) {
+			iter->usecs = usecs;
+			break;
+		}
+	}
+	if (usecs < atomic_read(&current_max_latency))
+		atomic_set(&current_max_latency, usecs);
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+EXPORT_SYMBOL_GPL(modify_acceptable_latency);
+
+/**
+ * remove_acceptable_latency - removes the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ *
+ * This function removes a previously set maximum latency setting
+ * for the driver and frees up any resources associated with the
+ * bookkeeping needed for this.
+ *
+ * This function does not sleep and can be called in any context.
+ * Trying to use a non-existing identifier silently gets ignored.
+ */
+void remove_acceptable_latency(char *identifier)
+{
+	unsigned long flags;
+	int newmax = 0;
+	struct latency_info *iter, *temp;
+
+	spin_lock_irqsave(&latency_lock, flags);
+
+	list_for_each_entry_safe(iter,  temp, &latency_list, list) {
+		if (strcmp(iter->identifier, identifier) == 0) {
+			list_del(&iter->list);
+			newmax = iter->usecs;
+			kfree(iter->identifier);
+			kfree(iter);
+			break;
+		}
+	}
+
+	/* If we just deleted the system wide value, we need to
+	 * recalculate with a full search
+	 */
+	if (newmax == atomic_read(&current_max_latency)) {
+		newmax = __find_max_latency();
+		atomic_set(&current_max_latency, newmax);
+	}
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+EXPORT_SYMBOL_GPL(remove_acceptable_latency);
+
+/**
+ * system_latency_constraint - queries the system wide latency maximum
+ *
+ * This function returns the system wide maximum latency in
+ * microseconds.
+ *
+ * This function does not sleep and can be called in any context.
+ */
+int system_latency_constraint(void)
+{
+	return atomic_read(&current_max_latency);
+}
+EXPORT_SYMBOL_GPL(system_latency_constraint);
+
+/**
+ * synchronize_acceptable_latency - recalculates all latency decisions
+ *
+ * This function will cause a callback to various kernel pieces that
+ * will make those pieces rethink their latency decisions. This implies
+ * that if there are overlong latencies in hardware state already, those
+ * latencies get taken right now. When this call completes no overlong
+ * latency decisions should be active anymore.
+ *
+ * Typical usecase of this is after a modify_acceptable_latency() call,
+ * which in itself is non-blocking and non-synchronizing.
+ *
+ * This function blocks and should not be called with locks held.
+ */
+
+void synchronize_acceptable_latency(void)
+{
+	blocking_notifier_call_chain(&latency_notifier,
+		atomic_read(&current_max_latency), NULL);
+}
+EXPORT_SYMBOL_GPL(synchronize_acceptable_latency);
+
+/*
+ * Latency notifier: this notifier gets called when a non-atomic new
+ * latency value gets set. The expectation nof the caller of the
+ * non-atomic set is that when the call returns, future latencies
+ * are within bounds, so the functions on the notifier list are
+ * expected to take the overlong latencies immediately, inside the
+ * callback, and not make a overlong latency decision anymore.
+ *
+ * The callback gets called when the new latency value is made
+ * active so system_latency_constraint() returns the new latency.
+ */
+int register_latency_notifier(struct notifier_block * nb)
+{
+	return blocking_notifier_chain_register(&latency_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(register_latency_notifier);
+
+int unregister_latency_notifier(struct notifier_block * nb)
+{
+	return blocking_notifier_chain_unregister(&latency_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_latency_notifier);
+
+static __init int latency_init(void)
+{
+	atomic_set(&current_max_latency, INFINITE_LATENCY);
+	/*
+	 * we don't want by default to have longer latencies than 2 ticks,
+	 * since that would cause lost ticks
+	 */
+	set_acceptable_latency("kernel", 2*1000000/HZ);
+	return 0;
+}
+
+module_init(latency_init);
-- 
cgit v1.2.3


From 756184b7d771992f4fb1998d62aebcaf3e028076 Mon Sep 17 00:00:00 2001
From: Cal Peake <cp@absolutedigital.net>
Date: Sat, 30 Sep 2006 23:27:24 -0700
Subject: [PATCH] CodingStyle cleanup for kernel/sys.c

Fix up kernel/sys.c to be consistent with CodingStyle and the rest of the
file.

Signed-off-by: Cal Peake <cp@absolutedigital.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 80 +++++++++++++++++++++++-------------------------------------
 1 file changed, 30 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index b88806c66244..2460581c928c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -607,11 +607,10 @@ static void kernel_restart_prepare(char *cmd)
 void kernel_restart(char *cmd)
 {
 	kernel_restart_prepare(cmd);
-	if (!cmd) {
+	if (!cmd)
 		printk(KERN_EMERG "Restarting system.\n");
-	} else {
+	else
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
-	}
 	machine_restart(cmd);
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
@@ -627,9 +626,8 @@ static void kernel_kexec(void)
 #ifdef CONFIG_KEXEC
 	struct kimage *image;
 	image = xchg(&kexec_image, NULL);
-	if (!image) {
+	if (!image)
 		return;
-	}
 	kernel_restart_prepare(NULL);
 	printk(KERN_EMERG "Starting new kernel\n");
 	machine_shutdown();
@@ -823,12 +821,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 		    (current->sgid == egid) ||
 		    capable(CAP_SETGID))
 			new_egid = egid;
-		else {
+		else
 			return -EPERM;
-		}
 	}
-	if (new_egid != old_egid)
-	{
+	if (new_egid != old_egid) {
 		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
@@ -857,19 +853,14 @@ asmlinkage long sys_setgid(gid_t gid)
 	if (retval)
 		return retval;
 
-	if (capable(CAP_SETGID))
-	{
-		if(old_egid != gid)
-		{
+	if (capable(CAP_SETGID)) {
+		if (old_egid != gid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->gid = current->egid = current->sgid = current->fsgid = gid;
-	}
-	else if ((gid == current->gid) || (gid == current->sgid))
-	{
-		if(old_egid != gid)
-		{
+	} else if ((gid == current->gid) || (gid == current->sgid)) {
+		if (old_egid != gid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
@@ -900,8 +891,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
 
 	switch_uid(new_user);
 
-	if(dumpclear)
-	{
+	if (dumpclear) {
 		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
@@ -957,8 +947,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 	if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
 		return -EAGAIN;
 
-	if (new_euid != old_euid)
-	{
+	if (new_euid != old_euid) {
 		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
@@ -1008,8 +997,7 @@ asmlinkage long sys_setuid(uid_t uid)
 	} else if ((uid != current->uid) && (uid != new_suid))
 		return -EPERM;
 
-	if (old_euid != uid)
-	{
+	if (old_euid != uid) {
 		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
@@ -1054,8 +1042,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 			return -EAGAIN;
 	}
 	if (euid != (uid_t) -1) {
-		if (euid != current->euid)
-		{
+		if (euid != current->euid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
@@ -1105,8 +1092,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 			return -EPERM;
 	}
 	if (egid != (gid_t) -1) {
-		if (egid != current->egid)
-		{
+		if (egid != current->egid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
@@ -1151,10 +1137,8 @@ asmlinkage long sys_setfsuid(uid_t uid)
 
 	if (uid == current->uid || uid == current->euid ||
 	    uid == current->suid || uid == current->fsuid || 
-	    capable(CAP_SETUID))
-	{
-		if (uid != old_fsuid)
-		{
+	    capable(CAP_SETUID)) {
+		if (uid != old_fsuid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
@@ -1182,10 +1166,8 @@ asmlinkage long sys_setfsgid(gid_t gid)
 
 	if (gid == current->gid || gid == current->egid ||
 	    gid == current->sgid || gid == current->fsgid || 
-	    capable(CAP_SETGID))
-	{
-		if (gid != old_fsgid)
-		{
+	    capable(CAP_SETGID)) {
+		if (gid != old_fsgid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
@@ -1321,9 +1303,9 @@ out:
 
 asmlinkage long sys_getpgid(pid_t pid)
 {
-	if (!pid) {
+	if (!pid)
 		return process_group(current);
-	} else {
+	else {
 		int retval;
 		struct task_struct *p;
 
@@ -1353,9 +1335,9 @@ asmlinkage long sys_getpgrp(void)
 
 asmlinkage long sys_getsid(pid_t pid)
 {
-	if (!pid) {
+	if (!pid)
 		return current->signal->session;
-	} else {
+	else {
 		int retval;
 		struct task_struct *p;
 
@@ -1363,7 +1345,7 @@ asmlinkage long sys_getsid(pid_t pid)
 		p = find_task_by_pid(pid);
 
 		retval = -ESRCH;
-		if(p) {
+		if (p) {
 			retval = security_task_getsid(p);
 			if (!retval)
 				retval = p->signal->session;
@@ -1431,9 +1413,9 @@ struct group_info *groups_alloc(int gidsetsize)
 	group_info->nblocks = nblocks;
 	atomic_set(&group_info->usage, 1);
 
-	if (gidsetsize <= NGROUPS_SMALL) {
+	if (gidsetsize <= NGROUPS_SMALL)
 		group_info->blocks[0] = group_info->small_block;
-	} else {
+	else {
 		for (i = 0; i < nblocks; i++) {
 			gid_t *b;
 			b = (void *)__get_free_page(GFP_USER);
@@ -1489,7 +1471,7 @@ static int groups_to_user(gid_t __user *grouplist,
 /* fill a group_info from a user-space array - it must be allocated already */
 static int groups_from_user(struct group_info *group_info,
     gid_t __user *grouplist)
- {
+{
 	int i;
 	int count = group_info->ngroups;
 
@@ -1647,9 +1629,8 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
 int in_group_p(gid_t grp)
 {
 	int retval = 1;
-	if (grp != current->fsgid) {
+	if (grp != current->fsgid)
 		retval = groups_search(current->group_info, grp);
-	}
 	return retval;
 }
 
@@ -1658,9 +1639,8 @@ EXPORT_SYMBOL(in_group_p);
 int in_egroup_p(gid_t grp)
 {
 	int retval = 1;
-	if (grp != current->egid) {
+	if (grp != current->egid)
 		retval = groups_search(current->group_info, grp);
-	}
 	return retval;
 }
 
@@ -1775,9 +1755,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
 	task_lock(current->group_leader);
 	x = current->signal->rlim[resource];
 	task_unlock(current->group_leader);
-	if(x.rlim_cur > 0x7FFFFFFF)
+	if (x.rlim_cur > 0x7FFFFFFF)
 		x.rlim_cur = 0x7FFFFFFF;
-	if(x.rlim_max > 0x7FFFFFFF)
+	if (x.rlim_max > 0x7FFFFFFF)
 		x.rlim_max = 0x7FFFFFFF;
 	return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
 }
-- 
cgit v1.2.3


From ef6edc9746dc2bfdacf44eefd5f881179971c478 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Sat, 30 Sep 2006 23:27:43 -0700
Subject: [PATCH] Directed yield: cpu_relax variants for spinlocks and rw-locks

On systems running with virtual cpus there is optimization potential in
regard to spinlocks and rw-locks.  If the virtual cpu that has taken a lock
is known to a cpu that wants to acquire the same lock it is beneficial to
yield the timeslice of the virtual cpu in favour of the cpu that has the
lock (directed yield).

With CONFIG_PREEMPT="n" this can be implemented by the architecture without
common code changes.  Powerpc already does this.

With CONFIG_PREEMPT="y" the lock loops are coded with _raw_spin_trylock,
_raw_read_trylock and _raw_write_trylock in kernel/spinlock.c.  If the lock
could not be taken cpu_relax is called.  A directed yield is not possible
because cpu_relax doesn't know anything about the lock.  To be able to
yield the lock in favour of the current lock holder variants of cpu_relax
for spinlocks and rw-locks are needed.  The new _raw_spin_relax,
_raw_read_relax and _raw_write_relax primitives differ from cpu_relax
insofar that they have an argument: a pointer to the lock structure.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/spinlock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index d48143eafbfd..476c3741511b 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -215,7 +215,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock)			\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
 		while (!op##_can_lock(lock) && (lock)->break_lock)	\
-			cpu_relax();					\
+			_raw_##op##_relax(&lock->raw_lock);		\
 	}								\
 	(lock)->break_lock = 0;						\
 }									\
@@ -237,7 +237,7 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock)	\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
 		while (!op##_can_lock(lock) && (lock)->break_lock)	\
-			cpu_relax();					\
+			_raw_##op##_relax(&lock->raw_lock);		\
 	}								\
 	(lock)->break_lock = 0;						\
 	return flags;							\
-- 
cgit v1.2.3


From 4c7ee8de956fc250fe31e2fa91f6da980fabe317 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Sat, 30 Sep 2006 23:28:22 -0700
Subject: [PATCH] NTP: Move all the NTP related code to ntp.c

Move all the NTP related code to ntp.c

[akpm@osdl.org: cleanups, build fix]
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time.c        | 173 -----------------------
 kernel/time/Makefile |   2 +-
 kernel/time/ntp.c    | 389 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c       | 211 +---------------------------
 4 files changed, 391 insertions(+), 384 deletions(-)
 create mode 100644 kernel/time/ntp.c

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index 5bd489747643..0e017bff4c19 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,179 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
 	return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
 }
 
-/* we call this to notify the arch when the clock is being
- * controlled.  If no such arch routine, do nothing.
- */
-void __attribute__ ((weak)) notify_arch_cmos_timer(void)
-{
-	return;
-}
-
-/* adjtimex mainly allows reading (and writing, if superuser) of
- * kernel time-keeping variables. used by xntpd.
- */
-int do_adjtimex(struct timex *txc)
-{
-        long ltemp, mtemp, save_adjust;
-	int result;
-
-	/* In order to modify anything, you gotta be super-user! */
-	if (txc->modes && !capable(CAP_SYS_TIME))
-		return -EPERM;
-		
-	/* Now we validate the data before disabling interrupts */
-
-	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
-	  /* singleshot must not be used with any other mode bits */
-		if (txc->modes != ADJ_OFFSET_SINGLESHOT)
-			return -EINVAL;
-
-	if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
-	  /* adjustment Offset limited to +- .512 seconds */
-		if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
-			return -EINVAL;	
-
-	/* if the quartz is off by more than 10% something is VERY wrong ! */
-	if (txc->modes & ADJ_TICK)
-		if (txc->tick <  900000/USER_HZ ||
-		    txc->tick > 1100000/USER_HZ)
-			return -EINVAL;
-
-	write_seqlock_irq(&xtime_lock);
-	result = time_state;	/* mostly `TIME_OK' */
-
-	/* Save for later - semantics of adjtime is to return old value */
-	save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
-
-#if 0	/* STA_CLOCKERR is never set yet */
-	time_status &= ~STA_CLOCKERR;		/* reset STA_CLOCKERR */
-#endif
-	/* If there are input parameters, then process them */
-	if (txc->modes)
-	{
-	    if (txc->modes & ADJ_STATUS)	/* only set allowed bits */
-		time_status =  (txc->status & ~STA_RONLY) |
-			      (time_status & STA_RONLY);
-
-	    if (txc->modes & ADJ_FREQUENCY) {	/* p. 22 */
-		if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_freq = txc->freq;
-	    }
-
-	    if (txc->modes & ADJ_MAXERROR) {
-		if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_maxerror = txc->maxerror;
-	    }
-
-	    if (txc->modes & ADJ_ESTERROR) {
-		if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_esterror = txc->esterror;
-	    }
-
-	    if (txc->modes & ADJ_TIMECONST) {	/* p. 24 */
-		if (txc->constant < 0) {	/* NTP v4 uses values > 6 */
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_constant = txc->constant;
-	    }
-
-	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
-		if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
-		    /* adjtime() is independent from ntp_adjtime() */
-		    if ((time_next_adjust = txc->offset) == 0)
-			 time_adjust = 0;
-		}
-		else if (time_status & STA_PLL) {
-		    ltemp = txc->offset;
-
-		    /*
-		     * Scale the phase adjustment and
-		     * clamp to the operating range.
-		     */
-		    if (ltemp > MAXPHASE)
-		        time_offset = MAXPHASE << SHIFT_UPDATE;
-		    else if (ltemp < -MAXPHASE)
-			time_offset = -(MAXPHASE << SHIFT_UPDATE);
-		    else
-		        time_offset = ltemp << SHIFT_UPDATE;
-
-		    /*
-		     * Select whether the frequency is to be controlled
-		     * and in which mode (PLL or FLL). Clamp to the operating
-		     * range. Ugly multiply/divide should be replaced someday.
-		     */
-
-		    if (time_status & STA_FREQHOLD || time_reftime == 0)
-		        time_reftime = xtime.tv_sec;
-		    mtemp = xtime.tv_sec - time_reftime;
-		    time_reftime = xtime.tv_sec;
-		    if (time_status & STA_FLL) {
-		        if (mtemp >= MINSEC) {
-			    ltemp = (time_offset / mtemp) << (SHIFT_USEC -
-							      SHIFT_UPDATE);
-			    time_freq += shift_right(ltemp, SHIFT_KH);
-			} else /* calibration interval too short (p. 12) */
-				result = TIME_ERROR;
-		    } else {	/* PLL mode */
-		        if (mtemp < MAXSEC) {
-			    ltemp *= mtemp;
-			    time_freq += shift_right(ltemp,(time_constant +
-						       time_constant +
-						       SHIFT_KF - SHIFT_USEC));
-			} else /* calibration interval too long (p. 12) */
-				result = TIME_ERROR;
-		    }
-		    time_freq = min(time_freq, time_tolerance);
-		    time_freq = max(time_freq, -time_tolerance);
-		} /* STA_PLL */
-	    } /* txc->modes & ADJ_OFFSET */
-	    if (txc->modes & ADJ_TICK) {
-		tick_usec = txc->tick;
-		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
-	    }
-	} /* txc->modes */
-leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
-		result = TIME_ERROR;
-	
-	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
-	    txc->offset	   = save_adjust;
-	else {
-	    txc->offset = shift_right(time_offset, SHIFT_UPDATE);
-	}
-	txc->freq	   = time_freq;
-	txc->maxerror	   = time_maxerror;
-	txc->esterror	   = time_esterror;
-	txc->status	   = time_status;
-	txc->constant	   = time_constant;
-	txc->precision	   = time_precision;
-	txc->tolerance	   = time_tolerance;
-	txc->tick	   = tick_usec;
-
-	/* PPS is not implemented, so these are zero */
-	txc->ppsfreq	   = 0;
-	txc->jitter	   = 0;
-	txc->shift	   = 0;
-	txc->stabil	   = 0;
-	txc->jitcnt	   = 0;
-	txc->calcnt	   = 0;
-	txc->errcnt	   = 0;
-	txc->stbcnt	   = 0;
-	write_sequnlock_irq(&xtime_lock);
-	do_gettimeofday(&txc->time);
-	notify_arch_cmos_timer();
-	return(result);
-}
-
 asmlinkage long sys_adjtimex(struct timex __user *txc_p)
 {
 	struct timex txc;		/* Local copy of parameter */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e1dfd8e86cce..61a3907d16fb 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1 @@
-obj-y += clocksource.o jiffies.o
+obj-y += ntp.o clocksource.o jiffies.o
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
new file mode 100644
index 000000000000..8ccce15b4b23
--- /dev/null
+++ b/kernel/time/ntp.c
@@ -0,0 +1,389 @@
+/*
+ * linux/kernel/time/ntp.c
+ *
+ * NTP state machine interfaces and logic.
+ *
+ * This code was mainly moved from kernel/timer.c and kernel/time.c
+ * Please see those files for relevant copyright info and historical
+ * changelogs.
+ */
+
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+
+#include <asm/div64.h>
+#include <asm/timex.h>
+
+/* Don't completely fail for HZ > 500.  */
+int tickadj = 500/HZ ? : 1;		/* microsecs */
+
+/*
+ * phase-lock loop variables
+ */
+/* TIME_ERROR prevents overwriting the CMOS clock */
+int time_state = TIME_OK;		/* clock synchronization status	*/
+int time_status = STA_UNSYNC;		/* clock status bits		*/
+long time_offset;			/* time adjustment (us)		*/
+long time_constant = 2;			/* pll time constant		*/
+long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
+long time_precision = 1;		/* clock precision (us)		*/
+long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
+long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
+long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
+					/* frequency offset (scaled ppm)*/
+static long time_adj;			/* tick adjust (scaled 1 / HZ)	*/
+long time_reftime;			/* time at last adjustment (s)	*/
+long time_adjust;
+long time_next_adjust;
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ */
+void second_overflow(void)
+{
+	long ltemp;
+
+	/* Bump the maxerror field */
+	time_maxerror += time_tolerance >> SHIFT_USEC;
+	if (time_maxerror > NTP_PHASE_LIMIT) {
+		time_maxerror = NTP_PHASE_LIMIT;
+		time_status |= STA_UNSYNC;
+	}
+
+	/*
+	 * Leap second processing. If in leap-insert state at the end of the
+	 * day, the system clock is set back one second; if in leap-delete
+	 * state, the system clock is set ahead one second. The microtime()
+	 * routine or external clock driver will insure that reported time is
+	 * always monotonic. The ugly divides should be replaced.
+	 */
+	switch (time_state) {
+	case TIME_OK:
+		if (time_status & STA_INS)
+			time_state = TIME_INS;
+		else if (time_status & STA_DEL)
+			time_state = TIME_DEL;
+		break;
+	case TIME_INS:
+		if (xtime.tv_sec % 86400 == 0) {
+			xtime.tv_sec--;
+			wall_to_monotonic.tv_sec++;
+			/*
+			 * The timer interpolator will make time change
+			 * gradually instead of an immediate jump by one second
+			 */
+			time_interpolator_update(-NSEC_PER_SEC);
+			time_state = TIME_OOP;
+			clock_was_set();
+			printk(KERN_NOTICE "Clock: inserting leap second "
+					"23:59:60 UTC\n");
+		}
+		break;
+	case TIME_DEL:
+		if ((xtime.tv_sec + 1) % 86400 == 0) {
+			xtime.tv_sec++;
+			wall_to_monotonic.tv_sec--;
+			/*
+			 * Use of time interpolator for a gradual change of
+			 * time
+			 */
+			time_interpolator_update(NSEC_PER_SEC);
+			time_state = TIME_WAIT;
+			clock_was_set();
+			printk(KERN_NOTICE "Clock: deleting leap second "
+					"23:59:59 UTC\n");
+		}
+		break;
+	case TIME_OOP:
+		time_state = TIME_WAIT;
+		break;
+	case TIME_WAIT:
+		if (!(time_status & (STA_INS | STA_DEL)))
+		time_state = TIME_OK;
+	}
+
+	/*
+	 * Compute the phase adjustment for the next second. In PLL mode, the
+	 * offset is reduced by a fixed factor times the time constant. In FLL
+	 * mode the offset is used directly. In either mode, the maximum phase
+	 * adjustment for each second is clamped so as to spread the adjustment
+	 * over not more than the number of seconds between updates.
+	 */
+	ltemp = time_offset;
+	if (!(time_status & STA_FLL))
+		ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
+	ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
+	ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
+	time_offset -= ltemp;
+	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+
+	/*
+	 * Compute the frequency estimate and additional phase adjustment due
+	 * to frequency error for the next second.
+	 */
+	ltemp = time_freq;
+	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
+
+#if HZ == 100
+	/*
+	 * Compensate for (HZ==100) != (1 << SHIFT_HZ).  Add 25% and 3.125% to
+	 * get 128.125; => only 0.125% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
+#endif
+#if HZ == 250
+	/*
+	 * Compensate for (HZ==250) != (1 << SHIFT_HZ).  Add 1.5625% and
+	 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
+#endif
+#if HZ == 1000
+	/*
+	 * Compensate for (HZ==1000) != (1 << SHIFT_HZ).  Add 1.5625% and
+	 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
+#endif
+}
+
+/*
+ * Returns how many microseconds we need to add to xtime this tick
+ * in doing an adjustment requested with adjtime.
+ */
+static long adjtime_adjustment(void)
+{
+	long time_adjust_step;
+
+	time_adjust_step = time_adjust;
+	if (time_adjust_step) {
+		/*
+		 * We are doing an adjtime thing.  Prepare time_adjust_step to
+		 * be within bounds.  Note that a positive time_adjust means we
+		 * want the clock to run faster.
+		 *
+		 * Limit the amount of the step to be in the range
+		 * -tickadj .. +tickadj
+		 */
+		time_adjust_step = min(time_adjust_step, (long)tickadj);
+		time_adjust_step = max(time_adjust_step, (long)-tickadj);
+	}
+	return time_adjust_step;
+}
+
+/* in the NTP reference this is called "hardclock()" */
+void update_ntp_one_tick(void)
+{
+	long time_adjust_step;
+
+	time_adjust_step = adjtime_adjustment();
+	if (time_adjust_step)
+		/* Reduce by this step the amount of time left  */
+		time_adjust -= time_adjust_step;
+
+	/* Changes by adjtime() do not take effect till next tick. */
+	if (time_next_adjust != 0) {
+		time_adjust = time_next_adjust;
+		time_next_adjust = 0;
+	}
+}
+
+/*
+ * Return how long ticks are at the moment, that is, how much time
+ * update_wall_time_one_tick will add to xtime next time we call it
+ * (assuming no calls to do_adjtimex in the meantime).
+ * The return value is in fixed-point nanoseconds shifted by the
+ * specified number of bits to the right of the binary point.
+ * This function has no side-effects.
+ */
+u64 current_tick_length(void)
+{
+	long delta_nsec;
+	u64 ret;
+
+	/* calculate the finest interval NTP will allow.
+	 *    ie: nanosecond value shifted by (SHIFT_SCALE - 10)
+	 */
+	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
+	ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
+	ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
+
+	return ret;
+}
+
+
+void __attribute__ ((weak)) notify_arch_cmos_timer(void)
+{
+	return;
+}
+
+/* adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ */
+int do_adjtimex(struct timex *txc)
+{
+	long ltemp, mtemp, save_adjust;
+	int result;
+
+	/* In order to modify anything, you gotta be super-user! */
+	if (txc->modes && !capable(CAP_SYS_TIME))
+		return -EPERM;
+
+	/* Now we validate the data before disabling interrupts */
+
+	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+	  /* singleshot must not be used with any other mode bits */
+		if (txc->modes != ADJ_OFFSET_SINGLESHOT)
+			return -EINVAL;
+
+	if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
+	  /* adjustment Offset limited to +- .512 seconds */
+		if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
+			return -EINVAL;
+
+	/* if the quartz is off by more than 10% something is VERY wrong ! */
+	if (txc->modes & ADJ_TICK)
+		if (txc->tick <  900000/USER_HZ ||
+		    txc->tick > 1100000/USER_HZ)
+			return -EINVAL;
+
+	write_seqlock_irq(&xtime_lock);
+	result = time_state;	/* mostly `TIME_OK' */
+
+	/* Save for later - semantics of adjtime is to return old value */
+	save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
+
+#if 0	/* STA_CLOCKERR is never set yet */
+	time_status &= ~STA_CLOCKERR;		/* reset STA_CLOCKERR */
+#endif
+	/* If there are input parameters, then process them */
+	if (txc->modes)
+	{
+	    if (txc->modes & ADJ_STATUS)	/* only set allowed bits */
+		time_status =  (txc->status & ~STA_RONLY) |
+			      (time_status & STA_RONLY);
+
+	    if (txc->modes & ADJ_FREQUENCY) {	/* p. 22 */
+		if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_freq = txc->freq;
+	    }
+
+	    if (txc->modes & ADJ_MAXERROR) {
+		if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_maxerror = txc->maxerror;
+	    }
+
+	    if (txc->modes & ADJ_ESTERROR) {
+		if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_esterror = txc->esterror;
+	    }
+
+	    if (txc->modes & ADJ_TIMECONST) {	/* p. 24 */
+		if (txc->constant < 0) {	/* NTP v4 uses values > 6 */
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_constant = txc->constant;
+	    }
+
+	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
+		if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
+		    /* adjtime() is independent from ntp_adjtime() */
+		    if ((time_next_adjust = txc->offset) == 0)
+			 time_adjust = 0;
+		}
+		else if (time_status & STA_PLL) {
+		    ltemp = txc->offset;
+
+		    /*
+		     * Scale the phase adjustment and
+		     * clamp to the operating range.
+		     */
+		    if (ltemp > MAXPHASE)
+		        time_offset = MAXPHASE << SHIFT_UPDATE;
+		    else if (ltemp < -MAXPHASE)
+			time_offset = -(MAXPHASE << SHIFT_UPDATE);
+		    else
+		        time_offset = ltemp << SHIFT_UPDATE;
+
+		    /*
+		     * Select whether the frequency is to be controlled
+		     * and in which mode (PLL or FLL). Clamp to the operating
+		     * range. Ugly multiply/divide should be replaced someday.
+		     */
+
+		    if (time_status & STA_FREQHOLD || time_reftime == 0)
+		        time_reftime = xtime.tv_sec;
+		    mtemp = xtime.tv_sec - time_reftime;
+		    time_reftime = xtime.tv_sec;
+		    if (time_status & STA_FLL) {
+		        if (mtemp >= MINSEC) {
+			    ltemp = (time_offset / mtemp) << (SHIFT_USEC -
+							      SHIFT_UPDATE);
+			    time_freq += shift_right(ltemp, SHIFT_KH);
+			} else /* calibration interval too short (p. 12) */
+				result = TIME_ERROR;
+		    } else {	/* PLL mode */
+		        if (mtemp < MAXSEC) {
+			    ltemp *= mtemp;
+			    time_freq += shift_right(ltemp,(time_constant +
+						       time_constant +
+						       SHIFT_KF - SHIFT_USEC));
+			} else /* calibration interval too long (p. 12) */
+				result = TIME_ERROR;
+		    }
+		    time_freq = min(time_freq, time_tolerance);
+		    time_freq = max(time_freq, -time_tolerance);
+		} /* STA_PLL */
+	    } /* txc->modes & ADJ_OFFSET */
+	    if (txc->modes & ADJ_TICK) {
+		tick_usec = txc->tick;
+		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
+	    }
+	} /* txc->modes */
+leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
+		result = TIME_ERROR;
+
+	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+	    txc->offset	   = save_adjust;
+	else {
+	    txc->offset = shift_right(time_offset, SHIFT_UPDATE);
+	}
+	txc->freq	   = time_freq;
+	txc->maxerror	   = time_maxerror;
+	txc->esterror	   = time_esterror;
+	txc->status	   = time_status;
+	txc->constant	   = time_constant;
+	txc->precision	   = time_precision;
+	txc->tolerance	   = time_tolerance;
+	txc->tick	   = tick_usec;
+
+	/* PPS is not implemented, so these are zero */
+	txc->ppsfreq	   = 0;
+	txc->jitter	   = 0;
+	txc->shift	   = 0;
+	txc->stabil	   = 0;
+	txc->jitcnt	   = 0;
+	txc->calcnt	   = 0;
+	txc->errcnt	   = 0;
+	txc->stbcnt	   = 0;
+	write_sequnlock_irq(&xtime_lock);
+	do_gettimeofday(&txc->time);
+	notify_arch_cmos_timer();
+	return(result);
+}
diff --git a/kernel/timer.c b/kernel/timer.c
index 4f55622b0d38..5fccc7cbf3b4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -41,12 +41,6 @@
 #include <asm/timex.h>
 #include <asm/io.h>
 
-#ifdef CONFIG_TIME_INTERPOLATION
-static void time_interpolator_update(long delta_nsec);
-#else
-#define time_interpolator_update(x)
-#endif
-
 u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
 
 EXPORT_SYMBOL(jiffies_64);
@@ -587,209 +581,6 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
 
 EXPORT_SYMBOL(xtime);
 
-/* Don't completely fail for HZ > 500.  */
-int tickadj = 500/HZ ? : 1;		/* microsecs */
-
-
-/*
- * phase-lock loop variables
- */
-/* TIME_ERROR prevents overwriting the CMOS clock */
-int time_state = TIME_OK;		/* clock synchronization status	*/
-int time_status = STA_UNSYNC;		/* clock status bits		*/
-long time_offset;			/* time adjustment (us)		*/
-long time_constant = 2;			/* pll time constant		*/
-long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
-long time_precision = 1;		/* clock precision (us)		*/
-long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
-long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
-long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
-					/* frequency offset (scaled ppm)*/
-static long time_adj;			/* tick adjust (scaled 1 / HZ)	*/
-long time_reftime;			/* time at last adjustment (s)	*/
-long time_adjust;
-long time_next_adjust;
-
-/*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
- *
- */
-static void second_overflow(void)
-{
-	long ltemp;
-
-	/* Bump the maxerror field */
-	time_maxerror += time_tolerance >> SHIFT_USEC;
-	if (time_maxerror > NTP_PHASE_LIMIT) {
-		time_maxerror = NTP_PHASE_LIMIT;
-		time_status |= STA_UNSYNC;
-	}
-
-	/*
-	 * Leap second processing. If in leap-insert state at the end of the
-	 * day, the system clock is set back one second; if in leap-delete
-	 * state, the system clock is set ahead one second. The microtime()
-	 * routine or external clock driver will insure that reported time is
-	 * always monotonic. The ugly divides should be replaced.
-	 */
-	switch (time_state) {
-	case TIME_OK:
-		if (time_status & STA_INS)
-			time_state = TIME_INS;
-		else if (time_status & STA_DEL)
-			time_state = TIME_DEL;
-		break;
-	case TIME_INS:
-		if (xtime.tv_sec % 86400 == 0) {
-			xtime.tv_sec--;
-			wall_to_monotonic.tv_sec++;
-			/*
-			 * The timer interpolator will make time change
-			 * gradually instead of an immediate jump by one second
-			 */
-			time_interpolator_update(-NSEC_PER_SEC);
-			time_state = TIME_OOP;
-			clock_was_set();
-			printk(KERN_NOTICE "Clock: inserting leap second "
-					"23:59:60 UTC\n");
-		}
-		break;
-	case TIME_DEL:
-		if ((xtime.tv_sec + 1) % 86400 == 0) {
-			xtime.tv_sec++;
-			wall_to_monotonic.tv_sec--;
-			/*
-			 * Use of time interpolator for a gradual change of
-			 * time
-			 */
-			time_interpolator_update(NSEC_PER_SEC);
-			time_state = TIME_WAIT;
-			clock_was_set();
-			printk(KERN_NOTICE "Clock: deleting leap second "
-					"23:59:59 UTC\n");
-		}
-		break;
-	case TIME_OOP:
-		time_state = TIME_WAIT;
-		break;
-	case TIME_WAIT:
-		if (!(time_status & (STA_INS | STA_DEL)))
-		time_state = TIME_OK;
-	}
-
-	/*
-	 * Compute the phase adjustment for the next second. In PLL mode, the
-	 * offset is reduced by a fixed factor times the time constant. In FLL
-	 * mode the offset is used directly. In either mode, the maximum phase
-	 * adjustment for each second is clamped so as to spread the adjustment
-	 * over not more than the number of seconds between updates.
-	 */
-	ltemp = time_offset;
-	if (!(time_status & STA_FLL))
-		ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
-	ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
-	ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
-	time_offset -= ltemp;
-	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-
-	/*
-	 * Compute the frequency estimate and additional phase adjustment due
-	 * to frequency error for the next second.
-	 */
-	ltemp = time_freq;
-	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
-
-#if HZ == 100
-	/*
-	 * Compensate for (HZ==100) != (1 << SHIFT_HZ).  Add 25% and 3.125% to
-	 * get 128.125; => only 0.125% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
-#endif
-#if HZ == 250
-	/*
-	 * Compensate for (HZ==250) != (1 << SHIFT_HZ).  Add 1.5625% and
-	 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
-#if HZ == 1000
-	/*
-	 * Compensate for (HZ==1000) != (1 << SHIFT_HZ).  Add 1.5625% and
-	 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
-}
-
-/*
- * Returns how many microseconds we need to add to xtime this tick
- * in doing an adjustment requested with adjtime.
- */
-static long adjtime_adjustment(void)
-{
-	long time_adjust_step;
-
-	time_adjust_step = time_adjust;
-	if (time_adjust_step) {
-		/*
-		 * We are doing an adjtime thing.  Prepare time_adjust_step to
-		 * be within bounds.  Note that a positive time_adjust means we
-		 * want the clock to run faster.
-		 *
-		 * Limit the amount of the step to be in the range
-		 * -tickadj .. +tickadj
-		 */
-		time_adjust_step = min(time_adjust_step, (long)tickadj);
-		time_adjust_step = max(time_adjust_step, (long)-tickadj);
-	}
-	return time_adjust_step;
-}
-
-/* in the NTP reference this is called "hardclock()" */
-static void update_ntp_one_tick(void)
-{
-	long time_adjust_step;
-
-	time_adjust_step = adjtime_adjustment();
-	if (time_adjust_step)
-		/* Reduce by this step the amount of time left  */
-		time_adjust -= time_adjust_step;
-
-	/* Changes by adjtime() do not take effect till next tick. */
-	if (time_next_adjust != 0) {
-		time_adjust = time_next_adjust;
-		time_next_adjust = 0;
-	}
-}
-
-/*
- * Return how long ticks are at the moment, that is, how much time
- * update_wall_time_one_tick will add to xtime next time we call it
- * (assuming no calls to do_adjtimex in the meantime).
- * The return value is in fixed-point nanoseconds shifted by the
- * specified number of bits to the right of the binary point.
- * This function has no side-effects.
- */
-u64 current_tick_length(void)
-{
-	long delta_nsec;
-	u64 ret;
-
-	/* calculate the finest interval NTP will allow.
-	 *    ie: nanosecond value shifted by (SHIFT_SCALE - 10)
-	 */
-	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
-	ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
-	ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
-
-	return ret;
-}
 
 /* XXX - all of this timekeeping code should be later moved to time.c */
 #include <linux/clocksource.h>
@@ -1775,7 +1566,7 @@ unsigned long time_interpolator_get_offset(void)
 #define INTERPOLATOR_ADJUST 65536
 #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
 
-static void time_interpolator_update(long delta_nsec)
+void time_interpolator_update(long delta_nsec)
 {
 	u64 counter;
 	unsigned long offset;
-- 
cgit v1.2.3


From b0ee75561beadc4db4d9a899c8ef4a7db50aa0ab Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:22 -0700
Subject: [PATCH] ntp: add ntp_update_frequency

This introduces ntp_update_frequency() and deinlines ntp_clear() (as it's not
performance critical).  ntp_update_frequency() calculates the base tick length
using tick_usec and adds a base adjustment, in case the frequency doesn't
divide evenly by HZ.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------
 kernel/timer.c    | 11 ++++-------
 2 files changed, 48 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8ccce15b4b23..77137bec2aea 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,6 +15,13 @@
 #include <asm/div64.h>
 #include <asm/timex.h>
 
+/*
+ * Timekeeping variables
+ */
+unsigned long tick_usec = TICK_USEC; 		/* USER_HZ period (usec) */
+unsigned long tick_nsec;			/* ACTHZ period (nsec) */
+static u64 tick_length, tick_length_base;
+
 /* Don't completely fail for HZ > 500.  */
 int tickadj = 500/HZ ? : 1;		/* microsecs */
 
@@ -37,6 +44,36 @@ long time_reftime;			/* time at last adjustment (s)	*/
 long time_adjust;
 long time_next_adjust;
 
+/**
+ * ntp_clear - Clears the NTP state variables
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+void ntp_clear(void)
+{
+	time_adjust = 0;		/* stop active adjtime() */
+	time_status |= STA_UNSYNC;
+	time_maxerror = NTP_PHASE_LIMIT;
+	time_esterror = NTP_PHASE_LIMIT;
+
+	ntp_update_frequency();
+
+	tick_length = tick_length_base;
+}
+
+#define CLOCK_TICK_OVERFLOW	(LATCH * HZ - CLOCK_TICK_RATE)
+#define CLOCK_TICK_ADJUST	(((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / (s64)CLOCK_TICK_RATE)
+
+void ntp_update_frequency(void)
+{
+	tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
+	tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+
+	do_div(tick_length_base, HZ);
+
+	tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
+}
+
 /*
  * this routine handles the overflow of the microsecond field
  *
@@ -151,6 +188,7 @@ void second_overflow(void)
 	 */
 	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
+	tick_length = tick_length_base;
 }
 
 /*
@@ -204,14 +242,13 @@ void update_ntp_one_tick(void)
  */
 u64 current_tick_length(void)
 {
-	long delta_nsec;
 	u64 ret;
 
 	/* calculate the finest interval NTP will allow.
 	 *    ie: nanosecond value shifted by (SHIFT_SCALE - 10)
 	 */
-	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
-	ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
+	ret = tick_length;
+	ret += (u64)(adjtime_adjustment() * 1000) << TICK_LENGTH_SHIFT;
 	ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
 
 	return ret;
@@ -351,10 +388,11 @@ int do_adjtimex(struct timex *txc)
 		    time_freq = max(time_freq, -time_tolerance);
 		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
-	    if (txc->modes & ADJ_TICK) {
+	    if (txc->modes & ADJ_TICK)
 		tick_usec = txc->tick;
-		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
-	    }
+
+	    if (txc->modes & ADJ_TICK)
+		    ntp_update_frequency();
 	} /* txc->modes */
 leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 		result = TIME_ERROR;
diff --git a/kernel/timer.c b/kernel/timer.c
index 5fccc7cbf3b4..78d3fa10fcd6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -562,12 +562,6 @@ found:
 
 /******************************************************************/
 
-/*
- * Timekeeping variables
- */
-unsigned long tick_usec = TICK_USEC; 		/* USER_HZ period (usec) */
-unsigned long tick_nsec = TICK_NSEC;		/* ACTHZ period (nsec) */
-
 /* 
  * The current time 
  * wall_to_monotonic is what we need to add to xtime (or xtime corrected 
@@ -757,10 +751,13 @@ void __init timekeeping_init(void)
 	unsigned long flags;
 
 	write_seqlock_irqsave(&xtime_lock, flags);
+
+	ntp_clear();
+
 	clock = clocksource_get_next();
 	clocksource_calculate_interval(clock, tick_nsec);
 	clock->cycle_last = clocksource_read(clock);
-	ntp_clear();
+
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 }
 
-- 
cgit v1.2.3


From ab8783b688f33c40ed7b37b814a4a1e7d341ce11 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:23 -0700
Subject: [PATCH] ntp: add time_adj to tick length

This makes time_adj local to second_overflow() and integrates it into the tick
length instead of adding it everytime.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 77137bec2aea..c09628d6b848 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -39,7 +39,6 @@ long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
 long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
 					/* frequency offset (scaled ppm)*/
-static long time_adj;			/* tick adjust (scaled 1 / HZ)	*/
 long time_reftime;			/* time at last adjustment (s)	*/
 long time_adjust;
 long time_next_adjust;
@@ -84,7 +83,7 @@ void ntp_update_frequency(void)
  */
 void second_overflow(void)
 {
-	long ltemp;
+	long ltemp, time_adj;
 
 	/* Bump the maxerror field */
 	time_maxerror += time_tolerance >> SHIFT_USEC;
@@ -189,6 +188,7 @@ void second_overflow(void)
 	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
 	tick_length = tick_length_base;
+	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
 }
 
 /*
@@ -245,11 +245,9 @@ u64 current_tick_length(void)
 	u64 ret;
 
 	/* calculate the finest interval NTP will allow.
-	 *    ie: nanosecond value shifted by (SHIFT_SCALE - 10)
 	 */
 	ret = tick_length;
 	ret += (u64)(adjtime_adjustment() * 1000) << TICK_LENGTH_SHIFT;
-	ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
 
 	return ret;
 }
-- 
cgit v1.2.3


From dc6a43e46f1b6de22701f97bec022e97088cfa90 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:24 -0700
Subject: [PATCH] ntp: add time_freq to tick length

This adds the frequency part to ntp_update_frequency().

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c09628d6b848..ab21eb06e09b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -37,8 +37,7 @@ long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
 long time_precision = 1;		/* clock precision (us)		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
-long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
-					/* frequency offset (scaled ppm)*/
+long time_freq;				/* frequency offset (scaled ppm)*/
 long time_reftime;			/* time at last adjustment (s)	*/
 long time_adjust;
 long time_next_adjust;
@@ -67,6 +66,7 @@ void ntp_update_frequency(void)
 {
 	tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
 	tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+	tick_length_base += ((s64)time_freq * NSEC_PER_USEC) << (TICK_LENGTH_SHIFT - SHIFT_USEC);
 
 	do_div(tick_length_base, HZ);
 
@@ -163,8 +163,6 @@ void second_overflow(void)
 	 * Compute the frequency estimate and additional phase adjustment due
 	 * to frequency error for the next second.
 	 */
-	ltemp = time_freq;
-	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
 
 #if HZ == 100
 	/*
@@ -389,7 +387,7 @@ int do_adjtimex(struct timex *txc)
 	    if (txc->modes & ADJ_TICK)
 		tick_usec = txc->tick;
 
-	    if (txc->modes & ADJ_TICK)
+	    if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
 		    ntp_update_frequency();
 	} /* txc->modes */
 leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
-- 
cgit v1.2.3


From 3d3675cc3d04d7fd4bb11e8c1ea79e5ade4f5e44 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:25 -0700
Subject: [PATCH] ntp: prescale time_offset

This converts time_offset into a scaled per tick value.  This avoids now
completely the crude compensation in second_overflow().

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 64 ++++++++++++++-----------------------------------------
 1 file changed, 16 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index ab21eb06e09b..238ce47ef09d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,7 +31,7 @@ int tickadj = 500/HZ ? : 1;		/* microsecs */
 /* TIME_ERROR prevents overwriting the CMOS clock */
 int time_state = TIME_OK;		/* clock synchronization status	*/
 int time_status = STA_UNSYNC;		/* clock status bits		*/
-long time_offset;			/* time adjustment (us)		*/
+long time_offset;			/* time adjustment (ns)		*/
 long time_constant = 2;			/* pll time constant		*/
 long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
 long time_precision = 1;		/* clock precision (us)		*/
@@ -57,6 +57,7 @@ void ntp_clear(void)
 	ntp_update_frequency();
 
 	tick_length = tick_length_base;
+	time_offset = 0;
 }
 
 #define CLOCK_TICK_OVERFLOW	(LATCH * HZ - CLOCK_TICK_RATE)
@@ -83,7 +84,7 @@ void ntp_update_frequency(void)
  */
 void second_overflow(void)
 {
-	long ltemp, time_adj;
+	long time_adj;
 
 	/* Bump the maxerror field */
 	time_maxerror += time_tolerance >> SHIFT_USEC;
@@ -151,42 +152,14 @@ void second_overflow(void)
 	 * adjustment for each second is clamped so as to spread the adjustment
 	 * over not more than the number of seconds between updates.
 	 */
-	ltemp = time_offset;
-	if (!(time_status & STA_FLL))
-		ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
-	ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
-	ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
-	time_offset -= ltemp;
-	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-
-	/*
-	 * Compute the frequency estimate and additional phase adjustment due
-	 * to frequency error for the next second.
-	 */
-
-#if HZ == 100
-	/*
-	 * Compensate for (HZ==100) != (1 << SHIFT_HZ).  Add 25% and 3.125% to
-	 * get 128.125; => only 0.125% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
-#endif
-#if HZ == 250
-	/*
-	 * Compensate for (HZ==250) != (1 << SHIFT_HZ).  Add 1.5625% and
-	 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
-#if HZ == 1000
-	/*
-	 * Compensate for (HZ==1000) != (1 << SHIFT_HZ).  Add 1.5625% and
-	 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
 	tick_length = tick_length_base;
-	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
+	time_adj = time_offset;
+	if (!(time_status & STA_FLL))
+		time_adj = shift_right(time_adj, SHIFT_KG + time_constant);
+	time_adj = min(time_adj, -((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC);
+	time_adj = max(time_adj, ((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC);
+	time_offset -= time_adj;
+	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE);
 }
 
 /*
@@ -347,12 +320,8 @@ int do_adjtimex(struct timex *txc)
 		     * Scale the phase adjustment and
 		     * clamp to the operating range.
 		     */
-		    if (ltemp > MAXPHASE)
-		        time_offset = MAXPHASE << SHIFT_UPDATE;
-		    else if (ltemp < -MAXPHASE)
-			time_offset = -(MAXPHASE << SHIFT_UPDATE);
-		    else
-		        time_offset = ltemp << SHIFT_UPDATE;
+		    time_offset = min(ltemp, MAXPHASE);
+		    time_offset = max(time_offset, -MAXPHASE);
 
 		    /*
 		     * Select whether the frequency is to be controlled
@@ -366,8 +335,7 @@ int do_adjtimex(struct timex *txc)
 		    time_reftime = xtime.tv_sec;
 		    if (time_status & STA_FLL) {
 		        if (mtemp >= MINSEC) {
-			    ltemp = (time_offset / mtemp) << (SHIFT_USEC -
-							      SHIFT_UPDATE);
+			    ltemp = ((time_offset << 12) / mtemp) << (SHIFT_USEC - 12);
 			    time_freq += shift_right(ltemp, SHIFT_KH);
 			} else /* calibration interval too short (p. 12) */
 				result = TIME_ERROR;
@@ -382,6 +350,7 @@ int do_adjtimex(struct timex *txc)
 		    }
 		    time_freq = min(time_freq, time_tolerance);
 		    time_freq = max(time_freq, -time_tolerance);
+		    time_offset = (time_offset * NSEC_PER_USEC / HZ) << SHIFT_UPDATE;
 		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
 	    if (txc->modes & ADJ_TICK)
@@ -395,9 +364,8 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 
 	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
 	    txc->offset	   = save_adjust;
-	else {
-	    txc->offset = shift_right(time_offset, SHIFT_UPDATE);
-	}
+	else
+	    txc->offset    = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000;
 	txc->freq	   = time_freq;
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
-- 
cgit v1.2.3


From 8f807f8d2137ba728d22820103131038639b68a9 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:25 -0700
Subject: [PATCH] ntp: add time_adjust to tick length

This folds update_ntp_one_tick() into second_overflow() and adds time_adjust
to the tick length, this makes time_next_adjust unnecessary.  This slightly
changes the adjtime() behaviour, instead of applying it to the next tick, it's
applied to the next second.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 71 ++++++++++++++-----------------------------------------
 kernel/timer.c    |  2 --
 2 files changed, 18 insertions(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 238ce47ef09d..65223b7ed810 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -22,8 +22,9 @@ unsigned long tick_usec = TICK_USEC; 		/* USER_HZ period (usec) */
 unsigned long tick_nsec;			/* ACTHZ period (nsec) */
 static u64 tick_length, tick_length_base;
 
-/* Don't completely fail for HZ > 500.  */
-int tickadj = 500/HZ ? : 1;		/* microsecs */
+#define MAX_TICKADJ		500		/* microsecs */
+#define MAX_TICKADJ_SCALED	(((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
+				  TICK_LENGTH_SHIFT) / HZ)
 
 /*
  * phase-lock loop variables
@@ -40,7 +41,6 @@ long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
 long time_freq;				/* frequency offset (scaled ppm)*/
 long time_reftime;			/* time at last adjustment (s)	*/
 long time_adjust;
-long time_next_adjust;
 
 /**
  * ntp_clear - Clears the NTP state variables
@@ -160,46 +160,19 @@ void second_overflow(void)
 	time_adj = max(time_adj, ((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC);
 	time_offset -= time_adj;
 	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE);
-}
-
-/*
- * Returns how many microseconds we need to add to xtime this tick
- * in doing an adjustment requested with adjtime.
- */
-static long adjtime_adjustment(void)
-{
-	long time_adjust_step;
-
-	time_adjust_step = time_adjust;
-	if (time_adjust_step) {
-		/*
-		 * We are doing an adjtime thing.  Prepare time_adjust_step to
-		 * be within bounds.  Note that a positive time_adjust means we
-		 * want the clock to run faster.
-		 *
-		 * Limit the amount of the step to be in the range
-		 * -tickadj .. +tickadj
-		 */
-		time_adjust_step = min(time_adjust_step, (long)tickadj);
-		time_adjust_step = max(time_adjust_step, (long)-tickadj);
-	}
-	return time_adjust_step;
-}
 
-/* in the NTP reference this is called "hardclock()" */
-void update_ntp_one_tick(void)
-{
-	long time_adjust_step;
-
-	time_adjust_step = adjtime_adjustment();
-	if (time_adjust_step)
-		/* Reduce by this step the amount of time left  */
-		time_adjust -= time_adjust_step;
-
-	/* Changes by adjtime() do not take effect till next tick. */
-	if (time_next_adjust != 0) {
-		time_adjust = time_next_adjust;
-		time_next_adjust = 0;
+	if (unlikely(time_adjust)) {
+		if (time_adjust > MAX_TICKADJ) {
+			time_adjust -= MAX_TICKADJ;
+			tick_length += MAX_TICKADJ_SCALED;
+		} else if (time_adjust < -MAX_TICKADJ) {
+			time_adjust += MAX_TICKADJ;
+			tick_length -= MAX_TICKADJ_SCALED;
+		} else {
+			time_adjust = 0;
+			tick_length += (s64)(time_adjust * NSEC_PER_USEC /
+					     HZ) << TICK_LENGTH_SHIFT;
+		}
 	}
 }
 
@@ -213,14 +186,7 @@ void update_ntp_one_tick(void)
  */
 u64 current_tick_length(void)
 {
-	u64 ret;
-
-	/* calculate the finest interval NTP will allow.
-	 */
-	ret = tick_length;
-	ret += (u64)(adjtime_adjustment() * 1000) << TICK_LENGTH_SHIFT;
-
-	return ret;
+	return tick_length;
 }
 
 
@@ -263,7 +229,7 @@ int do_adjtimex(struct timex *txc)
 	result = time_state;	/* mostly `TIME_OK' */
 
 	/* Save for later - semantics of adjtime is to return old value */
-	save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
+	save_adjust = time_adjust;
 
 #if 0	/* STA_CLOCKERR is never set yet */
 	time_status &= ~STA_CLOCKERR;		/* reset STA_CLOCKERR */
@@ -310,8 +276,7 @@ int do_adjtimex(struct timex *txc)
 	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
 		if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
 		    /* adjtime() is independent from ntp_adjtime() */
-		    if ((time_next_adjust = txc->offset) == 0)
-			 time_adjust = 0;
+		    time_adjust = txc->offset;
 		}
 		else if (time_status & STA_PLL) {
 		    ltemp = txc->offset;
diff --git a/kernel/timer.c b/kernel/timer.c
index 78d3fa10fcd6..654dd5df2417 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -937,8 +937,6 @@ static void update_wall_time(void)
 		/* interpolator bits */
 		time_interpolator_update(clock->xtime_interval
 						>> clock->shift);
-		/* increment the NTP state machine */
-		update_ntp_one_tick();
 
 		/* accumulate error between NTP and clock interval */
 		clock->error += current_tick_length();
-- 
cgit v1.2.3


From 97eebe138caaf78354b1fad233e63bafdcc4fd54 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:26 -0700
Subject: [PATCH] ntp: remove time_tolerance

time_tolerance isn't changed at all in the kernel, so simply remove it, this
simplifies the next patch, as it avoids a number of conversions.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 65223b7ed810..af7563f5d4e2 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -34,7 +34,6 @@ int time_state = TIME_OK;		/* clock synchronization status	*/
 int time_status = STA_UNSYNC;		/* clock status bits		*/
 long time_offset;			/* time adjustment (ns)		*/
 long time_constant = 2;			/* pll time constant		*/
-long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
 long time_precision = 1;		/* clock precision (us)		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
@@ -87,7 +86,7 @@ void second_overflow(void)
 	long time_adj;
 
 	/* Bump the maxerror field */
-	time_maxerror += time_tolerance >> SHIFT_USEC;
+	time_maxerror += MAXFREQ >> SHIFT_USEC;
 	if (time_maxerror > NTP_PHASE_LIMIT) {
 		time_maxerror = NTP_PHASE_LIMIT;
 		time_status |= STA_UNSYNC;
@@ -313,8 +312,8 @@ int do_adjtimex(struct timex *txc)
 			} else /* calibration interval too long (p. 12) */
 				result = TIME_ERROR;
 		    }
-		    time_freq = min(time_freq, time_tolerance);
-		    time_freq = max(time_freq, -time_tolerance);
+		    time_freq = min(time_freq, MAXFREQ);
+		    time_freq = max(time_freq, -MAXFREQ);
 		    time_offset = (time_offset * NSEC_PER_USEC / HZ) << SHIFT_UPDATE;
 		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
@@ -337,7 +336,7 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 	txc->status	   = time_status;
 	txc->constant	   = time_constant;
 	txc->precision	   = time_precision;
-	txc->tolerance	   = time_tolerance;
+	txc->tolerance	   = MAXFREQ;
 	txc->tick	   = tick_usec;
 
 	/* PPS is not implemented, so these are zero */
-- 
cgit v1.2.3


From 04b617e71e363e640e88be1e43f53fa6a3afef9f Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:27 -0700
Subject: [PATCH] ntp: convert time_freq to nsec value

This converts time_freq to a scaled nsec value and adds around 6bit of extra
resolution.  This pushes the time_freq to its 32bit limits so the calculatons
have to be done with 64bit.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index af7563f5d4e2..9137b54613e0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -66,7 +66,7 @@ void ntp_update_frequency(void)
 {
 	tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
 	tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
-	tick_length_base += ((s64)time_freq * NSEC_PER_USEC) << (TICK_LENGTH_SHIFT - SHIFT_USEC);
+	tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
 
 	do_div(tick_length_base, HZ);
 
@@ -200,6 +200,7 @@ void __attribute__ ((weak)) notify_arch_cmos_timer(void)
 int do_adjtimex(struct timex *txc)
 {
 	long ltemp, mtemp, save_adjust;
+	s64 freq_adj;
 	int result;
 
 	/* In order to modify anything, you gotta be super-user! */
@@ -245,7 +246,7 @@ int do_adjtimex(struct timex *txc)
 		    result = -EINVAL;
 		    goto leave;
 		}
-		time_freq = txc->freq;
+		time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC);
 	    }
 
 	    if (txc->modes & ADJ_MAXERROR) {
@@ -278,14 +279,14 @@ int do_adjtimex(struct timex *txc)
 		    time_adjust = txc->offset;
 		}
 		else if (time_status & STA_PLL) {
-		    ltemp = txc->offset;
+		    ltemp = txc->offset * NSEC_PER_USEC;
 
 		    /*
 		     * Scale the phase adjustment and
 		     * clamp to the operating range.
 		     */
-		    time_offset = min(ltemp, MAXPHASE);
-		    time_offset = max(time_offset, -MAXPHASE);
+		    time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC);
+		    time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC);
 
 		    /*
 		     * Select whether the frequency is to be controlled
@@ -297,24 +298,31 @@ int do_adjtimex(struct timex *txc)
 		        time_reftime = xtime.tv_sec;
 		    mtemp = xtime.tv_sec - time_reftime;
 		    time_reftime = xtime.tv_sec;
+		    freq_adj = 0;
 		    if (time_status & STA_FLL) {
 		        if (mtemp >= MINSEC) {
-			    ltemp = ((time_offset << 12) / mtemp) << (SHIFT_USEC - 12);
-			    time_freq += shift_right(ltemp, SHIFT_KH);
+			    freq_adj = (s64)time_offset << (SHIFT_NSEC - SHIFT_KH);
+			    if (time_offset < 0) {
+				freq_adj = -freq_adj;
+				do_div(freq_adj, mtemp);
+				freq_adj = -freq_adj;
+			    } else
+				do_div(freq_adj, mtemp);
 			} else /* calibration interval too short (p. 12) */
 				result = TIME_ERROR;
 		    } else {	/* PLL mode */
 		        if (mtemp < MAXSEC) {
-			    ltemp *= mtemp;
-			    time_freq += shift_right(ltemp,(time_constant +
+			    freq_adj = (s64)ltemp * mtemp;
+			    freq_adj = shift_right(freq_adj,(time_constant +
 						       time_constant +
-						       SHIFT_KF - SHIFT_USEC));
+						       SHIFT_KF - SHIFT_NSEC));
 			} else /* calibration interval too long (p. 12) */
 				result = TIME_ERROR;
 		    }
-		    time_freq = min(time_freq, MAXFREQ);
-		    time_freq = max(time_freq, -MAXFREQ);
-		    time_offset = (time_offset * NSEC_PER_USEC / HZ) << SHIFT_UPDATE;
+		    freq_adj += time_freq;
+		    freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
+		    time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
+		    time_offset = (time_offset / HZ) << SHIFT_UPDATE;
 		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
 	    if (txc->modes & ADJ_TICK)
@@ -330,7 +338,7 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 	    txc->offset	   = save_adjust;
 	else
 	    txc->offset    = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000;
-	txc->freq	   = time_freq;
+	txc->freq	   = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC);
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
-- 
cgit v1.2.3


From f19923937321244e7dc334767eb4b67e0e3d5c74 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:28 -0700
Subject: [PATCH] ntp: convert to the NTP4 reference model

This converts the kernel ntp model into a model which matches the nanokernel
reference implementations.  The previous patches already increased the
resolution and precision of the computations, so that this conversion becomes
quite simple.

<linux@horizon.com> explains:

The original NTP kernel interface was defined in units of microseconds.
That's what Linux implements.  As computers have gotten faster and can now
split microseconds easily, a new kernel interface using nanosecond units was
defined ("the nanokernel", confusing as that name is to OS hackers), and
there's an STA_NANO bit in the adjtimex() status field to tell the application
which units it's using.

The current ntpd supports both, but Linux loses some possible timing
resolution because of quantization effects, and the ntpd hackers would really
like to be able to drop the backwards compatibility code.

Ulrich Windl has been maintaining a patch set to do the conversion for years,
but it's hard to keep in sync.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 51 +++++++++++++++++++--------------------------------
 1 file changed, 19 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 9137b54613e0..1ab5e9d7fa50 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -145,18 +145,11 @@ void second_overflow(void)
 	}
 
 	/*
-	 * Compute the phase adjustment for the next second. In PLL mode, the
-	 * offset is reduced by a fixed factor times the time constant. In FLL
-	 * mode the offset is used directly. In either mode, the maximum phase
-	 * adjustment for each second is clamped so as to spread the adjustment
-	 * over not more than the number of seconds between updates.
+	 * Compute the phase adjustment for the next second. The offset is
+	 * reduced by a fixed factor times the time constant.
 	 */
 	tick_length = tick_length_base;
-	time_adj = time_offset;
-	if (!(time_status & STA_FLL))
-		time_adj = shift_right(time_adj, SHIFT_KG + time_constant);
-	time_adj = min(time_adj, -((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC);
-	time_adj = max(time_adj, ((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC);
+	time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
 	time_offset -= time_adj;
 	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE);
 
@@ -200,7 +193,7 @@ void __attribute__ ((weak)) notify_arch_cmos_timer(void)
 int do_adjtimex(struct timex *txc)
 {
 	long ltemp, mtemp, save_adjust;
-	s64 freq_adj;
+	s64 freq_adj, temp64;
 	int result;
 
 	/* In order to modify anything, you gotta be super-user! */
@@ -270,7 +263,7 @@ int do_adjtimex(struct timex *txc)
 		    result = -EINVAL;
 		    goto leave;
 		}
-		time_constant = txc->constant;
+		time_constant = min(txc->constant + 4, (long)MAXTC);
 	    }
 
 	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
@@ -298,26 +291,20 @@ int do_adjtimex(struct timex *txc)
 		        time_reftime = xtime.tv_sec;
 		    mtemp = xtime.tv_sec - time_reftime;
 		    time_reftime = xtime.tv_sec;
-		    freq_adj = 0;
-		    if (time_status & STA_FLL) {
-		        if (mtemp >= MINSEC) {
-			    freq_adj = (s64)time_offset << (SHIFT_NSEC - SHIFT_KH);
-			    if (time_offset < 0) {
-				freq_adj = -freq_adj;
-				do_div(freq_adj, mtemp);
-				freq_adj = -freq_adj;
-			    } else
-				do_div(freq_adj, mtemp);
-			} else /* calibration interval too short (p. 12) */
-				result = TIME_ERROR;
-		    } else {	/* PLL mode */
-		        if (mtemp < MAXSEC) {
-			    freq_adj = (s64)ltemp * mtemp;
-			    freq_adj = shift_right(freq_adj,(time_constant +
-						       time_constant +
-						       SHIFT_KF - SHIFT_NSEC));
-			} else /* calibration interval too long (p. 12) */
-				result = TIME_ERROR;
+
+		    freq_adj = (s64)time_offset * mtemp;
+		    freq_adj = shift_right(freq_adj, time_constant * 2 +
+					   (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
+		    if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+			temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL);
+			if (time_offset < 0) {
+			    temp64 = -temp64;
+			    do_div(temp64, mtemp);
+			    freq_adj -= temp64;
+			} else {
+			    do_div(temp64, mtemp);
+			    freq_adj += temp64;
+			}
 		    }
 		    freq_adj += time_freq;
 		    freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
-- 
cgit v1.2.3


From 70bc42f90a3f4721c89dbe865e6c95da8565b41c Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 30 Sep 2006 23:28:29 -0700
Subject: [PATCH] kernel/time/ntp.c: possible cleanups

This patch contains the following possible cleanups:
- make the following needlessly global function static:
  - ntp_update_frequency()
- make the following needlessly global variables static:
  - time_state
  - time_offset
  - time_constant
  - time_reftime
- remove the following read-only global variable:
  - time_precision

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 1ab5e9d7fa50..47195fa0ec4f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -30,17 +30,31 @@ static u64 tick_length, tick_length_base;
  * phase-lock loop variables
  */
 /* TIME_ERROR prevents overwriting the CMOS clock */
-int time_state = TIME_OK;		/* clock synchronization status	*/
+static int time_state = TIME_OK;	/* clock synchronization status	*/
 int time_status = STA_UNSYNC;		/* clock status bits		*/
-long time_offset;			/* time adjustment (ns)		*/
-long time_constant = 2;			/* pll time constant		*/
-long time_precision = 1;		/* clock precision (us)		*/
+static long time_offset;		/* time adjustment (ns)		*/
+static long time_constant = 2;		/* pll time constant		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
 long time_freq;				/* frequency offset (scaled ppm)*/
-long time_reftime;			/* time at last adjustment (s)	*/
+static long time_reftime;		/* time at last adjustment (s)	*/
 long time_adjust;
 
+#define CLOCK_TICK_OVERFLOW	(LATCH * HZ - CLOCK_TICK_RATE)
+#define CLOCK_TICK_ADJUST	(((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \
+					(s64)CLOCK_TICK_RATE)
+
+static void ntp_update_frequency(void)
+{
+	tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
+	tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+	tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
+
+	do_div(tick_length_base, HZ);
+
+	tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
+}
+
 /**
  * ntp_clear - Clears the NTP state variables
  *
@@ -59,20 +73,6 @@ void ntp_clear(void)
 	time_offset = 0;
 }
 
-#define CLOCK_TICK_OVERFLOW	(LATCH * HZ - CLOCK_TICK_RATE)
-#define CLOCK_TICK_ADJUST	(((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / (s64)CLOCK_TICK_RATE)
-
-void ntp_update_frequency(void)
-{
-	tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
-	tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
-	tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
-
-	do_div(tick_length_base, HZ);
-
-	tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
-}
-
 /*
  * this routine handles the overflow of the microsecond field
  *
@@ -330,7 +330,7 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
 	txc->constant	   = time_constant;
-	txc->precision	   = time_precision;
+	txc->precision	   = 1;
 	txc->tolerance	   = MAXFREQ;
 	txc->tick	   = tick_usec;
 
-- 
cgit v1.2.3


From 8ef386092d7c2891bd7acefb2a87f878f7e9a0d6 Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Sat, 30 Sep 2006 23:28:31 -0700
Subject: [PATCH] kill wall_jiffies

With 2.6.18-rc4-mm2, now wall_jiffies will always be the same as jiffies.
So we can kill wall_jiffies completely.

This is just a cleanup and logically should not change any real behavior
except for one thing: RTC updating code in (old) ppc and xtensa use a
condition "jiffies - wall_jiffies == 1".  This condition is never met so I
suppose it is just a bug.  I just remove that condition only instead of
kill the whole "if" block.

[heiko.carstens@de.ibm.com: s390 build fix and cleanup]
Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ian Molton <spyro@f2s.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Hirokazu Takata <takata.hirokazu@renesas.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
Cc: Richard Curnow <rc@rc0.org.uk>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Cc: Chris Zankel <chris@zankel.net>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 654dd5df2417..c1c7fbcffec1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -768,7 +768,7 @@ static int timekeeping_suspended;
  * @dev:	unused
  *
  * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
+ * xtime/wall_to_monotonic/jiffies/etc are
  * still managed by arch specific suspend/resume code.
  */
 static int timekeeping_resume(struct sys_device *dev)
@@ -1016,9 +1016,6 @@ static inline void calc_load(unsigned long ticks)
 	}
 }
 
-/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies = INITIAL_JIFFIES;
-
 /*
  * This read-write spinlock protects us from races in SMP while
  * playing with xtime and avenrun.
@@ -1056,7 +1053,6 @@ void run_local_timers(void)
  */
 static inline void update_times(unsigned long ticks)
 {
-	wall_jiffies += ticks;
 	update_wall_time();
 	calc_load(ticks);
 }
-- 
cgit v1.2.3


From 0ae646845b603e9df5711084436d389f8371ffb3 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@in.ibm.com>
Date: Sat, 30 Sep 2006 23:28:53 -0700
Subject: [PATCH] Fix taskstats size calculation (use the new genetlink utility
 functions)

The addition of the CSA patch pushed the size of struct taskstats to 256
bytes.  This exposed a problem with prepare_reply(), we were not allocating
space for the netlink and genetlink header.  It worked earlier because
alloc_skb() would align the skb to SMP_CACHE_BYTES, which added some additonal
bytes.

Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jamal Hadi <hadi@cyberus.ca>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2ed4040d0dc5..c451af2ddb50 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -75,7 +75,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	/*
 	 * If new attributes are added, please revisit this allocation
 	 */
-	skb = nlmsg_new(size, GFP_KERNEL);
+	skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL);
 	if (!skb)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From f3cef7a99469afc159fec3a61b42dc7ca5b6824f Mon Sep 17 00:00:00 2001
From: Jay Lan <jlan@engr.sgi.com>
Date: Sat, 30 Sep 2006 23:28:55 -0700
Subject: [PATCH] csa: basic accounting over taskstats

Add some basic accounting fields to the taskstats struct, add a new
kernel/tsacct.c to handle basic accounting data handling upon exit.  A handle
is added to taskstats.c to invoke the basic accounting data handling.

Signed-off-by: Jay Lan <jlan@sgi.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Chris Sturtivant <csturtiv@sgi.com>
Cc: Tony Ernst <tee@sgi.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Cc: "Michal Piotrowski" <michal.k.k.piotrowski@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/Makefile    |  2 +-
 kernel/taskstats.c |  4 +++
 kernel/tsacct.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 kernel/tsacct.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index e210e8cf7237..aacaafb28b9d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,7 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
-obj-$(CONFIG_TASKSTATS) += taskstats.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index c451af2ddb50..6c38dce88e8c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -18,6 +18,7 @@
 
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
+#include <linux/tsacct_kern.h>
 #include <linux/delayacct.h>
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
@@ -198,7 +199,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 	 */
 
 	delayacct_add_tsk(stats, tsk);
+
+	/* fill in basic acct fields */
 	stats->version = TASKSTATS_VERSION;
+	bacct_add_tsk(stats, tsk);
 
 	/* Define err: label here if needed */
 	put_task_struct(tsk);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
new file mode 100644
index 000000000000..899067950a88
--- /dev/null
+++ b/kernel/tsacct.c
@@ -0,0 +1,72 @@
+/*
+ * tsacct.c - System accounting over taskstats interface
+ *
+ * Copyright (C) Jay Lan,	<jlan@sgi.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/acct.h>
+
+
+#define USEC_PER_TICK	(USEC_PER_SEC/HZ)
+/*
+ * fill in basic accounting fields
+ */
+void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+	struct timespec uptime, ts;
+	s64 ac_etime;
+
+	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
+
+	/* calculate task elapsed time in timespec */
+	do_posix_clock_monotonic_gettime(&uptime);
+	ts = timespec_sub(uptime, current->group_leader->start_time);
+	/* rebase elapsed time to usec */
+	ac_etime = timespec_to_ns(&ts);
+	do_div(ac_etime, NSEC_PER_USEC);
+	stats->ac_etime = ac_etime;
+	stats->ac_btime = xtime.tv_sec - ts.tv_sec;
+	if (thread_group_leader(tsk)) {
+		stats->ac_exitcode = tsk->exit_code;
+		if (tsk->flags & PF_FORKNOEXEC)
+			stats->ac_flag |= AFORK;
+	}
+	if (tsk->flags & PF_SUPERPRIV)
+		stats->ac_flag |= ASU;
+	if (tsk->flags & PF_DUMPCORE)
+		stats->ac_flag |= ACORE;
+	if (tsk->flags & PF_SIGNALED)
+		stats->ac_flag |= AXSIG;
+	stats->ac_nice	 = task_nice(tsk);
+	stats->ac_sched	 = tsk->policy;
+	stats->ac_uid	 = tsk->uid;
+	stats->ac_gid	 = tsk->gid;
+	stats->ac_pid	 = tsk->pid;
+	stats->ac_ppid	 = (tsk->parent) ? tsk->parent->pid : 0;
+	stats->ac_utime	 = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
+	stats->ac_stime	 = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
+	stats->ac_minflt = tsk->min_flt;
+	stats->ac_majflt = tsk->maj_flt;
+	/* Each process gets a minimum of one usec cpu time */
+	if ((stats->ac_utime == 0) && (stats->ac_stime == 0)) {
+		stats->ac_stime = 1;
+	}
+
+	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
+}
+
-- 
cgit v1.2.3


From 9acc1853519a0473620d424105f9d49ea5b4e62e Mon Sep 17 00:00:00 2001
From: Jay Lan <jlan@engr.sgi.com>
Date: Sat, 30 Sep 2006 23:28:58 -0700
Subject: [PATCH] csa: Extended system accounting over taskstats

Add extended system accounting handling over taskstats interface.  A
CONFIG_TASK_XACCT flag is created to enable the extended accounting code.

Signed-off-by: Jay Lan <jlan@sgi.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Chris Sturtivant <csturtiv@sgi.com>
Cc: Tony Ernst <tee@sgi.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c |  4 ++++
 kernel/tsacct.c    | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 6c38dce88e8c..5d6a8c54ee85 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -20,6 +20,7 @@
 #include <linux/taskstats_kern.h>
 #include <linux/tsacct_kern.h>
 #include <linux/delayacct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 #include <net/genetlink.h>
@@ -204,6 +205,9 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 	stats->version = TASKSTATS_VERSION;
 	bacct_add_tsk(stats, tsk);
 
+	/* fill in extended acct fields */
+	xacct_add_tsk(stats, tsk);
+
 	/* Define err: label here if needed */
 	put_task_struct(tsk);
 	return rc;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 899067950a88..410483490cf6 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -70,3 +70,22 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
 }
 
+
+#ifdef CONFIG_TASK_XACCT
+/*
+ * fill in extended accounting fields
+ */
+void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
+{
+	stats->acct_rss_mem1 = p->acct_rss_mem1;
+	stats->acct_vm_mem1  = p->acct_vm_mem1;
+	if (p->mm) {
+		stats->hiwater_rss   = p->mm->hiwater_rss;
+		stats->hiwater_vm    = p->mm->hiwater_vm;
+	}
+	stats->read_char	= p->rchar;
+	stats->write_char	= p->wchar;
+	stats->read_syscalls	= p->syscr;
+	stats->write_syscalls	= p->syscw;
+}
+#endif
-- 
cgit v1.2.3


From 8f0ab5147951267134612570604cf8341901a80c Mon Sep 17 00:00:00 2001
From: Jay Lan <jlan@engr.sgi.com>
Date: Sat, 30 Sep 2006 23:28:59 -0700
Subject: [PATCH] csa: convert CONFIG tag for extended accounting routines

There were a few accounting data/macros that are used in CSA but are #ifdef'ed
inside CONFIG_BSD_PROCESS_ACCT.  This patch is to change those ifdef's from
CONFIG_BSD_PROCESS_ACCT to CONFIG_TASK_XACCT.  A few defines are moved from
kernel/acct.c and include/linux/acct.h to kernel/tsacct.c and
include/linux/tsacct_kern.h.

Signed-off-by: Jay Lan <jlan@sgi.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Chris Sturtivant <csturtiv@sgi.com>
Cc: Tony Ernst <tee@sgi.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c   | 30 ------------------------------
 kernel/exit.c   |  1 +
 kernel/fork.c   |  1 +
 kernel/sched.c  |  2 +-
 kernel/tsacct.c | 30 ++++++++++++++++++++++++++++++
 5 files changed, 33 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index f4330acead46..0aad5ca36a81 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -602,33 +602,3 @@ void acct_process(void)
 	do_acct_process(file);
 	fput(file);
 }
-
-
-/**
- * acct_update_integrals - update mm integral fields in task_struct
- * @tsk: task_struct for accounting
- */
-void acct_update_integrals(struct task_struct *tsk)
-{
-	if (likely(tsk->mm)) {
-		long delta =
-			cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
-
-		if (delta == 0)
-			return;
-		tsk->acct_stimexpd = tsk->stime;
-		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
-		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
-	}
-}
-
-/**
- * acct_clear_integrals - clear the mm integral fields in task_struct
- * @tsk: task_struct whose accounting fields are cleared
- */
-void acct_clear_integrals(struct task_struct *tsk)
-{
-	tsk->acct_stimexpd = 0;
-	tsk->acct_rss_mem1 = 0;
-	tsk->acct_vm_mem1 = 0;
-}
diff --git a/kernel/exit.c b/kernel/exit.c
index c189de2927ab..3b47f26985f2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -18,6 +18,7 @@
 #include <linux/security.h>
 #include <linux/cpu.h>
 #include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/file.h>
 #include <linux/binfmts.h>
 #include <linux/ptrace.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 1c999f3e0b47..89f666491d1f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index 74f169ac0773..2bbd948f0169 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,7 +49,7 @@
 #include <linux/seq_file.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
 #include <asm/tlb.h>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 410483490cf6..47c71daa416f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -88,4 +88,34 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 	stats->read_syscalls	= p->syscr;
 	stats->write_syscalls	= p->syscw;
 }
+
+
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+	if (likely(tsk->mm)) {
+		long delta =
+			cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
+
+		if (delta == 0)
+			return;
+		tsk->acct_stimexpd = tsk->stime;
+		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
+		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+	}
+}
+
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
+ */
+void acct_clear_integrals(struct task_struct *tsk)
+{
+	tsk->acct_stimexpd = 0;
+	tsk->acct_rss_mem1 = 0;
+	tsk->acct_vm_mem1 = 0;
+}
 #endif
-- 
cgit v1.2.3


From db5fed26b2e0beed939b773dd5896077a1794d65 Mon Sep 17 00:00:00 2001
From: Jay Lan <jlan@sgi.com>
Date: Sat, 30 Sep 2006 23:29:00 -0700
Subject: [PATCH] csa accounting taskstats update

ChangeLog:
   Feedbacks from Andrew Morton:
   - define TS_COMM_LEN to 32
   - change acct_stimexpd field of task_struct to be of
     cputime_t, which is to be used to save the tsk->stime
     of last timer interrupt update.
   - a new Documentation/accounting/taskstats-struct.txt
     to describe fields of taskstats struct.

   Feedback from Balbir Singh:
   - keep the stime of a task to be zero when both stime
     and utime are zero as recoreded in task_struct.

   Misc:
   - convert accumulated RSS/VM from platform dependent
     pages-ticks to MBytes-usecs in the kernel

Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Chris Sturtivant <csturtiv@sgi.com>
Cc: Tony Ernst <tee@sgi.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/tsacct.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 47c71daa416f..db443221ba5b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/tsacct_kern.h>
 #include <linux/acct.h>
+#include <linux/jiffies.h>
 
 
 #define USEC_PER_TICK	(USEC_PER_SEC/HZ)
@@ -62,33 +63,35 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 	stats->ac_stime	 = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
 	stats->ac_minflt = tsk->min_flt;
 	stats->ac_majflt = tsk->maj_flt;
-	/* Each process gets a minimum of one usec cpu time */
-	if ((stats->ac_utime == 0) && (stats->ac_stime == 0)) {
-		stats->ac_stime = 1;
-	}
 
 	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
 }
 
 
 #ifdef CONFIG_TASK_XACCT
+
+#define KB 1024
+#define MB (1024*KB)
 /*
  * fill in extended accounting fields
  */
 void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
-	stats->acct_rss_mem1 = p->acct_rss_mem1;
-	stats->acct_vm_mem1  = p->acct_vm_mem1;
+	/* convert pages-jiffies to Mbyte-usec */
+	stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
+	stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
 	if (p->mm) {
-		stats->hiwater_rss   = p->mm->hiwater_rss;
-		stats->hiwater_vm    = p->mm->hiwater_vm;
+		/* adjust to KB unit */
+		stats->hiwater_rss   = p->mm->hiwater_rss * PAGE_SIZE / KB;
+		stats->hiwater_vm    = p->mm->hiwater_vm * PAGE_SIZE / KB;
 	}
 	stats->read_char	= p->rchar;
 	stats->write_char	= p->wchar;
 	stats->read_syscalls	= p->syscr;
 	stats->write_syscalls	= p->syscw;
 }
-
+#undef KB
+#undef MB
 
 /**
  * acct_update_integrals - update mm integral fields in task_struct
@@ -97,8 +100,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 void acct_update_integrals(struct task_struct *tsk)
 {
 	if (likely(tsk->mm)) {
-		long delta =
-			cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
+		long delta = cputime_to_jiffies(
+			cputime_sub(tsk->stime, tsk->acct_stimexpd));
 
 		if (delta == 0)
 			return;
-- 
cgit v1.2.3


From d8c76e6f45c111c32a4b3e50a2adc9210737b0d8 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 30 Sep 2006 23:29:04 -0700
Subject: [PATCH] r/o bind mount prepwork: inc_nlink() helper

This is mostly included for parity with dec_nlink(), where we will have some
more hooks.  This one should stay pretty darn straightforward for now.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c3c400cce91..9d850ae13b1b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -377,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
 		inode->i_op = &simple_dir_inode_operations;
 		inode->i_fop = &simple_dir_operations;
 		/* directories start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 	} else {
 		return -ENOMEM;
 	}
@@ -1565,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
 		inode->i_fop = &simple_dir_operations;
 
 		/* start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 	} else if (S_ISREG(mode)) {
 		inode->i_size = 0;
 		inode->i_fop = &cpuset_file_operations;
@@ -1598,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
 	error = cpuset_create_file(dentry, S_IFDIR | mode);
 	if (!error) {
 		dentry->d_fsdata = cs;
-		parent->d_inode->i_nlink++;
+		inc_nlink(parent->d_inode);
 		cs->dentry = dentry;
 	}
 	dput(dentry);
@@ -2033,7 +2033,7 @@ int __init cpuset_init(void)
 	}
 	root = cpuset_mount->mnt_sb->s_root;
 	root->d_fsdata = &top_cpuset;
-	root->d_inode->i_nlink++;
+	inc_nlink(root->d_inode);
 	top_cpuset.dentry = root;
 	root->d_inode->i_op = &cpuset_dir_inode_operations;
 	number_of_cpusets = 1;
-- 
cgit v1.2.3


From e239ca540594cff00adcce163dc332b27015d8e5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 30 Sep 2006 23:29:27 -0700
Subject: [PATCH] Create call_usermodehelper_pipe()

A new member in the ever growing family of call_usermode* functions is
born.  The new call_usermodehelper_pipe() function allows to pipe data to
the stdin of the called user mode progam and behaves otherwise like the
normal call_usermodehelp() (except that it always waits for the child to
finish)

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kmod.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 842f8015d7fd..5c63c53014a9 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -122,6 +122,7 @@ struct subprocess_info {
 	struct key *ring;
 	int wait;
 	int retval;
+	struct file *stdin;
 };
 
 /*
@@ -145,12 +146,26 @@ static int ____call_usermodehelper(void *data)
 
 	key_put(old_session);
 
+	/* Install input pipe when needed */
+	if (sub_info->stdin) {
+		struct files_struct *f = current->files;
+		struct fdtable *fdt;
+		/* no races because files should be private here */
+		sys_close(0);
+		fd_install(0, sub_info->stdin);
+		spin_lock(&f->file_lock);
+		fdt = files_fdtable(f);
+		FD_SET(0, fdt->open_fds);
+		FD_CLR(0, fdt->close_on_exec);
+		spin_unlock(&f->file_lock);
+	}
+
 	/* We can run anywhere, unlike our parent keventd(). */
 	set_cpus_allowed(current, CPU_MASK_ALL);
 
 	retval = -EPERM;
 	if (current->fs->root)
-		retval = execve(sub_info->path, sub_info->argv,sub_info->envp);
+		retval = execve(sub_info->path, sub_info->argv, sub_info->envp);
 
 	/* Exec failed? */
 	sub_info->retval = retval;
@@ -268,6 +283,44 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 }
 EXPORT_SYMBOL(call_usermodehelper_keys);
 
+int call_usermodehelper_pipe(char *path, char **argv, char **envp,
+			     struct file **filp)
+{
+	DECLARE_COMPLETION(done);
+	struct subprocess_info sub_info = {
+		.complete	= &done,
+		.path		= path,
+		.argv		= argv,
+		.envp		= envp,
+		.retval		= 0,
+	};
+	struct file *f;
+	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
+
+	if (!khelper_wq)
+		return -EBUSY;
+
+	if (path[0] == '\0')
+		return 0;
+
+	f = create_write_pipe();
+	if (!f)
+		return -ENOMEM;
+	*filp = f;
+
+	f = create_read_pipe(f);
+	if (!f) {
+		free_write_pipe(*filp);
+		return -ENOMEM;
+	}
+	sub_info.stdin = f;
+
+	queue_work(khelper_wq, &work);
+	wait_for_completion(&done);
+	return sub_info.retval;
+}
+EXPORT_SYMBOL(call_usermodehelper_pipe);
+
 void __init usermodehelper_init(void)
 {
 	khelper_wq = create_singlethread_workqueue("khelper");
-- 
cgit v1.2.3


From d025c9db7f31fc0554ce7fb2dfc78d35a77f3487 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 30 Sep 2006 23:29:28 -0700
Subject: [PATCH] Support piping into commands in /proc/sys/kernel/core_pattern

Using the infrastructure created in previous patches implement support to
pipe core dumps into programs.

This is done by overloading the existing core_pattern sysctl
with a new syntax:

|program

When the first character of the pattern is a '|' the kernel will instead
threat the rest of the pattern as a command to run.  The core dump will be
written to the standard input of that program instead of to a file.

This is useful for having automatic core dump analysis without filling up
disks.  The program can do some simple analysis and save only a summary of
the core dump.

The core dump proces will run with the privileges and in the name space of
the process that caused the core dump.

I also increased the core pattern size to 128 bytes so that longer command
lines fit.

Most of the changes comes from allowing core dumps without seeks.  They are
fairly straight forward though.

One small incompatibility is that if someone had a core pattern previously
that started with '|' they will get suddenly new behaviour.  I think that's
unlikely to be a real problem though.

Additional background:

> Very nice, do you happen to have a program that can accept this kind of
> input for crash dumps?  I'm guessing that the embedded people will
> really want this functionality.

I had a cheesy demo/prototype.  Basically it wrote the dump to a file again,
ran gdb on it to get a backtrace and wrote the summary to a shared directory.
Then there was a simple CGI script to generate a "top 10" crashes HTML
listing.

Unfortunately this still had the disadvantage to needing full disk space for a
dump except for deleting it afterwards (in fact it was worse because over the
pipe holes didn't work so if you have a holey address map it would require
more space).

Fortunately gdb seems to be happy to handle /proc/pid/fd/xxx input pipes as
cores (at least it worked with zsh's =(cat core) syntax), so it would be
likely possible to do it without temporary space with a simple wrapper that
calls it in the right way.  I ran out of time before doing that though.

The demo prototype scripts weren't very good.  If there is really interest I
can dig them out (they are currently on a laptop disk on the desk with the
laptop itself being in service), but I would recommend to rewrite them for any
serious application of this and fix the disk space problem.

Also to be really useful it should probably find a way to automatically fetch
the debuginfos (I cheated and just installed them in advance).  If nobody else
does it I can probably do the rewrite myself again at some point.

My hope at some point was that desktops would support it in their builtin
crash reporters, but at least the KDE people I talked too seemed to be happy
with their user space only solution.

Alan sayeth:

  I don't believe that piping as such as neccessarily the right model, but
  the ability to intercept and processes core dumps from user space is asked
  for by many enterprise users as well.  They want to know about, capture,
  analyse and process core dumps, often centrally and in automated form.

[akpm@osdl.org: loff_t != unsigned long]
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kmod.c   | 4 ++++
 kernel/sysctl.c | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5c63c53014a9..f8121b95183f 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -35,6 +35,7 @@
 #include <linux/mount.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/resource.h>
 #include <asm/uaccess.h>
 
 extern int max_threads;
@@ -158,6 +159,9 @@ static int ____call_usermodehelper(void *data)
 		FD_SET(0, fdt->open_fds);
 		FD_CLR(0, fdt->close_on_exec);
 		spin_unlock(&f->file_lock);
+
+		/* and disallow core files too */
+		current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
 	}
 
 	/* We can run anywhere, unlike our parent keventd(). */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c57c4532e296..ba42694f0453 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -294,7 +294,7 @@ static ctl_table kern_table[] = {
 		.ctl_name	= KERN_CORE_PATTERN,
 		.procname	= "core_pattern",
 		.data		= core_pattern,
-		.maxlen		= 64,
+		.maxlen		= 128,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
-- 
cgit v1.2.3


From 2bc2d61a9638dab670d8361e928d1a5a291173ef Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Mon, 2 Oct 2006 02:17:02 -0700
Subject: [PATCH] list module taint flags in Oops/panic

When listing loaded modules during an oops or panic, also list each
module's Tainted flags if non-zero (P: Proprietary or F: Forced load only).

If a module is did not taint the kernel, it is just listed like
	usbcore
but if it did taint the kernel, it is listed like
	wizmodem(PF)

Example:
[ 3260.121718] Unable to handle kernel NULL pointer dereference at 0000000000000000 RIP:
[ 3260.121729]  [<ffffffff8804c099>] :dump_test:proc_dump_test+0x99/0xc8
[ 3260.121742] PGD fe8d067 PUD 264a6067 PMD 0
[ 3260.121748] Oops: 0002 [1] SMP
[ 3260.121753] CPU 1
[ 3260.121756] Modules linked in: dump_test(P) snd_pcm_oss snd_mixer_oss snd_seq snd_seq_device ide_cd generic ohci1394 snd_hda_intel snd_hda_codec snd_pcm snd_timer snd ieee1394 snd_page_alloc piix ide_core arcmsr aic79xx scsi_transport_spi usblp
[ 3260.121785] Pid: 5556, comm: bash Tainted: P      2.6.18-git10 #1

[Alternatively, I can look into listing tainted flags with 'lsmod',
but that won't help in oopsen/panics so much.]

[akpm@osdl.org: cleanup]
Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 05625d5dc758..7c77a0a9275c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -851,6 +851,7 @@ static int check_version(Elf_Shdr *sechdrs,
 		printk("%s: no version for \"%s\" found: kernel tainted.\n",
 		       mod->name, symname);
 		add_taint(TAINT_FORCED_MODULE);
+		mod->taints |= TAINT_FORCED_MODULE;
 	}
 	return 1;
 }
@@ -1339,6 +1340,7 @@ static void set_license(struct module *mod, const char *license)
 		printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
 		       mod->name, license);
 		add_taint(TAINT_PROPRIETARY_MODULE);
+		mod->taints |= TAINT_PROPRIETARY_MODULE;
 	}
 }
 
@@ -1618,6 +1620,7 @@ static struct module *load_module(void __user *umod,
 	/* This is allowed: modprobe --force will invalidate it. */
 	if (!modmagic) {
 		add_taint(TAINT_FORCED_MODULE);
+		mod->taints |= TAINT_FORCED_MODULE;
 		printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
 		       mod->name);
 	} else if (!same_magic(modmagic, vermagic)) {
@@ -1711,10 +1714,14 @@ static struct module *load_module(void __user *umod,
 	/* Set up license info based on the info section */
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
 
-	if (strcmp(mod->name, "ndiswrapper") == 0)
+	if (strcmp(mod->name, "ndiswrapper") == 0) {
 		add_taint(TAINT_PROPRIETARY_MODULE);
-	if (strcmp(mod->name, "driverloader") == 0)
+		mod->taints |= TAINT_PROPRIETARY_MODULE;
+	}
+	if (strcmp(mod->name, "driverloader") == 0) {
 		add_taint(TAINT_PROPRIETARY_MODULE);
+		mod->taints |= TAINT_PROPRIETARY_MODULE;
+	}
 
 	/* Set up MODINFO_ATTR fields */
 	setup_modinfo(mod, sechdrs, infoindex);
@@ -1760,6 +1767,7 @@ static struct module *load_module(void __user *umod,
 		printk(KERN_WARNING "%s: No versions for exported symbols."
 		       " Tainting kernel.\n", mod->name);
 		add_taint(TAINT_FORCED_MODULE);
+		mod->taints |= TAINT_FORCED_MODULE;
 	}
 #endif
 
@@ -2226,14 +2234,37 @@ struct module *module_text_address(unsigned long addr)
 	return mod;
 }
 
+static char *taint_flags(unsigned int taints, char *buf)
+{
+	*buf = '\0';
+	if (taints) {
+		int bx;
+
+		buf[0] = '(';
+		bx = 1;
+		if (taints & TAINT_PROPRIETARY_MODULE)
+			buf[bx++] = 'P';
+		if (taints & TAINT_FORCED_MODULE)
+			buf[bx++] = 'F';
+		/*
+		 * TAINT_FORCED_RMMOD: could be added.
+		 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+		 * apply to modules.
+		 */
+		buf[bx] = ')';
+	}
+	return buf;
+}
+
 /* Don't grab lock, we're oopsing. */
 void print_modules(void)
 {
 	struct module *mod;
+	char buf[8];
 
 	printk("Modules linked in:");
 	list_for_each_entry(mod, &modules, list)
-		printk(" %s", mod->name);
+		printk(" %s%s", mod->name, taint_flags(mod->taints, buf));
 	printk("\n");
 }
 
-- 
cgit v1.2.3


From 0804ef4b0de7121261f77c565b20a11ac694e877 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 2 Oct 2006 02:17:04 -0700
Subject: [PATCH] proc: readdir race fix (take 3)

The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls.  For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.

This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.

Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in on system call.

This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned.  For directory that are
either created or destroyed during the readdir you may or may not see them.
 Furthermore you may seek to a directory offset you have previously seen.

These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects.  Plus it is a simple semantic to
implement reliable service.  It is just a matter of calling readdir a
second time if you are wondering if something new has show up.

These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.

The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm.  Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 200 pids.  There
are only 40 cache lines for the entire 32K pid space.  A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.

If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.

In addition this takes no additional locks and is actually less code than
what we are doing now.

Also another very subtle bug in this area has been fixed.  It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of it's thread group leader.  This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.

Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.

[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 8387e8c68193..ed89a732432c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -145,6 +145,23 @@ static int alloc_pidmap(void)
 	return -1;
 }
 
+static int next_pidmap(int last)
+{
+	int offset;
+	pidmap_t *map;
+
+	offset = (last + 1) & BITS_PER_PAGE_MASK;
+	map = &pidmap_array[(last + 1)/BITS_PER_PAGE];
+	for (; map < &pidmap_array[PIDMAP_ENTRIES]; map++, offset = 0) {
+		if (unlikely(!map->page))
+			continue;
+		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
+		if (offset < BITS_PER_PAGE)
+			return mk_pid(map, offset);
+	}
+	return -1;
+}
+
 fastcall void put_pid(struct pid *pid)
 {
 	if (!pid)
@@ -302,6 +319,25 @@ struct pid *find_get_pid(pid_t nr)
 	return pid;
 }
 
+/*
+ * Used by proc to find the first pid that is greater then or equal to nr.
+ *
+ * If there is a pid at nr this function is exactly the same as find_pid.
+ */
+struct pid *find_ge_pid(int nr)
+{
+	struct pid *pid;
+
+	do {
+		pid = find_pid(nr);
+		if (pid)
+			break;
+		nr = next_pidmap(nr);
+	} while (nr > 0);
+
+	return pid;
+}
+
 /*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
-- 
cgit v1.2.3


From c4b92fc112f7be5cce308128236ff75cc98535c3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 2 Oct 2006 02:17:10 -0700
Subject: [PATCH] pid: implement signal functions that take a struct pid *

Currently the signal functions all either take a task or a pid_t argument.
This patch implements variants that take a struct pid *.  After all of the
users have been update it is my intention to remove the variants that take a
pid_t as using pid_t can be more work (an extra hash table lookup) and
difficult to get right in the presence of multiple pid namespaces.

There are two kinds of functions introduced in this patch.  The are the
general use functions kill_pgrp and kill_pid which take a priv argument that
is ultimately used to create the appropriate siginfo information, Then there
are _kill_pgrp_info, kill_pgrp_info, kill_pid_info the internal implementation
helpers that take an explicit siginfo.

The distinction is made because filling out an explcit siginfo is tricky, and
will be even more tricky when pid namespaces are introduced.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 47 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index fb5da6d19f14..5230ddcb1757 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1055,28 +1055,44 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 }
 
 /*
- * kill_pg_info() sends a signal to a process group: this is what the tty
+ * kill_pgrp_info() sends a signal to a process group: this is what the tty
  * control characters do (^C, ^Z etc)
  */
 
-int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
+int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
 {
 	struct task_struct *p = NULL;
 	int retval, success;
 
-	if (pgrp <= 0)
-		return -EINVAL;
-
 	success = 0;
 	retval = -ESRCH;
-	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 		int err = group_send_sig_info(sig, info, p);
 		success |= !err;
 		retval = err;
-	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 	return success ? 0 : retval;
 }
 
+int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
+{
+	int retval;
+
+	read_lock(&tasklist_lock);
+	retval = __kill_pgrp_info(sig, info, pgrp);
+	read_unlock(&tasklist_lock);
+
+	return retval;
+}
+
+int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
+{
+	if (pgrp <= 0)
+		return -EINVAL;
+
+	return __kill_pgrp_info(sig, info, find_pid(pgrp));
+}
+
 int
 kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
 {
@@ -1089,8 +1105,7 @@ kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
 	return retval;
 }
 
-int
-kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
 {
 	int error;
 	int acquired_tasklist_lock = 0;
@@ -1101,7 +1116,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 		read_lock(&tasklist_lock);
 		acquired_tasklist_lock = 1;
 	}
-	p = find_task_by_pid(pid);
+	p = pid_task(pid, PIDTYPE_PID);
 	error = -ESRCH;
 	if (p)
 		error = group_send_sig_info(sig, info, p);
@@ -1111,6 +1126,16 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 	return error;
 }
 
+int
+kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+{
+	int error;
+	rcu_read_lock();
+	error = kill_pid_info(sig, info, find_pid(pid));
+	rcu_read_unlock();
+	return error;
+}
+
 /* like kill_proc_info(), but doesn't use uid/euid of "current" */
 int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
 		      uid_t uid, uid_t euid, u32 secid)
@@ -1264,6 +1289,18 @@ force_sigsegv(int sig, struct task_struct *p)
 	return 0;
 }
 
+int kill_pgrp(struct pid *pid, int sig, int priv)
+{
+	return kill_pgrp_info(sig, __si_special(priv), pid);
+}
+EXPORT_SYMBOL(kill_pgrp);
+
+int kill_pid(struct pid *pid, int sig, int priv)
+{
+	return kill_pid_info(sig, __si_special(priv), pid);
+}
+EXPORT_SYMBOL(kill_pid);
+
 int
 kill_pg(pid_t pgrp, int sig, int priv)
 {
-- 
cgit v1.2.3


From bbf73147e2d46611fbdcbc126f887c614c32350b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 2 Oct 2006 02:17:11 -0700
Subject: [PATCH] pid: export the symbols needed to use struct pid *

pids aren't something that drivers should care about.  However there are a lot
of helper layers in the kernel that do care, and are built as modules.  Before
I can convert them to using struct pid instead of pid_t I need to export the
appropriate symbols so they can continue to be built.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index ed89a732432c..5c9037ba42f5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -170,6 +170,7 @@ fastcall void put_pid(struct pid *pid)
 	     atomic_dec_and_test(&pid->count))
 		kmem_cache_free(pid_cachep, pid);
 }
+EXPORT_SYMBOL_GPL(put_pid);
 
 static void delayed_put_pid(struct rcu_head *rhp)
 {
@@ -234,6 +235,7 @@ struct pid * fastcall find_pid(int nr)
 	}
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(find_pid);
 
 int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
 {
@@ -337,6 +339,7 @@ struct pid *find_ge_pid(int nr)
 
 	return pid;
 }
+EXPORT_SYMBOL_GPL(find_get_pid);
 
 /*
  * The pid hash table is scaled according to the amount of memory in the
-- 
cgit v1.2.3


From 609d7fa9565c754428d2520cac2accc9052e1245 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 2 Oct 2006 02:17:15 -0700
Subject: [PATCH] file: modify struct fown_struct to use a struct pid

File handles can be requested to send sigio and sigurg to processes.  By
tracking the destination processes using struct pid instead of pid_t we make
the interface safe from all potential pid wrap around problems.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 4b6770e9806d..4aaf91951a43 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1527,7 +1527,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
 	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
 
 	if (signal) {
-		err = f_setown(filp, current->pid, 1);
+		err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
 		if (err < 0) {
 			goto error;
 		}
-- 
cgit v1.2.3


From 6a1f3b84557774a46af68747c92d8f36382027ae Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Date: Mon, 2 Oct 2006 02:17:20 -0700
Subject: [PATCH] pids: coding style: use struct pidmap

Use struct pidmap instead of pidmap_t.

Its a subset of Eric Biederman's patch http://lkml.org/lkml/2006/2/6/271.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 5c9037ba42f5..0a45de2918e2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -53,12 +53,12 @@ int pid_max_max = PID_MAX_LIMIT;
  * value does not cause lots of bitmaps to be allocated, but
  * the scheme scales to up to 4 million PIDs, runtime.
  */
-typedef struct pidmap {
+struct pidmap {
 	atomic_t nr_free;
 	void *page;
-} pidmap_t;
+};
 
-static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
+static struct pidmap pidmap_array[PIDMAP_ENTRIES] =
 	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
 
 /*
@@ -78,7 +78,7 @@ static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
 static fastcall void free_pidmap(int pid)
 {
-	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
+	struct pidmap *map = pidmap_array + pid / BITS_PER_PAGE;
 	int offset = pid & BITS_PER_PAGE_MASK;
 
 	clear_bit(offset, map->page);
@@ -88,7 +88,7 @@ static fastcall void free_pidmap(int pid)
 static int alloc_pidmap(void)
 {
 	int i, offset, max_scan, pid, last = last_pid;
-	pidmap_t *map;
+	struct pidmap *map;
 
 	pid = last + 1;
 	if (pid >= pid_max)
-- 
cgit v1.2.3


From c88be3eb2e01bbb21c9ccdc3805f0d3546c1898c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 2 Oct 2006 02:17:21 -0700
Subject: [PATCH] pids coding style use struct pidmap in next_pidmap

Use struct pidmap instead of pidmap_t.

This updates my proc: readdir race fix (take 3) patch
to account for the changes made by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
to kill pidmap_t.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 0a45de2918e2..373639a10d84 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -148,7 +148,7 @@ static int alloc_pidmap(void)
 static int next_pidmap(int last)
 {
 	int offset;
-	pidmap_t *map;
+	struct pidmap *map;
 
 	offset = (last + 1) & BITS_PER_PAGE_MASK;
 	map = &pidmap_array[(last + 1)/BITS_PER_PAGE];
-- 
cgit v1.2.3


From aa5a6662f93f52605b6c447ba6f7291e92f515c5 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Date: Mon, 2 Oct 2006 02:17:23 -0700
Subject: [PATCH] Move pidmap to pspace.h

Move struct pidmap and PIDMAP_ENTRIES to a new file, include/linux/pspace.h
where it will be used in subsequent patches to define pid spaces.

Its a subset of Eric Biederman's patch http://lkml.org/lkml/2006/2/6/285

[akpm@osdl.org: cleanups]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 373639a10d84..8234bd08a3cb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/hash.h>
+#include <linux/pspace.h>
 
 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
 static struct hlist_head *pid_hash;
@@ -40,7 +41,6 @@ int last_pid;
 int pid_max_min = RESERVED_PIDS + 1;
 int pid_max_max = PID_MAX_LIMIT;
 
-#define PIDMAP_ENTRIES		((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
 #define BITS_PER_PAGE		(PAGE_SIZE*8)
 #define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
 #define mk_pid(map, off)	(((map) - pidmap_array)*BITS_PER_PAGE + (off))
@@ -53,11 +53,6 @@ int pid_max_max = PID_MAX_LIMIT;
  * value does not cause lots of bitmaps to be allocated, but
  * the scheme scales to up to 4 million PIDs, runtime.
  */
-struct pidmap {
-	atomic_t nr_free;
-	void *page;
-};
-
 static struct pidmap pidmap_array[PIDMAP_ENTRIES] =
 	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
 
-- 
cgit v1.2.3


From 3fbc96486459324e182717b03c50c90c880be6ec Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Date: Mon, 2 Oct 2006 02:17:24 -0700
Subject: [PATCH] Define struct pspace

Define a per-container pid space object.  And create one instance of this
object, init_pspace, to define the entire pid space.  Subsequent patches
will provide/use interfaces to create/destroy pid spaces.

Its a subset/rework of Eric Biederman's patch
http://lkml.org/lkml/2006/2/6/285 .

Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 53 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 8234bd08a3cb..89107b7481af 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -34,7 +34,6 @@ static int pidhash_shift;
 static kmem_cache_t *pid_cachep;
 
 int pid_max = PID_MAX_DEFAULT;
-int last_pid;
 
 #define RESERVED_PIDS		300
 
@@ -43,7 +42,12 @@ int pid_max_max = PID_MAX_LIMIT;
 
 #define BITS_PER_PAGE		(PAGE_SIZE*8)
 #define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
-#define mk_pid(map, off)	(((map) - pidmap_array)*BITS_PER_PAGE + (off))
+
+static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
+{
+	return (map - pspace->pidmap)*BITS_PER_PAGE + off;
+}
+
 #define find_next_offset(map, off)					\
 		find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
 
@@ -53,8 +57,12 @@ int pid_max_max = PID_MAX_LIMIT;
  * value does not cause lots of bitmaps to be allocated, but
  * the scheme scales to up to 4 million PIDs, runtime.
  */
-static struct pidmap pidmap_array[PIDMAP_ENTRIES] =
-	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
+struct pspace init_pspace = {
+	.pidmap = {
+		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
+	},
+	.last_pid = 0
+};
 
 /*
  * Note: disable interrupts while the pidmap_lock is held as an
@@ -69,40 +77,41 @@ static struct pidmap pidmap_array[PIDMAP_ENTRIES] =
  * irq handlers that take it we can leave the interrupts enabled.
  * For now it is easier to be safe than to prove it can't happen.
  */
+
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
-static fastcall void free_pidmap(int pid)
+static fastcall void free_pidmap(struct pspace *pspace, int pid)
 {
-	struct pidmap *map = pidmap_array + pid / BITS_PER_PAGE;
+	struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE;
 	int offset = pid & BITS_PER_PAGE_MASK;
 
 	clear_bit(offset, map->page);
 	atomic_inc(&map->nr_free);
 }
 
-static int alloc_pidmap(void)
+static int alloc_pidmap(struct pspace *pspace)
 {
-	int i, offset, max_scan, pid, last = last_pid;
+	int i, offset, max_scan, pid, last = pspace->last_pid;
 	struct pidmap *map;
 
 	pid = last + 1;
 	if (pid >= pid_max)
 		pid = RESERVED_PIDS;
 	offset = pid & BITS_PER_PAGE_MASK;
-	map = &pidmap_array[pid/BITS_PER_PAGE];
+	map = &pspace->pidmap[pid/BITS_PER_PAGE];
 	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
 	for (i = 0; i <= max_scan; ++i) {
 		if (unlikely(!map->page)) {
-			unsigned long page = get_zeroed_page(GFP_KERNEL);
+			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 			/*
 			 * Free the page if someone raced with us
 			 * installing it:
 			 */
 			spin_lock_irq(&pidmap_lock);
 			if (map->page)
-				free_page(page);
+				kfree(page);
 			else
-				map->page = (void *)page;
+				map->page = page;
 			spin_unlock_irq(&pidmap_lock);
 			if (unlikely(!map->page))
 				break;
@@ -111,11 +120,11 @@ static int alloc_pidmap(void)
 			do {
 				if (!test_and_set_bit(offset, map->page)) {
 					atomic_dec(&map->nr_free);
-					last_pid = pid;
+					pspace->last_pid = pid;
 					return pid;
 				}
 				offset = find_next_offset(map, offset);
-				pid = mk_pid(map, offset);
+				pid = mk_pid(pspace, map, offset);
 			/*
 			 * find_next_offset() found a bit, the pid from it
 			 * is in-bounds, and if we fell back to the last
@@ -126,16 +135,16 @@ static int alloc_pidmap(void)
 					(i != max_scan || pid < last ||
 					    !((last+1) & BITS_PER_PAGE_MASK)));
 		}
-		if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) {
+		if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
 			++map;
 			offset = 0;
 		} else {
-			map = &pidmap_array[0];
+			map = &pspace->pidmap[0];
 			offset = RESERVED_PIDS;
 			if (unlikely(last == offset))
 				break;
 		}
-		pid = mk_pid(map, offset);
+		pid = mk_pid(pspace, map, offset);
 	}
 	return -1;
 }
@@ -182,7 +191,7 @@ fastcall void free_pid(struct pid *pid)
 	hlist_del_rcu(&pid->pid_chain);
 	spin_unlock_irqrestore(&pidmap_lock, flags);
 
-	free_pidmap(pid->nr);
+	free_pidmap(&init_pspace, pid->nr);
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
@@ -196,7 +205,7 @@ struct pid *alloc_pid(void)
 	if (!pid)
 		goto out;
 
-	nr = alloc_pidmap();
+	nr = alloc_pidmap(&init_pspace);
 	if (nr < 0)
 		goto out_free;
 
@@ -363,10 +372,10 @@ void __init pidhash_init(void)
 
 void __init pidmap_init(void)
 {
-	pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
+	init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 	/* Reserve PID 0. We never call free_pidmap(0) */
-	set_bit(0, pidmap_array->page);
-	atomic_dec(&pidmap_array->nr_free);
+	set_bit(0, init_pspace.pidmap[0].page);
+	atomic_dec(&init_pspace.pidmap[0].nr_free);
 
 	pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
 					__alignof__(struct pid),
-- 
cgit v1.2.3


From f40f50d3bb33b52dfd550ca80be7daaddad21883 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 2 Oct 2006 02:17:25 -0700
Subject: [PATCH] Use struct pspace in next_pidmap and find_ge_pid

This updates my proc: readdir race fix (take 3) patch
to account for the changes made by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
to introduce struct pspace.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 89107b7481af..e4779bbb2058 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -149,19 +149,20 @@ static int alloc_pidmap(struct pspace *pspace)
 	return -1;
 }
 
-static int next_pidmap(int last)
+static int next_pidmap(struct pspace *pspace, int last)
 {
 	int offset;
-	struct pidmap *map;
+	struct pidmap *map, *end;
 
 	offset = (last + 1) & BITS_PER_PAGE_MASK;
-	map = &pidmap_array[(last + 1)/BITS_PER_PAGE];
-	for (; map < &pidmap_array[PIDMAP_ENTRIES]; map++, offset = 0) {
+	map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE];
+	end = &pspace->pidmap[PIDMAP_ENTRIES];
+	for (; map < end; map++, offset = 0) {
 		if (unlikely(!map->page))
 			continue;
 		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
 		if (offset < BITS_PER_PAGE)
-			return mk_pid(map, offset);
+			return mk_pid(pspace, map, offset);
 	}
 	return -1;
 }
@@ -338,7 +339,7 @@ struct pid *find_ge_pid(int nr)
 		pid = find_pid(nr);
 		if (pid)
 			break;
-		nr = next_pidmap(nr);
+		nr = next_pidmap(&init_pspace, nr);
 	} while (nr > 0);
 
 	return pid;
-- 
cgit v1.2.3


From 2425c08b37244005ff221efe4957d8aaff18609c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 2 Oct 2006 02:17:28 -0700
Subject: [PATCH] usb: fixup usb so it uses struct pid

The problem with remembering a user space process by its pid is that it is
possible that the process will exit, pid wrap around will occur.
Converting to a struct pid avoid that problem, and paves the way for
implementing a pid namespace.

Also since usb is the only user of kill_proc_info_as_uid rename
kill_proc_info_as_uid to kill_pid_info_as_uid and have the new version take
a struct pid.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 5230ddcb1757..7ed8d5304bec 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1136,8 +1136,8 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 	return error;
 }
 
-/* like kill_proc_info(), but doesn't use uid/euid of "current" */
-int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
+/* like kill_pid_info(), but doesn't use uid/euid of "current" */
+int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 		      uid_t uid, uid_t euid, u32 secid)
 {
 	int ret = -EINVAL;
@@ -1147,7 +1147,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
 		return ret;
 
 	read_lock(&tasklist_lock);
-	p = find_task_by_pid(pid);
+	p = pid_task(pid, PIDTYPE_PID);
 	if (!p) {
 		ret = -ESRCH;
 		goto out_unlock;
@@ -1171,7 +1171,7 @@ out_unlock:
 	read_unlock(&tasklist_lock);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kill_proc_info_as_uid);
+EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
 
 /*
  * kill_something_info() interprets pid in interesting ways just like kill(2).
-- 
cgit v1.2.3


From 3a872d89baae821a0f6e2c1055d4b47650661137 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Mon, 2 Oct 2006 02:17:30 -0700
Subject: [PATCH] Kprobes: Make kprobe modules more portable

In an effort to make kprobe modules more portable, here is a patch that:

o Introduces the "symbol_name" field to struct kprobe.
  The symbol->address resolution now happens in the kernel in an
  architecture agnostic manner. 64-bit powerpc users no longer have
  to specify the ".symbols"
o Introduces the "offset" field to struct kprobe to allow a user to
  specify an offset into a symbol.
o The legacy mechanism of specifying the kprobe.addr is still supported.
  However, if both the kprobe.addr and kprobe.symbol_name are specified,
  probe registration fails with an -EINVAL.
o The symbol resolution code uses kallsyms_lookup_name(). So
  CONFIG_KPROBES now depends on CONFIG_KALLSYMS
o Apparantly kprobe modules were the only legitimate out-of-tree user of
  the kallsyms_lookup_name() EXPORT. Now that the symbol resolution
  happens in-kernel, remove the EXPORT as suggested by Christoph Hellwig
o Modify tcp_probe.c that uses the kprobe interface so as to make it
  work on multiple platforms (in its earlier form, the code wouldn't
  work, say, on powerpc)

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kallsyms.c |  1 -
 kernel/kprobes.c  | 26 ++++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index ab16a5a4cfe9..342bca62c496 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -154,7 +154,6 @@ unsigned long kallsyms_lookup_name(const char *name)
 	}
 	return module_kallsyms_lookup_name(name);
 }
-EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
 
 /*
  * Lookup an address
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3f57dfdc8f92..f66b8e681b4d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <linux/kallsyms.h>
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
@@ -45,6 +46,16 @@
 #define KPROBE_HASH_BITS 6
 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
 
+
+/*
+ * Some oddball architectures like 64bit powerpc have function descriptors
+ * so this must be overridable.
+ */
+#ifndef kprobe_lookup_name
+#define kprobe_lookup_name(name, addr) \
+	addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
+#endif
+
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 static atomic_t kprobe_count;
@@ -447,6 +458,21 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 	struct kprobe *old_p;
 	struct module *probed_mod;
 
+	/*
+	 * If we have a symbol_name argument look it up,
+	 * and add it to the address.  That way the addr
+	 * field can either be global or relative to a symbol.
+	 */
+	if (p->symbol_name) {
+		if (p->addr)
+			return -EINVAL;
+		kprobe_lookup_name(p->symbol_name, p->addr);
+	}
+
+	if (!p->addr)
+		return -EINVAL;
+	p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
+
 	if ((!kernel_text_address((unsigned long) p->addr)) ||
 		in_kprobes_functions((unsigned long) p->addr))
 		return -EINVAL;
-- 
cgit v1.2.3


From 62c27be0dd8144e11bd3ed054a0fb890579925f8 Mon Sep 17 00:00:00 2001
From: "bibo,mao" <bibo.mao@intel.com>
Date: Mon, 2 Oct 2006 02:17:33 -0700
Subject: [PATCH] kprobe whitespace cleanup

Whitespace is used to indent, this patch cleans up these sentences by
kernel coding style.

Signed-off-by: bibo, mao <bibo.mao@intel.com>
Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f66b8e681b4d..41dfda50e22a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -347,17 +347,17 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
  */
 void __kprobes kprobe_flush_task(struct task_struct *tk)
 {
-        struct kretprobe_instance *ri;
-        struct hlist_head *head;
+	struct kretprobe_instance *ri;
+	struct hlist_head *head;
 	struct hlist_node *node, *tmp;
 	unsigned long flags = 0;
 
 	spin_lock_irqsave(&kretprobe_lock, flags);
-        head = kretprobe_inst_table_head(tk);
-        hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
-                if (ri->task == tk)
-                        recycle_rp_inst(ri);
-        }
+	head = kretprobe_inst_table_head(tk);
+	hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+		if (ri->task == tk)
+			recycle_rp_inst(ri);
+	}
 	spin_unlock_irqrestore(&kretprobe_lock, flags);
 }
 
@@ -514,7 +514,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 				(ARCH_INACTIVE_KPROBE_COUNT + 1))
 		register_page_fault_notifier(&kprobe_page_fault_nb);
 
-  	arch_arm_kprobe(p);
+	arch_arm_kprobe(p);
 
 out:
 	mutex_unlock(&kprobe_mutex);
-- 
cgit v1.2.3


From f2aa85a0ccd90110e76c6375535adc3ae358f971 Mon Sep 17 00:00:00 2001
From: "bibo,mao" <bibo.mao@intel.com>
Date: Mon, 2 Oct 2006 02:17:34 -0700
Subject: [PATCH] disallow kprobes on notifier_call_chain

When kprobe is re-entered, the re-entered kprobe kernel path will will call
atomic_notifier_call_chain function, if this function is kprobed that will
incur numerous kprobe recursive fault.  This patch disallows kprobes on
atomic_notifier_call_chain function.

Signed-off-by: bibo, mao <bibo.mao@intel.com>
Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 2460581c928c..398d57923f95 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -221,7 +221,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
  *	of the last notifier function called.
  */
  
-int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
 		unsigned long val, void *v)
 {
 	int ret;
-- 
cgit v1.2.3


From 99219a3fbc2dcf2eaa954f7b2ac27299fd7894cd Mon Sep 17 00:00:00 2001
From: "bibo,mao" <bibo.mao@intel.com>
Date: Mon, 2 Oct 2006 02:17:35 -0700
Subject: [PATCH] kretprobe spinlock deadlock patch

kprobe_flush_task() possibly calls kfree function during holding
kretprobe_lock spinlock, if kfree function is probed by kretprobe that will
incur spinlock deadlock.  This patch moves kfree function out scope of
kretprobe_lock.

Signed-off-by: bibo, mao <bibo.mao@intel.com>
Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 41dfda50e22a..610c837ad9e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -319,7 +319,8 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri)
 }
 
 /* Called with kretprobe_lock held */
-void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
+void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
+				struct hlist_head *head)
 {
 	/* remove rp inst off the rprobe_inst_table */
 	hlist_del(&ri->hlist);
@@ -331,7 +332,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
 		hlist_add_head(&ri->uflist, &ri->rp->free_instances);
 	} else
 		/* Unregistering */
-		kfree(ri);
+		hlist_add_head(&ri->hlist, head);
 }
 
 struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
@@ -348,17 +349,23 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
 void __kprobes kprobe_flush_task(struct task_struct *tk)
 {
 	struct kretprobe_instance *ri;
-	struct hlist_head *head;
+	struct hlist_head *head, empty_rp;
 	struct hlist_node *node, *tmp;
 	unsigned long flags = 0;
 
+	INIT_HLIST_HEAD(&empty_rp);
 	spin_lock_irqsave(&kretprobe_lock, flags);
 	head = kretprobe_inst_table_head(tk);
 	hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
 		if (ri->task == tk)
-			recycle_rp_inst(ri);
+			recycle_rp_inst(ri, &empty_rp);
 	}
 	spin_unlock_irqrestore(&kretprobe_lock, flags);
+
+	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+		hlist_del(&ri->hlist);
+		kfree(ri);
+	}
 }
 
 static inline void free_rp_inst(struct kretprobe *rp)
-- 
cgit v1.2.3


From e16b38f71322efd8a221f64b6ddc0748d21d2e1a Mon Sep 17 00:00:00 2001
From: Greg Banks <gnb@melbourne.sgi.com>
Date: Mon, 2 Oct 2006 02:17:40 -0700
Subject: [PATCH] cpumask: export cpu_online_map and cpu_possible_map
 consistently

cpumask: ensure that the cpu_online_map and cpu_possible_map bitmasks, and
hence all the macros in <linux/cpumask.h> that require them, are available to
modules for all supported combinations of architecture and CONFIG_SMP.

Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2bbd948f0169..e4e54e86f4a2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4384,7 +4384,10 @@ EXPORT_SYMBOL(cpu_present_map);
 
 #ifndef CONFIG_SMP
 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
 #endif
 
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
-- 
cgit v1.2.3


From f5dd3d6fadf98a53b35d20427ca198fda42f1251 Mon Sep 17 00:00:00 2001
From: Sam Vilain <sam.vilain@catalyst.net.nz>
Date: Mon, 2 Oct 2006 02:18:04 -0700
Subject: [PATCH] proc: sysctl: add _proc_do_string helper

The logic in proc_do_string is worth re-using without passing in a
ctl_table structure (say, we want to calculate a pointer and pass that in
instead); pass in the two fields it uses from that structure as explicit
arguments.

Signed-off-by: Sam Vilain <sam.vilain@catalyst.net.nz>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 65 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba42694f0453..8b5f4bb8ad2f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1624,32 +1624,14 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf,
 	return do_rw_proc(1, file, (char __user *) buf, count, ppos);
 }
 
-/**
- * proc_dostring - read a string sysctl
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes a string from/to the user buffer. If the kernel
- * buffer provided is not large enough to hold the string, the
- * string is truncated. The copied string is %NULL-terminated.
- * If the string is being read by the user process, it is copied
- * and a newline '\n' is added. It is truncated if the buffer is
- * not large enough.
- *
- * Returns 0 on success.
- */
-int proc_dostring(ctl_table *table, int write, struct file *filp,
-		  void __user *buffer, size_t *lenp, loff_t *ppos)
+int _proc_do_string(void* data, int maxlen, int write, struct file *filp,
+		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	size_t len;
 	char __user *p;
 	char c;
 	
-	if (!table->data || !table->maxlen || !*lenp ||
+	if (!data || !maxlen || !*lenp ||
 	    (*ppos && !write)) {
 		*lenp = 0;
 		return 0;
@@ -1665,20 +1647,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
 				break;
 			len++;
 		}
-		if (len >= table->maxlen)
-			len = table->maxlen-1;
-		if(copy_from_user(table->data, buffer, len))
+		if (len >= maxlen)
+			len = maxlen-1;
+		if(copy_from_user(data, buffer, len))
 			return -EFAULT;
-		((char *) table->data)[len] = 0;
+		((char *) data)[len] = 0;
 		*ppos += *lenp;
 	} else {
-		len = strlen(table->data);
-		if (len > table->maxlen)
-			len = table->maxlen;
+		len = strlen(data);
+		if (len > maxlen)
+			len = maxlen;
 		if (len > *lenp)
 			len = *lenp;
 		if (len)
-			if(copy_to_user(buffer, table->data, len))
+			if(copy_to_user(buffer, data, len))
 				return -EFAULT;
 		if (len < *lenp) {
 			if(put_user('\n', ((char __user *) buffer) + len))
@@ -1691,6 +1673,31 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
 	return 0;
 }
 
+/**
+ * proc_dostring - read a string sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a string from/to the user buffer. If the kernel
+ * buffer provided is not large enough to hold the string, the
+ * string is truncated. The copied string is %NULL-terminated.
+ * If the string is being read by the user process, it is copied
+ * and a newline '\n' is added. It is truncated if the buffer is
+ * not large enough.
+ *
+ * Returns 0 on success.
+ */
+int proc_dostring(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return _proc_do_string(table->data, table->maxlen, write, filp,
+			       buffer, lenp, ppos);
+}
+
 /*
  *	Special case of dostring for the UTS structure. This has locks
  *	to observe. Should this be in kernel/sys.c ????
-- 
cgit v1.2.3


From b1ba4ddde0cf67991d89f039365eaaeda61aa027 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 2 Oct 2006 02:18:05 -0700
Subject: [PATCH] make kernel/sysctl.c:_proc_do_string() static

This patch makes the needlessly global _proc_do_string() static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8b5f4bb8ad2f..8f3f00d5ee52 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1624,8 +1624,9 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf,
 	return do_rw_proc(1, file, (char __user *) buf, count, ppos);
 }
 
-int _proc_do_string(void* data, int maxlen, int write, struct file *filp,
-		    void __user *buffer, size_t *lenp, loff_t *ppos)
+static int _proc_do_string(void* data, int maxlen, int write,
+			   struct file *filp, void __user *buffer,
+			   size_t *lenp, loff_t *ppos)
 {
 	size_t len;
 	char __user *p;
-- 
cgit v1.2.3


From ab516013ad9ca47f1d3a936fa81303bfbf734d52 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 2 Oct 2006 02:18:06 -0700
Subject: [PATCH] namespaces: add nsproxy

This patch adds a nsproxy structure to the task struct.  Later patches will
move the fs namespace pointer into this structure, and introduce a new utsname
namespace into the nsproxy.

The vserver and openvz functionality, then, would be implemented in large part
by virtualizing/isolating more and more resources into namespaces, each
contained in the nsproxy.

[akpm@osdl.org: build fix]
Signed-off-by: Serge Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/Makefile  |  2 +-
 kernel/exit.c    |  7 ++++++
 kernel/fork.c    | 18 ++++++++++++-
 kernel/nsproxy.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 102 insertions(+), 2 deletions(-)
 create mode 100644 kernel/nsproxy.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index aacaafb28b9d..6ec53009b866 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o latency.o
+	    hrtimer.o rwsem.o latency.o nsproxy.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
diff --git a/kernel/exit.c b/kernel/exit.c
index 3b47f26985f2..1d0e9ea1fa05 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -21,6 +21,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/file.h>
 #include <linux/binfmts.h>
+#include <linux/nsproxy.h>
 #include <linux/ptrace.h>
 #include <linux/profile.h>
 #include <linux/mount.h>
@@ -397,9 +398,14 @@ void daemonize(const char *name, ...)
 	fs = init_task.fs;
 	current->fs = fs;
 	atomic_inc(&fs->count);
+
 	exit_namespace(current);
+	exit_task_namespaces(current);
 	current->namespace = init_task.namespace;
+	current->nsproxy = init_task.nsproxy;
 	get_namespace(current->namespace);
+	get_task_namespaces(current);
+
  	exit_files(current);
 	current->files = init_task.files;
 	atomic_inc(&current->files->count);
@@ -918,6 +924,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	__exit_files(tsk);
 	__exit_fs(tsk);
 	exit_namespace(tsk);
+	exit_task_namespaces(tsk);
 	exit_thread();
 	cpuset_exit(tsk);
 	exit_keys(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 89f666491d1f..c9e660ae47aa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -27,6 +27,7 @@
 #include <linux/binfmts.h>
 #include <linux/mman.h>
 #include <linux/fs.h>
+#include <linux/nsproxy.h>
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
@@ -1116,8 +1117,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_signal;
 	if ((retval = copy_keys(clone_flags, p)))
 		goto bad_fork_cleanup_mm;
-	if ((retval = copy_namespace(clone_flags, p)))
+	if ((retval = copy_namespaces(clone_flags, p)))
 		goto bad_fork_cleanup_keys;
+	if ((retval = copy_namespace(clone_flags, p)))
+		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
 		goto bad_fork_cleanup_namespace;
@@ -1262,6 +1265,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 bad_fork_cleanup_namespace:
 	exit_namespace(p);
+bad_fork_cleanup_namespaces:
+	exit_task_namespaces(p);
 bad_fork_cleanup_keys:
 	exit_keys(p);
 bad_fork_cleanup_mm:
@@ -1606,6 +1611,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
 	struct files_struct *fd, *new_fd = NULL;
 	struct sem_undo_list *new_ulist = NULL;
+	struct nsproxy *new_nsproxy, *old_nsproxy;
 
 	check_unshare_flags(&unshare_flags);
 
@@ -1632,7 +1638,15 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 
 	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
 
+		old_nsproxy = current->nsproxy;
+		new_nsproxy = dup_namespaces(old_nsproxy);
+		if (!new_nsproxy) {
+			err = -ENOMEM;
+			goto bad_unshare_cleanup_semundo;
+		}
+
 		task_lock(current);
+		current->nsproxy = new_nsproxy;
 
 		if (new_fs) {
 			fs = current->fs;
@@ -1668,8 +1682,10 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		}
 
 		task_unlock(current);
+		put_nsproxy(old_nsproxy);
 	}
 
+bad_unshare_cleanup_semundo:
 bad_unshare_cleanup_fd:
 	if (new_fd)
 		put_files_struct(new_fd);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
new file mode 100644
index 000000000000..ad9508865473
--- /dev/null
+++ b/kernel/nsproxy.c
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (C) 2006 IBM Corporation
+ *
+ *  Author: Serge Hallyn <serue@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/nsproxy.h>
+
+static inline void get_nsproxy(struct nsproxy *ns)
+{
+	atomic_inc(&ns->count);
+}
+
+void get_task_namespaces(struct task_struct *tsk)
+{
+	struct nsproxy *ns = tsk->nsproxy;
+	if (ns) {
+		get_nsproxy(ns);
+	}
+}
+
+/*
+ * creates a copy of "orig" with refcount 1.
+ * This does not grab references to the contained namespaces,
+ * so that needs to be done by dup_namespaces.
+ */
+static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
+{
+	struct nsproxy *ns;
+
+	ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL);
+	if (ns) {
+		memcpy(ns, orig, sizeof(struct nsproxy));
+		atomic_set(&ns->count, 1);
+	}
+	return ns;
+}
+
+/*
+ * copies the nsproxy, setting refcount to 1, and grabbing a
+ * reference to all contained namespaces.  Called from
+ * sys_unshare()
+ */
+struct nsproxy *dup_namespaces(struct nsproxy *orig)
+{
+	struct nsproxy *ns = clone_namespaces(orig);
+
+	return ns;
+}
+
+/*
+ * called from clone.  This now handles copy for nsproxy and all
+ * namespaces therein.
+ */
+int copy_namespaces(int flags, struct task_struct *tsk)
+{
+	struct nsproxy *old_ns = tsk->nsproxy;
+
+	if (!old_ns)
+		return 0;
+
+	get_nsproxy(old_ns);
+
+	return 0;
+}
+
+void free_nsproxy(struct nsproxy *ns)
+{
+		kfree(ns);
+}
-- 
cgit v1.2.3


From 0437eb594e6e5e699248f865482e61034be846d0 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 2 Oct 2006 02:18:07 -0700
Subject: [PATCH] nsproxy: move init_nsproxy into kernel/nsproxy.c

Move the init_nsproxy definition out of arch/ into kernel/nsproxy.c.  This
avoids all arches having to be updated.  Compiles and boots on s390.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/nsproxy.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ad9508865473..a3612f82f187 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -12,6 +12,9 @@
 #include <linux/module.h>
 #include <linux/version.h>
 #include <linux/nsproxy.h>
+#include <linux/init_task.h>
+
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
 
 static inline void get_nsproxy(struct nsproxy *ns)
 {
-- 
cgit v1.2.3


From 1651e14e28a2d9f446018ef522882e0709a2ce4f Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 2 Oct 2006 02:18:08 -0700
Subject: [PATCH] namespaces: incorporate fs namespace into nsproxy

This moves the mount namespace into the nsproxy.  The mount namespace count
now refers to the number of nsproxies point to it, rather than the number of
tasks.  As a result, the unshare_namespace() function in kernel/fork.c no
longer checks whether it is being shared.

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c    |  4 ----
 kernel/fork.c    | 17 ++++++-----------
 kernel/nsproxy.c | 32 +++++++++++++++++++++++++++++++-
 3 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 1d0e9ea1fa05..741bbe42dfe8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -399,11 +399,8 @@ void daemonize(const char *name, ...)
 	current->fs = fs;
 	atomic_inc(&fs->count);
 
-	exit_namespace(current);
 	exit_task_namespaces(current);
-	current->namespace = init_task.namespace;
 	current->nsproxy = init_task.nsproxy;
-	get_namespace(current->namespace);
 	get_task_namespaces(current);
 
  	exit_files(current);
@@ -923,7 +920,6 @@ fastcall NORET_TYPE void do_exit(long code)
 	exit_sem(tsk);
 	__exit_files(tsk);
 	__exit_fs(tsk);
-	exit_namespace(tsk);
 	exit_task_namespaces(tsk);
 	exit_thread();
 	cpuset_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index c9e660ae47aa..33fcf0733ca6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1119,11 +1119,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_mm;
 	if ((retval = copy_namespaces(clone_flags, p)))
 		goto bad_fork_cleanup_keys;
-	if ((retval = copy_namespace(clone_flags, p)))
-		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
-		goto bad_fork_cleanup_namespace;
+		goto bad_fork_cleanup_namespaces;
 
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
@@ -1215,7 +1213,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_cleanup_namespace;
+		goto bad_fork_cleanup_namespaces;
 	}
 
 	if (clone_flags & CLONE_THREAD) {
@@ -1263,8 +1261,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	proc_fork_connector(p);
 	return p;
 
-bad_fork_cleanup_namespace:
-	exit_namespace(p);
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_keys:
@@ -1519,10 +1515,9 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
  */
 static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
 {
-	struct namespace *ns = current->namespace;
+	struct namespace *ns = current->nsproxy->namespace;
 
-	if ((unshare_flags & CLONE_NEWNS) &&
-	    (ns && atomic_read(&ns->count) > 1)) {
+	if ((unshare_flags & CLONE_NEWNS) && ns) {
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
@@ -1655,8 +1650,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		}
 
 		if (new_ns) {
-			ns = current->namespace;
-			current->namespace = new_ns;
+			ns = current->nsproxy->namespace;
+			current->nsproxy->namespace = new_ns;
 			new_ns = ns;
 		}
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a3612f82f187..e10385c17f73 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -13,6 +13,7 @@
 #include <linux/version.h>
 #include <linux/nsproxy.h>
 #include <linux/init_task.h>
+#include <linux/namespace.h>
 
 struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
 
@@ -55,6 +56,11 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
 {
 	struct nsproxy *ns = clone_namespaces(orig);
 
+	if (ns) {
+		if (ns->namespace)
+			get_namespace(ns->namespace);
+	}
+
 	return ns;
 }
 
@@ -65,16 +71,40 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
 int copy_namespaces(int flags, struct task_struct *tsk)
 {
 	struct nsproxy *old_ns = tsk->nsproxy;
+	struct nsproxy *new_ns;
+	int err = 0;
 
 	if (!old_ns)
 		return 0;
 
 	get_nsproxy(old_ns);
 
-	return 0;
+	if (!(flags & CLONE_NEWNS))
+		return 0;
+
+	new_ns = clone_namespaces(old_ns);
+	if (!new_ns) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	tsk->nsproxy = new_ns;
+
+	err = copy_namespace(flags, tsk);
+	if (err) {
+		tsk->nsproxy = old_ns;
+		put_nsproxy(new_ns);
+		goto out;
+	}
+
+out:
+	put_nsproxy(old_ns);
+	return err;
 }
 
 void free_nsproxy(struct nsproxy *ns)
 {
+		if (ns->namespace)
+			put_namespace(ns->namespace);
 		kfree(ns);
 }
-- 
cgit v1.2.3


From fab413a334a7b3dd2688c5cd5d4718476e430ea4 Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Mon, 2 Oct 2006 02:18:09 -0700
Subject: [PATCH] namespaces: exit_task_namespaces() invalidates nsproxy

exit_task_namespaces() has replaced the former exit_namespace().  It
invalidates task->nsproxy and associated namespaces.  This is an issue for
the (futur) pid namespace which is required to be valid in exit_notify().

This patch moves exit_task_namespaces() after exit_notify() to keep nsproxy
valid.

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 741bbe42dfe8..f250a5e3e281 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -920,7 +920,6 @@ fastcall NORET_TYPE void do_exit(long code)
 	exit_sem(tsk);
 	__exit_files(tsk);
 	__exit_fs(tsk);
-	exit_task_namespaces(tsk);
 	exit_thread();
 	cpuset_exit(tsk);
 	exit_keys(tsk);
@@ -935,6 +934,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	tsk->exit_code = code;
 	proc_exit_connector(tsk);
 	exit_notify(tsk);
+	exit_task_namespaces(tsk);
 #ifdef CONFIG_NUMA
 	mpol_free(tsk->mempolicy);
 	tsk->mempolicy = NULL;
-- 
cgit v1.2.3


From e9ff3990f08e9a0c2839cc22808b01732ea5b3e4 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 2 Oct 2006 02:18:11 -0700
Subject: [PATCH] namespaces: utsname: switch to using uts namespaces

Replace references to system_utsname to the per-process uts namespace
where appropriate.  This includes things like uname.

Changes: Per Eric Biederman's comments, use the per-process uts namespace
	for ELF_PLATFORM, sunrpc, and parts of net/ipv4/ipconfig.c

[jdike@addtoit.com: UML fix]
[clg@fr.ibm.com: cleanup]
[akpm@osdl.org: build fix]
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 398d57923f95..3a4776e8f16e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1655,7 +1655,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name)
 	int errno = 0;
 
 	down_read(&uts_sem);
-	if (copy_to_user(name,&system_utsname,sizeof *name))
+	if (copy_to_user(name, utsname(), sizeof *name))
 		errno = -EFAULT;
 	up_read(&uts_sem);
 	return errno;
@@ -1673,8 +1673,8 @@ asmlinkage long sys_sethostname(char __user *name, int len)
 	down_write(&uts_sem);
 	errno = -EFAULT;
 	if (!copy_from_user(tmp, name, len)) {
-		memcpy(system_utsname.nodename, tmp, len);
-		system_utsname.nodename[len] = 0;
+		memcpy(utsname()->nodename, tmp, len);
+		utsname()->nodename[len] = 0;
 		errno = 0;
 	}
 	up_write(&uts_sem);
@@ -1690,11 +1690,11 @@ asmlinkage long sys_gethostname(char __user *name, int len)
 	if (len < 0)
 		return -EINVAL;
 	down_read(&uts_sem);
-	i = 1 + strlen(system_utsname.nodename);
+	i = 1 + strlen(utsname()->nodename);
 	if (i > len)
 		i = len;
 	errno = 0;
-	if (copy_to_user(name, system_utsname.nodename, i))
+	if (copy_to_user(name, utsname()->nodename, i))
 		errno = -EFAULT;
 	up_read(&uts_sem);
 	return errno;
@@ -1719,8 +1719,8 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
 	down_write(&uts_sem);
 	errno = -EFAULT;
 	if (!copy_from_user(tmp, name, len)) {
-		memcpy(system_utsname.domainname, tmp, len);
-		system_utsname.domainname[len] = 0;
+		memcpy(utsname()->domainname, tmp, len);
+		utsname()->domainname[len] = 0;
 		errno = 0;
 	}
 	up_write(&uts_sem);
-- 
cgit v1.2.3


From 96b644bdec977b97a45133e5b4466ba47a7a5e65 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 2 Oct 2006 02:18:13 -0700
Subject: [PATCH] namespaces: utsname: use init_utsname when appropriate

In some places, particularly drivers and __init code, the init utsns is the
appropriate one to use.  This patch replaces those with a the init_utsname
helper.

Changes: Removed several uses of init_utsname().  Hope I picked all the
	right ones in net/ipv4/ipconfig.c.  These are now changed to
	utsname() (the per-process namespace utsname) in the previous
	patch (2/7)

[akpm@osdl.org: CIFS fix]
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c        |  6 +++---
 kernel/power/snapshot.c | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e596525669ed..4c0553461000 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -518,9 +518,9 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
 
 static void print_kernel_version(void)
 {
-	printk("%s %.*s\n", system_utsname.release,
-		(int)strcspn(system_utsname.version, " "),
-		system_utsname.version);
+	printk("%s %.*s\n", init_utsname()->release,
+		(int)strcspn(init_utsname()->version, " "),
+		init_utsname()->version);
 }
 
 /*
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1b84313cbab5..99f9b7d177d6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -906,7 +906,7 @@ static void init_header(struct swsusp_info *info)
 	memset(info, 0, sizeof(struct swsusp_info));
 	info->version_code = LINUX_VERSION_CODE;
 	info->num_physpages = num_physpages;
-	memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
+	memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
 	info->cpus = num_online_cpus();
 	info->image_pages = nr_copy_pages;
 	info->pages = nr_copy_pages + nr_meta_pages + 1;
@@ -1050,13 +1050,13 @@ static inline int check_header(struct swsusp_info *info)
 		reason = "kernel version";
 	if (info->num_physpages != num_physpages)
 		reason = "memory size";
-	if (strcmp(info->uts.sysname,system_utsname.sysname))
+	if (strcmp(info->uts.sysname,init_utsname()->sysname))
 		reason = "system type";
-	if (strcmp(info->uts.release,system_utsname.release))
+	if (strcmp(info->uts.release,init_utsname()->release))
 		reason = "kernel release";
-	if (strcmp(info->uts.version,system_utsname.version))
+	if (strcmp(info->uts.version,init_utsname()->version))
 		reason = "version";
-	if (strcmp(info->uts.machine,system_utsname.machine))
+	if (strcmp(info->uts.machine,init_utsname()->machine))
 		reason = "machine";
 	if (reason) {
 		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
-- 
cgit v1.2.3


From 4865ecf1315b450ab3317a745a6678c04d311e40 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 2 Oct 2006 02:18:14 -0700
Subject: [PATCH] namespaces: utsname: implement utsname namespaces

This patch defines the uts namespace and some manipulators.
Adds the uts namespace to task_struct, and initializes a
system-wide init namespace.

It leaves a #define for system_utsname so sysctl will compile.
This define will be removed in a separate patch.

[akpm@osdl.org: build fix, cleanup]
Signed-off-by: Serge Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/Makefile  |  1 +
 kernel/nsproxy.c | 14 ++++++++++++++
 kernel/utsname.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+)
 create mode 100644 kernel/utsname.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 6ec53009b866..d948ca12acf0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index e10385c17f73..47c19280c55b 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,6 +14,7 @@
 #include <linux/nsproxy.h>
 #include <linux/init_task.h>
 #include <linux/namespace.h>
+#include <linux/utsname.h>
 
 struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
 
@@ -59,6 +60,8 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
 	if (ns) {
 		if (ns->namespace)
 			get_namespace(ns->namespace);
+		if (ns->uts_ns)
+			get_uts_ns(ns->uts_ns);
 	}
 
 	return ns;
@@ -97,6 +100,15 @@ int copy_namespaces(int flags, struct task_struct *tsk)
 		goto out;
 	}
 
+	err = copy_utsname(flags, tsk);
+	if (err) {
+		if (new_ns->namespace)
+			put_namespace(new_ns->namespace);
+		tsk->nsproxy = old_ns;
+		put_nsproxy(new_ns);
+		goto out;
+	}
+
 out:
 	put_nsproxy(old_ns);
 	return err;
@@ -106,5 +118,7 @@ void free_nsproxy(struct nsproxy *ns)
 {
 		if (ns->namespace)
 			put_namespace(ns->namespace);
+		if (ns->uts_ns)
+			put_uts_ns(ns->uts_ns);
 		kfree(ns);
 }
diff --git a/kernel/utsname.c b/kernel/utsname.c
new file mode 100644
index 000000000000..1824384ecfa3
--- /dev/null
+++ b/kernel/utsname.c
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (C) 2004 IBM Corporation
+ *
+ *  Author: Serge Hallyn <serue@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/module.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+
+/*
+ * Copy task tsk's utsname namespace, or clone it if flags
+ * specifies CLONE_NEWUTS.  In latter case, changes to the
+ * utsname of this process won't be seen by parent, and vice
+ * versa.
+ */
+int copy_utsname(int flags, struct task_struct *tsk)
+{
+	struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
+	int err = 0;
+
+	if (!old_ns)
+		return 0;
+
+	get_uts_ns(old_ns);
+
+	return err;
+}
+
+void free_uts_ns(struct kref *kref)
+{
+	struct uts_namespace *ns;
+
+	ns = container_of(kref, struct uts_namespace, kref);
+	kfree(ns);
+}
-- 
cgit v1.2.3


From 8218c74c02a7bdb5db2e40a2100534bdeb83475b Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 2 Oct 2006 02:18:15 -0700
Subject: [PATCH] namespaces: utsname: sysctl

Sysctl uts patch.  This will need to be done another way, but since sysctl
itself needs to be container aware, 'the right thing' is a separate patchset.

[akpm@osdl.org: ia64 build fix]
[sam.vilain@catalyst.net.nz: cleanup]
[sam.vilain@catalyst.net.nz: add proc_do_utsns_string]
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 119 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8f3f00d5ee52..476ac2522f18 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -139,7 +139,7 @@ static int parse_table(int __user *, int, void __user *, size_t __user *,
 		void __user *, size_t, ctl_table *, void **);
 #endif
 
-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 
 static ctl_table root_table[];
@@ -229,51 +229,100 @@ static ctl_table root_table[] = {
 };
 
 static ctl_table kern_table[] = {
+#ifndef CONFIG_UTS_NS
 	{
 		.ctl_name	= KERN_OSTYPE,
 		.procname	= "ostype",
-		.data		= system_utsname.sysname,
-		.maxlen		= sizeof(system_utsname.sysname),
+		.data		= init_uts_ns.name.sysname,
+		.maxlen		= sizeof(init_uts_ns.name.sysname),
 		.mode		= 0444,
-		.proc_handler	= &proc_doutsstring,
+		.proc_handler	= &proc_do_uts_string,
 		.strategy	= &sysctl_string,
 	},
 	{
 		.ctl_name	= KERN_OSRELEASE,
 		.procname	= "osrelease",
-		.data		= system_utsname.release,
-		.maxlen		= sizeof(system_utsname.release),
+		.data		= init_uts_ns.name.release,
+		.maxlen		= sizeof(init_uts_ns.name.release),
 		.mode		= 0444,
-		.proc_handler	= &proc_doutsstring,
+		.proc_handler	= &proc_do_uts_string,
 		.strategy	= &sysctl_string,
 	},
 	{
 		.ctl_name	= KERN_VERSION,
 		.procname	= "version",
-		.data		= system_utsname.version,
-		.maxlen		= sizeof(system_utsname.version),
+		.data		= init_uts_ns.name.version,
+		.maxlen		= sizeof(init_uts_ns.name.version),
 		.mode		= 0444,
-		.proc_handler	= &proc_doutsstring,
+		.proc_handler	= &proc_do_uts_string,
 		.strategy	= &sysctl_string,
 	},
 	{
 		.ctl_name	= KERN_NODENAME,
 		.procname	= "hostname",
-		.data		= system_utsname.nodename,
-		.maxlen		= sizeof(system_utsname.nodename),
+		.data		= init_uts_ns.name.nodename,
+		.maxlen		= sizeof(init_uts_ns.name.nodename),
 		.mode		= 0644,
-		.proc_handler	= &proc_doutsstring,
+		.proc_handler	= &proc_do_uts_string,
 		.strategy	= &sysctl_string,
 	},
 	{
 		.ctl_name	= KERN_DOMAINNAME,
 		.procname	= "domainname",
-		.data		= system_utsname.domainname,
-		.maxlen		= sizeof(system_utsname.domainname),
+		.data		= init_uts_ns.name.domainname,
+		.maxlen		= sizeof(init_uts_ns.name.domainname),
 		.mode		= 0644,
-		.proc_handler	= &proc_doutsstring,
+		.proc_handler	= &proc_do_uts_string,
 		.strategy	= &sysctl_string,
 	},
+#else  /* !CONFIG_UTS_NS */
+	{
+		.ctl_name	= KERN_OSTYPE,
+		.procname	= "ostype",
+		.data		= NULL,
+		/* could maybe use __NEW_UTS_LEN here? */
+		.maxlen		= FIELD_SIZEOF(struct new_utsname, sysname),
+		.mode		= 0444,
+		.proc_handler	= &proc_do_uts_string,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_OSRELEASE,
+		.procname	= "osrelease",
+		.data		= NULL,
+		.maxlen		= FIELD_SIZEOF(struct new_utsname, release),
+		.mode		= 0444,
+		.proc_handler	= &proc_do_uts_string,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_VERSION,
+		.procname	= "version",
+		.data		= NULL,
+		.maxlen		= FIELD_SIZEOF(struct new_utsname, version),
+		.mode		= 0444,
+		.proc_handler	= &proc_do_uts_string,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_NODENAME,
+		.procname	= "hostname",
+		.data		= NULL,
+		.maxlen		= FIELD_SIZEOF(struct new_utsname, nodename),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_uts_string,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_DOMAINNAME,
+		.procname	= "domainname",
+		.data		= NULL,
+		.maxlen		= FIELD_SIZEOF(struct new_utsname, domainname),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_uts_string,
+		.strategy	= &sysctl_string,
+	},
+#endif /* !CONFIG_UTS_NS */
 	{
 		.ctl_name	= KERN_PANIC,
 		.procname	= "panic",
@@ -1704,7 +1753,8 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
  *	to observe. Should this be in kernel/sys.c ????
  */
  
-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+#ifndef CONFIG_UTS_NS
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;
@@ -1720,6 +1770,48 @@ static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
 	}
 	return r;
 }
+#else /* !CONFIG_UTS_NS */
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int r;
+	struct uts_namespace* uts_ns = current->nsproxy->uts_ns;
+	char* which;
+
+	switch (table->ctl_name) {
+	case KERN_OSTYPE:
+		which = uts_ns->name.sysname;
+		break;
+	case KERN_NODENAME:
+		which = uts_ns->name.nodename;
+		break;
+	case KERN_OSRELEASE:
+		which = uts_ns->name.release;
+		break;
+	case KERN_VERSION:
+		which = uts_ns->name.version;
+		break;
+	case KERN_DOMAINNAME:
+		which = uts_ns->name.domainname;
+		break;
+	default:
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (!write) {
+		down_read(&uts_sem);
+		r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
+		up_read(&uts_sem);
+	} else {
+		down_write(&uts_sem);
+		r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
+		up_write(&uts_sem);
+	}
+ out:
+	return r;
+}
+#endif /* !CONFIG_UTS_NS */
 
 static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
 				 int *valp,
@@ -2283,12 +2375,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
 	return -ENOSYS;
 }
 
-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
-			    void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
+#ifdef CONFIG_SYSVIPC
+static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+#endif
+
 int proc_dointvec(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-- 
cgit v1.2.3


From 071df104f808b8195c40643dcb4d060681742e29 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 2 Oct 2006 02:18:17 -0700
Subject: [PATCH] namespaces: utsname: implement CLONE_NEWUTS flag

Implement a CLONE_NEWUTS flag, and use it at clone and sys_unshare.

[clg@fr.ibm.com: IPC unshare fix]
[bunk@stusta.de: cleanup]
Signed-off-by: Serge Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c    | 20 +++++++++++++++++---
 kernel/nsproxy.c |  2 +-
 kernel/utsname.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 33fcf0733ca6..c08cf05dd59b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1607,13 +1607,14 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	struct files_struct *fd, *new_fd = NULL;
 	struct sem_undo_list *new_ulist = NULL;
 	struct nsproxy *new_nsproxy, *old_nsproxy;
+	struct uts_namespace *uts, *new_uts = NULL;
 
 	check_unshare_flags(&unshare_flags);
 
 	/* Return -EINVAL for all unsupported flags */
 	err = -EINVAL;
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
-				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
+				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|CLONE_NEWUTS))
 		goto bad_unshare_out;
 
 	if ((err = unshare_thread(unshare_flags)))
@@ -1630,14 +1631,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		goto bad_unshare_cleanup_vm;
 	if ((err = unshare_semundo(unshare_flags, &new_ulist)))
 		goto bad_unshare_cleanup_fd;
+	if ((err = unshare_utsname(unshare_flags, &new_uts)))
+		goto bad_unshare_cleanup_semundo;
 
-	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
+	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
+				new_uts) {
 
 		old_nsproxy = current->nsproxy;
 		new_nsproxy = dup_namespaces(old_nsproxy);
 		if (!new_nsproxy) {
 			err = -ENOMEM;
-			goto bad_unshare_cleanup_semundo;
+			goto bad_unshare_cleanup_uts;
 		}
 
 		task_lock(current);
@@ -1676,10 +1680,20 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 			new_fd = fd;
 		}
 
+		if (new_uts) {
+			uts = current->nsproxy->uts_ns;
+			current->nsproxy->uts_ns = new_uts;
+			new_uts = uts;
+		}
+
 		task_unlock(current);
 		put_nsproxy(old_nsproxy);
 	}
 
+bad_unshare_cleanup_uts:
+	if (new_uts)
+		put_uts_ns(new_uts);
+
 bad_unshare_cleanup_semundo:
 bad_unshare_cleanup_fd:
 	if (new_fd)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 47c19280c55b..8246813335cc 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -82,7 +82,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
 
 	get_nsproxy(old_ns);
 
-	if (!(flags & CLONE_NEWNS))
+	if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS)))
 		return 0;
 
 	new_ns = clone_namespaces(old_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 1824384ecfa3..c859164a6993 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,41 @@
 #include <linux/utsname.h>
 #include <linux/version.h>
 
+/*
+ * Clone a new ns copying an original utsname, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return NULL on error (failure to kmalloc), new ns otherwise
+ */
+static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
+{
+	struct uts_namespace *ns;
+
+	ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+	if (ns) {
+		memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+		kref_init(&ns->kref);
+	}
+	return ns;
+}
+
+/*
+ * unshare the current process' utsname namespace.
+ * called only in sys_unshare()
+ */
+int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
+{
+	if (unshare_flags & CLONE_NEWUTS) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		*new_uts = clone_uts_ns(current->nsproxy->uts_ns);
+		if (!*new_uts)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
 /*
  * Copy task tsk's utsname namespace, or clone it if flags
  * specifies CLONE_NEWUTS.  In latter case, changes to the
@@ -23,6 +58,7 @@
 int copy_utsname(int flags, struct task_struct *tsk)
 {
 	struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
+	struct uts_namespace *new_ns;
 	int err = 0;
 
 	if (!old_ns)
@@ -30,6 +66,23 @@ int copy_utsname(int flags, struct task_struct *tsk)
 
 	get_uts_ns(old_ns);
 
+	if (!(flags & CLONE_NEWUTS))
+		return 0;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	new_ns = clone_uts_ns(old_ns);
+	if (!new_ns) {
+		err = -ENOMEM;
+		goto out;
+	}
+	tsk->nsproxy->uts_ns = new_ns;
+
+out:
+	put_uts_ns(old_ns);
 	return err;
 }
 
-- 
cgit v1.2.3


From c0b2fc316599d6cd875b6b8cafa67f03b9512b4d Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serue@us.ibm.com>
Date: Mon, 2 Oct 2006 02:18:18 -0700
Subject: [PATCH] uts: copy nsproxy only when needed

The nsproxy was being copied in unshare() when anything was being unshared,
even if it was something not referenced from nsproxy.  This should end up
in some cases with far more memory usage than necessary.

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Andrey Savochkin <saw@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index c08cf05dd59b..208dd99f13bc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1606,7 +1606,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
 	struct files_struct *fd, *new_fd = NULL;
 	struct sem_undo_list *new_ulist = NULL;
-	struct nsproxy *new_nsproxy, *old_nsproxy;
+	struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
 	struct uts_namespace *uts, *new_uts = NULL;
 
 	check_unshare_flags(&unshare_flags);
@@ -1634,18 +1634,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	if ((err = unshare_utsname(unshare_flags, &new_uts)))
 		goto bad_unshare_cleanup_semundo;
 
-	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
-				new_uts) {
-
+	if (new_ns || new_uts) {
 		old_nsproxy = current->nsproxy;
 		new_nsproxy = dup_namespaces(old_nsproxy);
 		if (!new_nsproxy) {
 			err = -ENOMEM;
 			goto bad_unshare_cleanup_uts;
 		}
+	}
+
+	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
+				new_uts) {
 
 		task_lock(current);
-		current->nsproxy = new_nsproxy;
+
+		if (new_nsproxy) {
+			current->nsproxy = new_nsproxy;
+			new_nsproxy = old_nsproxy;
+		}
 
 		if (new_fs) {
 			fs = current->fs;
@@ -1687,9 +1693,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		}
 
 		task_unlock(current);
-		put_nsproxy(old_nsproxy);
 	}
 
+	if (new_nsproxy)
+		put_nsproxy(new_nsproxy);
+
 bad_unshare_cleanup_uts:
 	if (new_uts)
 		put_uts_ns(new_uts);
-- 
cgit v1.2.3


From 25b21cb2f6d69b0475b134e0a3e8e269137270fa Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@openvz.org>
Date: Mon, 2 Oct 2006 02:18:19 -0700
Subject: [PATCH] IPC namespace core

This patch set allows to unshare IPCs and have a private set of IPC objects
(sem, shm, msg) inside namespace.  Basically, it is another building block of
containers functionality.

This patch implements core IPC namespace changes:
- ipc_namespace structure
- new config option CONFIG_IPC_NS
- adds CLONE_NEWIPC flag
- unshare support

[clg@fr.ibm.com: small fix for unshare of ipc namespace]
[akpm@osdl.org: build fix]
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c    | 22 ++++++++++++++++++----
 kernel/nsproxy.c | 41 ++++++++++++++++++++++++++++-------------
 2 files changed, 46 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 208dd99f13bc..d6cc56558507 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1608,13 +1608,15 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	struct sem_undo_list *new_ulist = NULL;
 	struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
 	struct uts_namespace *uts, *new_uts = NULL;
+	struct ipc_namespace *ipc, *new_ipc = NULL;
 
 	check_unshare_flags(&unshare_flags);
 
 	/* Return -EINVAL for all unsupported flags */
 	err = -EINVAL;
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
-				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|CLONE_NEWUTS))
+				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+				CLONE_NEWUTS|CLONE_NEWIPC))
 		goto bad_unshare_out;
 
 	if ((err = unshare_thread(unshare_flags)))
@@ -1633,18 +1635,20 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		goto bad_unshare_cleanup_fd;
 	if ((err = unshare_utsname(unshare_flags, &new_uts)))
 		goto bad_unshare_cleanup_semundo;
+	if ((err = unshare_ipcs(unshare_flags, &new_ipc)))
+		goto bad_unshare_cleanup_uts;
 
-	if (new_ns || new_uts) {
+	if (new_ns || new_uts || new_ipc) {
 		old_nsproxy = current->nsproxy;
 		new_nsproxy = dup_namespaces(old_nsproxy);
 		if (!new_nsproxy) {
 			err = -ENOMEM;
-			goto bad_unshare_cleanup_uts;
+			goto bad_unshare_cleanup_ipc;
 		}
 	}
 
 	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
-				new_uts) {
+				new_uts || new_ipc) {
 
 		task_lock(current);
 
@@ -1692,12 +1696,22 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 			new_uts = uts;
 		}
 
+		if (new_ipc) {
+			ipc = current->nsproxy->ipc_ns;
+			current->nsproxy->ipc_ns = new_ipc;
+			new_ipc = ipc;
+		}
+
 		task_unlock(current);
 	}
 
 	if (new_nsproxy)
 		put_nsproxy(new_nsproxy);
 
+bad_unshare_cleanup_ipc:
+	if (new_ipc)
+		put_ipc_ns(new_ipc);
+
 bad_unshare_cleanup_uts:
 	if (new_uts)
 		put_uts_ns(new_uts);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8246813335cc..8d6c852dc51e 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -7,6 +7,10 @@
  *  modify it under the terms of the GNU General Public License as
  *  published by the Free Software Foundation, version 2 of the
  *  License.
+ *
+ *  Jun 2006 - namespaces support
+ *             OpenVZ, SWsoft Inc.
+ *             Pavel Emelianov <xemul@openvz.org>
  */
 
 #include <linux/module.h>
@@ -62,6 +66,8 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
 			get_namespace(ns->namespace);
 		if (ns->uts_ns)
 			get_uts_ns(ns->uts_ns);
+		if (ns->ipc_ns)
+			get_ipc_ns(ns->ipc_ns);
 	}
 
 	return ns;
@@ -82,7 +88,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
 
 	get_nsproxy(old_ns);
 
-	if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS)))
+	if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
 		return 0;
 
 	new_ns = clone_namespaces(old_ns);
@@ -94,24 +100,31 @@ int copy_namespaces(int flags, struct task_struct *tsk)
 	tsk->nsproxy = new_ns;
 
 	err = copy_namespace(flags, tsk);
-	if (err) {
-		tsk->nsproxy = old_ns;
-		put_nsproxy(new_ns);
-		goto out;
-	}
+	if (err)
+		goto out_ns;
 
 	err = copy_utsname(flags, tsk);
-	if (err) {
-		if (new_ns->namespace)
-			put_namespace(new_ns->namespace);
-		tsk->nsproxy = old_ns;
-		put_nsproxy(new_ns);
-		goto out;
-	}
+	if (err)
+		goto out_uts;
+
+	err = copy_ipcs(flags, tsk);
+	if (err)
+		goto out_ipc;
 
 out:
 	put_nsproxy(old_ns);
 	return err;
+
+out_ipc:
+	if (new_ns->uts_ns)
+		put_uts_ns(new_ns->uts_ns);
+out_uts:
+	if (new_ns->namespace)
+		put_namespace(new_ns->namespace);
+out_ns:
+	tsk->nsproxy = old_ns;
+	put_nsproxy(new_ns);
+	goto out;
 }
 
 void free_nsproxy(struct nsproxy *ns)
@@ -120,5 +133,7 @@ void free_nsproxy(struct nsproxy *ns)
 			put_namespace(ns->namespace);
 		if (ns->uts_ns)
 			put_uts_ns(ns->uts_ns);
+		if (ns->ipc_ns)
+			put_ipc_ns(ns->ipc_ns);
 		kfree(ns);
 }
-- 
cgit v1.2.3


From 73ea41302bab5e02c9e86ab15c509494a550f1db Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@openvz.org>
Date: Mon, 2 Oct 2006 02:18:20 -0700
Subject: [PATCH] IPC namespace - utils

This patch adds basic IPC namespace functionality to
IPC utils:
- init_ipc_ns
- copy/clone/unshare/free IPC ns
- /proc preparations

Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index d6cc56558507..7dc6140baac6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1589,6 +1589,16 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n
 	return 0;
 }
 
+#ifndef CONFIG_IPC_NS
+static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns)
+{
+	if (flags & CLONE_NEWIPC)
+		return -EINVAL;
+
+	return 0;
+}
+#endif
+
 /*
  * unshare allows a process to 'unshare' part of the process
  * context which was originally shared using clone.  copy_*
-- 
cgit v1.2.3


From fcfbd547b1209aae9d880fe5db33464413925cc8 Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@openvz.org>
Date: Mon, 2 Oct 2006 02:18:23 -0700
Subject: [PATCH] IPC namespace - sysctls

Sysctl tweaks for IPC namespace

Signed-off-by: Pavel Emelianiov <xemul@openvz.org>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 116 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 88 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 476ac2522f18..a79ccf9d113b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -92,13 +92,8 @@ extern char modprobe_path[];
 extern int sg_big_buff;
 #endif
 #ifdef CONFIG_SYSVIPC
-extern size_t shm_ctlmax;
-extern size_t shm_ctlall;
-extern int shm_ctlmni;
-extern int msg_ctlmax;
-extern int msg_ctlmnb;
-extern int msg_ctlmni;
-extern int sem_ctls[];
+static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
 #ifdef __sparc__
@@ -481,58 +476,58 @@ static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_SHMMAX,
 		.procname	= "shmmax",
-		.data		= &shm_ctlmax,
+		.data		= NULL,
 		.maxlen		= sizeof (size_t),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= &proc_do_ipc_string,
 	},
 	{
 		.ctl_name	= KERN_SHMALL,
 		.procname	= "shmall",
-		.data		= &shm_ctlall,
+		.data		= NULL,
 		.maxlen		= sizeof (size_t),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= &proc_do_ipc_string,
 	},
 	{
 		.ctl_name	= KERN_SHMMNI,
 		.procname	= "shmmni",
-		.data		= &shm_ctlmni,
+		.data		= NULL,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_do_ipc_string,
 	},
 	{
 		.ctl_name	= KERN_MSGMAX,
 		.procname	= "msgmax",
-		.data		= &msg_ctlmax,
+		.data		= NULL,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_do_ipc_string,
 	},
 	{
 		.ctl_name	= KERN_MSGMNI,
 		.procname	= "msgmni",
-		.data		= &msg_ctlmni,
+		.data		= NULL,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_do_ipc_string,
 	},
 	{
 		.ctl_name	= KERN_MSGMNB,
 		.procname	=  "msgmnb",
-		.data		= &msg_ctlmnb,
+		.data		= NULL,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_do_ipc_string,
 	},
 	{
 		.ctl_name	= KERN_SEM,
 		.procname	= "sem",
-		.data		= &sem_ctls,
+		.data		= NULL,
 		.maxlen		= 4*sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_do_ipc_string,
 	},
 #endif
 #ifdef CONFIG_MAGIC_SYSRQ
@@ -1832,8 +1827,9 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
 	return 0;
 }
 
-static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
-		  void __user *buffer, size_t *lenp, loff_t *ppos,
+static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
+		  int write, struct file *filp, void __user *buffer,
+		  size_t *lenp, loff_t *ppos,
 		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
@@ -1846,13 +1842,13 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
 	char buf[TMPBUFLEN], *p;
 	char __user *s = buffer;
 	
-	if (!table->data || !table->maxlen || !*lenp ||
+	if (!tbl_data || !table->maxlen || !*lenp ||
 	    (*ppos && !write)) {
 		*lenp = 0;
 		return 0;
 	}
 	
-	i = (int *) table->data;
+	i = (int *) tbl_data;
 	vleft = table->maxlen / sizeof(*i);
 	left = *lenp;
 
@@ -1941,6 +1937,16 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
 #undef TMPBUFLEN
 }
 
+static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos,
+		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+			      int write, void *data),
+		  void *data)
+{
+	return __do_proc_dointvec(table->data, table, write, filp,
+			buffer, lenp, ppos, conv, data);
+}
+
 /**
  * proc_dointvec - read a vector of integers
  * @table: the sysctl table
@@ -2074,7 +2080,7 @@ int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
 				do_proc_dointvec_minmax_conv, &param);
 }
 
-static int do_proc_doulongvec_minmax(ctl_table *table, int write,
+static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
 				     struct file *filp,
 				     void __user *buffer,
 				     size_t *lenp, loff_t *ppos,
@@ -2088,13 +2094,13 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write,
 	char buf[TMPBUFLEN], *p;
 	char __user *s = buffer;
 	
-	if (!table->data || !table->maxlen || !*lenp ||
+	if (!data || !table->maxlen || !*lenp ||
 	    (*ppos && !write)) {
 		*lenp = 0;
 		return 0;
 	}
 	
-	i = (unsigned long *) table->data;
+	i = (unsigned long *) data;
 	min = (unsigned long *) table->extra1;
 	max = (unsigned long *) table->extra2;
 	vleft = table->maxlen / sizeof(unsigned long);
@@ -2179,6 +2185,17 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write,
 #undef TMPBUFLEN
 }
 
+static int do_proc_doulongvec_minmax(ctl_table *table, int write,
+				     struct file *filp,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos,
+				     unsigned long convmul,
+				     unsigned long convdiv)
+{
+	return __do_proc_doulongvec_minmax(table->data, table, write,
+			filp, buffer, lenp, ppos, convmul, convdiv);
+}
+
 /**
  * proc_doulongvec_minmax - read a vector of long integers with min/max values
  * @table: the sysctl table
@@ -2367,6 +2384,49 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
 				do_proc_dointvec_ms_jiffies_conv, NULL);
 }
 
+#ifdef CONFIG_SYSVIPC
+static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	void *data;
+	struct ipc_namespace *ns;
+
+	ns = current->nsproxy->ipc_ns;
+
+	switch (table->ctl_name) {
+	case KERN_SHMMAX:
+		data = &ns->shm_ctlmax;
+		goto proc_minmax;
+	case KERN_SHMALL:
+		data = &ns->shm_ctlall;
+		goto proc_minmax;
+	case KERN_SHMMNI:
+		data = &ns->shm_ctlmni;
+		break;
+	case KERN_MSGMAX:
+		data = &ns->msg_ctlmax;
+		break;
+	case KERN_MSGMNI:
+		data = &ns->msg_ctlmni;
+		break;
+	case KERN_MSGMNB:
+		data = &ns->msg_ctlmnb;
+		break;
+	case KERN_SEM:
+		data = &ns->sem_ctls;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return __do_proc_dointvec(data, table, write, filp, buffer,
+			lenp, ppos, NULL, NULL);
+proc_minmax:
+	return __do_proc_doulongvec_minmax(data, table, write, filp, buffer,
+			lenp, ppos, 1l, 1l);
+}
+#endif
+
 #else /* CONFIG_PROC_FS */
 
 int proc_dostring(ctl_table *table, int write, struct file *filp,
-- 
cgit v1.2.3


From 5d124e99c2fee1c8f3020ecb0dff8d5617ee7991 Mon Sep 17 00:00:00 2001
From: Pavel <xemul@openvz.org>
Date: Mon, 2 Oct 2006 02:18:24 -0700
Subject: [PATCH] nsproxy cloning error path fix

This patch fixes copy_namespaces()'s error path.

when new nsproxy (new_ns) is created pointers to namespaces (ipc, uts) are
copied from the old nsproxy.  Later in copy_utsname, copy_ipcs, etc.
according namespaces are get-ed.  On error path needed namespaces are
put-ed, so there's no need to put new nsproxy itelf as it woud cause
putting namespaces for the second time.

Found when incorporating namespaces into OpenVZ kernel.

Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/nsproxy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8d6c852dc51e..6ebdb82a0ce4 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -123,7 +123,7 @@ out_uts:
 		put_namespace(new_ns->namespace);
 out_ns:
 	tsk->nsproxy = old_ns;
-	put_nsproxy(new_ns);
+	kfree(new_ns);
 	goto out;
 }
 
-- 
cgit v1.2.3


From 6760856791c6e527da678021ee6a67896549d4da Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 2 Oct 2006 02:18:26 -0700
Subject: [PATCH] introduce kernel_execve

The use of execve() in the kernel is dubious, since it relies on the
__KERNEL_SYSCALLS__ mechanism that stores the result in a global errno
variable.  As a first step of getting rid of this, change all users to a
global kernel_execve function that returns a proper error code.

This function is a terrible hack, and a later patch removes it again after the
kernel syscalls are gone.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ian Molton <spyro@f2s.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Hirokazu Takata <takata.hirokazu@renesas.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
Cc: Richard Curnow <rc@rc0.org.uk>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Cc: Chris Zankel <chris@zankel.net>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kmod.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index f8121b95183f..bb4e29d924e4 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -18,8 +18,6 @@
 	call_usermodehelper wait flag, and remove exec_usermodehelper.
 	Rusty Russell <rusty@rustcorp.com.au>  Jan 2003
 */
-#define __KERNEL_SYSCALLS__
-
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/syscalls.h>
@@ -169,7 +167,8 @@ static int ____call_usermodehelper(void *data)
 
 	retval = -EPERM;
 	if (current->fs->root)
-		retval = execve(sub_info->path, sub_info->argv, sub_info->envp);
+		retval = kernel_execve(sub_info->path,
+				sub_info->argv, sub_info->envp);
 
 	/* Exec failed? */
 	sub_info->retval = retval;
-- 
cgit v1.2.3


From 1a657f78dcc8ea7c53eaa1f2a45ea2315738c15f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 2 Oct 2006 02:18:59 -0700
Subject: [PATCH] introduce get_task_pid() to fix unsafe get_pid()

proc_pid_make_inode:

	ei->pid = get_pid(task_pid(task));

I think this is not safe.  get_pid() can be preempted after checking "pid
!= NULL".  Then the task exits, does detach_pid(), and RCU frees the pid.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index e4779bbb2058..b914392085f9 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -304,6 +304,15 @@ struct task_struct *find_task_by_pid_type(int type, int nr)
 
 EXPORT_SYMBOL(find_task_by_pid_type);
 
+struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
+{
+	struct pid *pid;
+	rcu_read_lock();
+	pid = get_pid(task->pids[type].pid);
+	rcu_read_unlock();
+	return pid;
+}
+
 struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
 {
 	struct task_struct *result;
-- 
cgit v1.2.3


From 9ec52099e4b8678a60e9f93e41ad87885d64f3e6 Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Mon, 2 Oct 2006 02:19:00 -0700
Subject: [PATCH] replace cad_pid by a struct pid

There are a few places in the kernel where the init task is signaled.  The
ctrl+alt+del sequence is one them.  It kills a task, usually init, using a
cached pid (cad_pid).

This patch replaces the pid_t by a struct pid to avoid pid wrap around
problem.  The struct pid is initialized at boot time in init() and can be
modified through systctl with

	/proc/sys/kernel/cad_pid

[ I haven't found any distro using it ? ]

It also introduces a small helper routine kill_cad_pid() which is used
where it seemed ok to use cad_pid instead of pid 1.

[akpm@osdl.org: cleanups, build fix]
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c    |  6 +++---
 kernel/sysctl.c | 30 +++++++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 3a4776e8f16e..2314867ae34f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -92,7 +92,8 @@ EXPORT_SYMBOL(fs_overflowgid);
  */
 
 int C_A_D = 1;
-int cad_pid = 1;
+struct pid *cad_pid;
+EXPORT_SYMBOL(cad_pid);
 
 /*
  *	Notifier list for kernel code which wants to be called
@@ -773,10 +774,9 @@ void ctrl_alt_del(void)
 	if (C_A_D)
 		schedule_work(&cad_work);
 	else
-		kill_proc(cad_pid, SIGINT, 1);
+		kill_cad_pid(SIGINT, 1);
 }
 	
-
 /*
  * Unprivileged users may change the real gid to the effective gid
  * or vice versa.  (BSD-style)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a79ccf9d113b..8020fb273c4f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -68,7 +68,6 @@ extern int sysrq_enabled;
 extern int core_uses_pid;
 extern int suid_dumpable;
 extern char core_pattern[];
-extern int cad_pid;
 extern int pid_max;
 extern int min_free_kbytes;
 extern int printk_ratelimit_jiffies;
@@ -137,6 +136,9 @@ static int parse_table(int __user *, int, void __user *, size_t __user *,
 static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 
+static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos);
+
 static ctl_table root_table[];
 static struct ctl_table_header root_table_header =
 	{ root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) };
@@ -543,10 +545,10 @@ static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_CADPID,
 		.procname	= "cad_pid",
-		.data		= &cad_pid,
+		.data		= NULL,
 		.maxlen		= sizeof (int),
 		.mode		= 0600,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_do_cad_pid,
 	},
 	{
 		.ctl_name	= KERN_MAX_THREADS,
@@ -2427,6 +2429,28 @@ proc_minmax:
 }
 #endif
 
+static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct pid *new_pid;
+	pid_t tmp;
+	int r;
+
+	tmp = pid_nr(cad_pid);
+
+	r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
+			       lenp, ppos, NULL, NULL);
+	if (r || !write)
+		return r;
+
+	new_pid = find_get_pid(tmp);
+	if (!new_pid)
+		return -ESRCH;
+
+	put_pid(xchg(&cad_pid, new_pid));
+	return 0;
+}
+
 #else /* CONFIG_PROC_FS */
 
 int proc_dostring(ctl_table *table, int write, struct file *filp,
-- 
cgit v1.2.3


From 3f2e05e90e0846c42626e3d272454f26be34a1bc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 2 Oct 2006 14:12:31 +0100
Subject: [PATCH] BLOCK: Revert patch to hack around undeclared sigset_t in
 linux/compat.h

Revert Andrew Morton's patch to temporarily hack around the lack of a
declaration of sigset_t in linux/compat.h to make the block-disablement
patches build on IA64.  This got accidentally pushed to Linus and should
be fixed in a different manner.

Also make linux/compat.h #include asm/signal.h to gain a definition of
sigset_t so that it can externally declare sigset_from_compat().

This has been compile-tested for i386, x86_64, ia64, mips, mips64, frv, ppc and
ppc64 and run-tested on frv.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/compat.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index b4fbd838cd77..75573e5d27b0 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -26,8 +26,6 @@
 
 #include <asm/uaccess.h>
 
-extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
-
 int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
 {
 	return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
-- 
cgit v1.2.3


From ffc5089196446c08d9a005cf0dd7cab18d119606 Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <vagabon.xyz@gmail.com>
Date: Tue, 3 Oct 2006 01:13:48 -0700
Subject: [PATCH] Create kallsyms_lookup_size_offset()

Some uses of kallsyms_lookup() do not need to find out the name of a symbol
and its module's name it belongs.  This is specially true in arch specific
code, which needs to unwind the stack to show the back trace during oops
(mips is an example).  In this specific case, we just need to retreive the
function's size and the offset of the active intruction inside it.

Adds a new entry "kallsyms_lookup_size_offset()" This new entry does
exactly the same as kallsyms_lookup() but does not require any buffers to
store any names.

It returns 0 if it fails otherwise 1.

Signed-off-by: Franck Bui-Huu <vagabon.xyz@gmail.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kallsyms.c | 123 +++++++++++++++++++++++++++++++++++-------------------
 kernel/module.c   |   3 +-
 2 files changed, 82 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 342bca62c496..eeac3e313b2b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -69,6 +69,15 @@ static inline int is_kernel(unsigned long addr)
 	return in_gate_area_no_task(addr);
 }
 
+static int is_ksym_addr(unsigned long addr)
+{
+	if (all_var)
+		return is_kernel(addr);
+
+	return is_kernel_text(addr) || is_kernel_inittext(addr) ||
+		is_kernel_extratext(addr);
+}
+
 /* expand a compressed symbol data into the resulting uncompressed string,
    given the offset to where the symbol is in the compressed stream */
 static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
@@ -155,6 +164,73 @@ unsigned long kallsyms_lookup_name(const char *name)
 	return module_kallsyms_lookup_name(name);
 }
 
+static unsigned long get_symbol_pos(unsigned long addr,
+				    unsigned long *symbolsize,
+				    unsigned long *offset)
+{
+	unsigned long symbol_start = 0, symbol_end = 0;
+	unsigned long i, low, high, mid;
+
+	/* This kernel should never had been booted. */
+	BUG_ON(!kallsyms_addresses);
+
+	/* do a binary search on the sorted kallsyms_addresses array */
+	low = 0;
+	high = kallsyms_num_syms;
+
+	while (high - low > 1) {
+		mid = (low + high) / 2;
+		if (kallsyms_addresses[mid] <= addr)
+			low = mid;
+		else
+			high = mid;
+	}
+
+	/*
+	 * search for the first aliased symbol. Aliased
+	 * symbols are symbols with the same address
+	 */
+	while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
+		--low;
+
+	symbol_start = kallsyms_addresses[low];
+
+	/* Search for next non-aliased symbol */
+	for (i = low + 1; i < kallsyms_num_syms; i++) {
+		if (kallsyms_addresses[i] > symbol_start) {
+			symbol_end = kallsyms_addresses[i];
+			break;
+		}
+	}
+
+	/* if we found no next symbol, we use the end of the section */
+	if (!symbol_end) {
+		if (is_kernel_inittext(addr))
+			symbol_end = (unsigned long)_einittext;
+		else if (all_var)
+			symbol_end = (unsigned long)_end;
+		else
+			symbol_end = (unsigned long)_etext;
+	}
+
+	*symbolsize = symbol_end - symbol_start;
+	*offset = addr - symbol_start;
+
+	return low;
+}
+
+/*
+ * Lookup an address but don't bother to find any names.
+ */
+int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
+				unsigned long *offset)
+{
+	if (is_ksym_addr(addr))
+		return !!get_symbol_pos(addr, symbolsize, offset);
+
+	return !!module_address_lookup(addr, symbolsize, offset, NULL);
+}
+
 /*
  * Lookup an address
  * - modname is set to NULL if it's in the kernel
@@ -167,57 +243,18 @@ const char *kallsyms_lookup(unsigned long addr,
 			    unsigned long *offset,
 			    char **modname, char *namebuf)
 {
-	unsigned long i, low, high, mid;
 	const char *msym;
 
-	/* This kernel should never had been booted. */
-	BUG_ON(!kallsyms_addresses);
-
 	namebuf[KSYM_NAME_LEN] = 0;
 	namebuf[0] = 0;
 
-	if ((all_var && is_kernel(addr)) ||
-	    (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) ||
-				is_kernel_extratext(addr)))) {
-		unsigned long symbol_end = 0;
-
-		/* do a binary search on the sorted kallsyms_addresses array */
-		low = 0;
-		high = kallsyms_num_syms;
-
-		while (high-low > 1) {
-			mid = (low + high) / 2;
-			if (kallsyms_addresses[mid] <= addr) low = mid;
-			else high = mid;
-		}
-
-		/* search for the first aliased symbol. Aliased symbols are
-		   symbols with the same address */
-		while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low])
-			--low;
+	if (is_ksym_addr(addr)) {
+		unsigned long pos;
 
+		pos = get_symbol_pos(addr, symbolsize, offset);
 		/* Grab name */
-		kallsyms_expand_symbol(get_symbol_offset(low), namebuf);
-
-		/* Search for next non-aliased symbol */
-		for (i = low + 1; i < kallsyms_num_syms; i++) {
-			if (kallsyms_addresses[i] > kallsyms_addresses[low]) {
-				symbol_end = kallsyms_addresses[i];
-				break;
-			}
-		}
-
-		/* if we found no next symbol, we use the end of the section */
-		if (!symbol_end) {
-			if (is_kernel_inittext(addr))
-				symbol_end = (unsigned long)_einittext;
-			else
-				symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext;
-		}
-
-		*symbolsize = symbol_end - kallsyms_addresses[low];
+		kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
 		*modname = NULL;
-		*offset = addr - kallsyms_addresses[low];
 		return namebuf;
 	}
 
diff --git a/kernel/module.c b/kernel/module.c
index 7c77a0a9275c..7f60e782de1e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2040,7 +2040,8 @@ const char *module_address_lookup(unsigned long addr,
 	list_for_each_entry(mod, &modules, list) {
 		if (within(addr, mod->module_init, mod->init_size)
 		    || within(addr, mod->module_core, mod->core_size)) {
-			*modname = mod->name;
+			if (modname)
+				*modname = mod->name;
 			return get_ksymbol(mod, addr, size, offset);
 		}
 	}
-- 
cgit v1.2.3


From eed34d0fc5e4b89269053ed855ef714edbcf4518 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 3 Oct 2006 01:13:50 -0700
Subject: [PATCH] kernel-doc for kernel/dma.c

Add kernel-doc function headers in kernel/dma.c and use it in DocBook.

Clean up kernel-doc in mca_dma.h (the colon (':') represents a
section header).

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/dma.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma.c b/kernel/dma.c
index aef0a45b7893..2020644c938a 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -62,6 +62,11 @@ static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
 };
 
 
+/**
+ * request_dma - request and reserve a system DMA channel
+ * @dmanr: DMA channel number
+ * @device_id: reserving device ID string, used in /proc/dma
+ */
 int request_dma(unsigned int dmanr, const char * device_id)
 {
 	if (dmanr >= MAX_DMA_CHANNELS)
@@ -76,7 +81,10 @@ int request_dma(unsigned int dmanr, const char * device_id)
 	return 0;
 } /* request_dma */
 
-
+/**
+ * free_dma - free a reserved system DMA channel
+ * @dmanr: DMA channel number
+ */
 void free_dma(unsigned int dmanr)
 {
 	if (dmanr >= MAX_DMA_CHANNELS) {
-- 
cgit v1.2.3


From e1ca66d1b990b23e7753c729332c0ada61f4f38d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 3 Oct 2006 01:13:51 -0700
Subject: [PATCH] kernel-doc for kernel/resource.c

Add kernel-doc function headers in kernel/resource.c and use them in DocBook.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/resource.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 72 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 9db38a1a7520..6de60c12143e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -193,6 +193,13 @@ static int __release_resource(struct resource *old)
 	return -EINVAL;
 }
 
+/**
+ * request_resource - request and reserve an I/O or memory resource
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns 0 for success, negative error code on error.
+ */
 int request_resource(struct resource *root, struct resource *new)
 {
 	struct resource *conflict;
@@ -205,6 +212,15 @@ int request_resource(struct resource *root, struct resource *new)
 
 EXPORT_SYMBOL(request_resource);
 
+/**
+ * ____request_resource - reserve a resource, with resource conflict returned
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns:
+ * On success, NULL is returned.
+ * On error, a pointer to the conflicting resource is returned.
+ */
 struct resource *____request_resource(struct resource *root, struct resource *new)
 {
 	struct resource *conflict;
@@ -217,6 +233,10 @@ struct resource *____request_resource(struct resource *root, struct resource *ne
 
 EXPORT_SYMBOL(____request_resource);
 
+/**
+ * release_resource - release a previously reserved resource
+ * @old: resource pointer
+ */
 int release_resource(struct resource *old)
 {
 	int retval;
@@ -315,8 +335,16 @@ static int find_resource(struct resource *root, struct resource *new,
 	return -EBUSY;
 }
 
-/*
- * Allocate empty slot in the resource tree given range and alignment.
+/**
+ * allocate_resource - allocate empty slot in the resource tree given range & alignment
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ * @size: requested resource region size
+ * @min: minimum size to allocate
+ * @max: maximum size to allocate
+ * @align: alignment requested, in bytes
+ * @alignf: alignment function, optional, called if not NULL
+ * @alignf_data: arbitrary data to pass to the @alignf function
  */
 int allocate_resource(struct resource *root, struct resource *new,
 		      resource_size_t size, resource_size_t min,
@@ -407,10 +435,15 @@ int insert_resource(struct resource *parent, struct resource *new)
 	return result;
 }
 
-/*
+/**
+ * adjust_resource - modify a resource's start and size
+ * @res: resource to modify
+ * @start: new start value
+ * @size: new size
+ *
  * Given an existing resource, change its start and size to match the
- * arguments.  Returns -EBUSY if it can't fit.  Existing children of
- * the resource are assumed to be immutable.
+ * arguments.  Returns 0 on success, -EBUSY if it can't fit.
+ * Existing children of the resource are assumed to be immutable.
  */
 int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
 {
@@ -456,11 +489,19 @@ EXPORT_SYMBOL(adjust_resource);
  * Note how this, unlike the above, knows about
  * the IO flag meanings (busy etc).
  *
- * Request-region creates a new busy region.
+ * request_region creates a new busy region.
  *
- * Check-region returns non-zero if the area is already busy
+ * check_region returns non-zero if the area is already busy.
  *
- * Release-region releases a matching busy region.
+ * release_region releases a matching busy region.
+ */
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
  */
 struct resource * __request_region(struct resource *parent,
 				   resource_size_t start, resource_size_t n,
@@ -497,9 +538,23 @@ struct resource * __request_region(struct resource *parent,
 	}
 	return res;
 }
-
 EXPORT_SYMBOL(__request_region);
 
+/**
+ * __check_region - check if a resource region is busy or free
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * Returns 0 if the region is free at the moment it is checked,
+ * returns %-EBUSY if the region is busy.
+ *
+ * NOTE:
+ * This function is deprecated because its use is racy.
+ * Even if it returns 0, a subsequent call to request_region()
+ * may fail because another driver etc. just allocated the region.
+ * Do NOT use it.  It will be removed from the kernel.
+ */
 int __check_region(struct resource *parent, resource_size_t start,
 			resource_size_t n)
 {
@@ -513,9 +568,16 @@ int __check_region(struct resource *parent, resource_size_t start,
 	kfree(res);
 	return 0;
 }
-
 EXPORT_SYMBOL(__check_region);
 
+/**
+ * __release_region - release a previously reserved resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * The described resource region must match a currently busy region.
+ */
 void __release_region(struct resource *parent, resource_size_t start,
 			resource_size_t n)
 {
@@ -553,7 +615,6 @@ void __release_region(struct resource *parent, resource_size_t start,
 		"<%016llx-%016llx>\n", (unsigned long long)start,
 		(unsigned long long)end);
 }
-
 EXPORT_SYMBOL(__release_region);
 
 /*
-- 
cgit v1.2.3


From 5c1e176781f43bc902a51e5832f789756bff911b Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 3 Oct 2006 01:14:04 -0700
Subject: [PATCH] sched: force /sbin/init off isolated cpus

Force /sbin/init off isolated cpus (unless every CPU is specified as an
isolcpu).

Users seem to think that the isolated CPUs shouldn't have much running on
them to begin with.  That's fair enough: intuitive, I guess.  It also means
that the cpu affinity masks of tasks will not include isolcpus by default,
which is also more intuitive, perhaps.

/sbin/init is spawned from the boot CPU's idle thread, and /sbin/init
starts the rest of userspace. So if the boot CPU is specified to be an
isolcpu, then prior to this patch, all of userspace will be run there.

(throw in a couple of plausible devinit -> cpuinit conversions I spotted
while we're here).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e4e54e86f4a2..ddf418810c39 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4817,7 +4817,7 @@ void show_state(void)
  * NOTE: this function does not set the idle thread's NEED_RESCHED
  * flag, to make booting more robust.
  */
-void __devinit init_idle(struct task_struct *idle, int cpu)
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
@@ -5461,7 +5461,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -6747,11 +6747,20 @@ static int update_sched_domains(struct notifier_block *nfb,
 
 void __init sched_init_smp(void)
 {
+	cpumask_t non_isolated_cpus;
+
 	lock_cpu_hotplug();
 	arch_init_sched_domains(&cpu_online_map);
+	cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
+	if (cpus_empty(non_isolated_cpus))
+		cpu_set(smp_processor_id(), non_isolated_cpus);
 	unlock_cpu_hotplug();
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
+
+	/* Move init over to a non-isolated CPU */
+	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
+		BUG();
 }
 #else
 void __init sched_init_smp(void)
-- 
cgit v1.2.3


From a616058b7815aafb2163fc795e02a055b0dbc5e2 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Tue, 3 Oct 2006 01:14:06 -0700
Subject: [PATCH] sched: remove unnecessary sched group allocations

Remove dynamic sched group allocations for MC and SMP domains.  These
allocations can easily fail on big systems(1024 or so CPUs) and we can live
with out these dynamic allocations.

[akpm@osdl.org: build fix]
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 104 +++++++++++++++++++++------------------------------------
 1 file changed, 38 insertions(+), 66 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ddf418810c39..6b956bd9b49a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5489,15 +5489,17 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
-				    int (*group_fn)(int cpu))
+static void
+init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+			const cpumask_t *cpu_map,
+			int (*group_fn)(int cpu, const cpumask_t *cpu_map))
 {
 	struct sched_group *first = NULL, *last = NULL;
 	cpumask_t covered = CPU_MASK_NONE;
 	int i;
 
 	for_each_cpu_mask(i, span) {
-		int group = group_fn(i);
+		int group = group_fn(i, cpu_map);
 		struct sched_group *sg = &groups[group];
 		int j;
 
@@ -5508,7 +5510,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 		sg->cpu_power = 0;
 
 		for_each_cpu_mask(j, span) {
-			if (group_fn(j) != group)
+			if (group_fn(j, cpu_map) != group)
 				continue;
 
 			cpu_set(j, covered);
@@ -6084,7 +6086,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
 
-static int cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
 {
 	return cpu;
 }
@@ -6095,31 +6097,36 @@ static int cpu_to_cpu_group(int cpu)
  */
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group *sched_group_core_bycpu[NR_CPUS];
+static struct sched_group sched_group_core[NR_CPUS];
 #endif
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
-static int cpu_to_core_group(int cpu)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
 {
-	return first_cpu(cpu_sibling_map[cpu]);
+	cpumask_t mask = cpu_sibling_map[cpu];
+	cpus_and(mask, mask, *cpu_map);
+	return first_cpu(mask);
 }
 #elif defined(CONFIG_SCHED_MC)
-static int cpu_to_core_group(int cpu)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
 {
 	return cpu;
 }
 #endif
 
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
 
-static int cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map)
 {
 #ifdef CONFIG_SCHED_MC
 	cpumask_t mask = cpu_coregroup_map(cpu);
+	cpus_and(mask, mask, *cpu_map);
 	return first_cpu(mask);
 #elif defined(CONFIG_SCHED_SMT)
-	return first_cpu(cpu_sibling_map[cpu]);
+	cpumask_t mask = cpu_sibling_map[cpu];
+	cpus_and(mask, mask, *cpu_map);
+	return first_cpu(mask);
 #else
 	return cpu;
 #endif
@@ -6137,7 +6144,7 @@ static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
 static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
 
-static int cpu_to_allnodes_group(int cpu)
+static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map)
 {
 	return cpu_to_node(cpu);
 }
@@ -6169,12 +6176,11 @@ next_sg:
 }
 #endif
 
+#ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
 static void free_sched_groups(const cpumask_t *cpu_map)
 {
-	int cpu;
-#ifdef CONFIG_NUMA
-	int i;
+	int cpu, i;
 
 	for_each_cpu_mask(cpu, *cpu_map) {
 		struct sched_group *sched_group_allnodes
@@ -6211,20 +6217,12 @@ next_sg:
 		kfree(sched_group_nodes);
 		sched_group_nodes_bycpu[cpu] = NULL;
 	}
-#endif
-	for_each_cpu_mask(cpu, *cpu_map) {
-		if (sched_group_phys_bycpu[cpu]) {
-			kfree(sched_group_phys_bycpu[cpu]);
-			sched_group_phys_bycpu[cpu] = NULL;
-		}
-#ifdef CONFIG_SCHED_MC
-		if (sched_group_core_bycpu[cpu]) {
-			kfree(sched_group_core_bycpu[cpu]);
-			sched_group_core_bycpu[cpu] = NULL;
-		}
-#endif
-	}
 }
+#else
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+}
+#endif
 
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
@@ -6233,10 +6231,6 @@ next_sg:
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
-	struct sched_group *sched_group_phys = NULL;
-#ifdef CONFIG_SCHED_MC
-	struct sched_group *sched_group_core = NULL;
-#endif
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
 	struct sched_group *sched_group_allnodes = NULL;
@@ -6282,7 +6276,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			sd = &per_cpu(allnodes_domains, i);
 			*sd = SD_ALLNODES_INIT;
 			sd->span = *cpu_map;
-			group = cpu_to_allnodes_group(i);
+			group = cpu_to_allnodes_group(i, cpu_map);
 			sd->groups = &sched_group_allnodes[group];
 			p = sd;
 		} else
@@ -6295,42 +6289,18 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
-		if (!sched_group_phys) {
-			sched_group_phys
-				= kmalloc(sizeof(struct sched_group) * NR_CPUS,
-					  GFP_KERNEL);
-			if (!sched_group_phys) {
-				printk (KERN_WARNING "Can not alloc phys sched"
-						     "group\n");
-				goto error;
-			}
-			sched_group_phys_bycpu[i] = sched_group_phys;
-		}
-
 		p = sd;
 		sd = &per_cpu(phys_domains, i);
-		group = cpu_to_phys_group(i);
+		group = cpu_to_phys_group(i, cpu_map);
 		*sd = SD_CPU_INIT;
 		sd->span = nodemask;
 		sd->parent = p;
 		sd->groups = &sched_group_phys[group];
 
 #ifdef CONFIG_SCHED_MC
-		if (!sched_group_core) {
-			sched_group_core
-				= kmalloc(sizeof(struct sched_group) * NR_CPUS,
-					  GFP_KERNEL);
-			if (!sched_group_core) {
-				printk (KERN_WARNING "Can not alloc core sched"
-						     "group\n");
-				goto error;
-			}
-			sched_group_core_bycpu[i] = sched_group_core;
-		}
-
 		p = sd;
 		sd = &per_cpu(core_domains, i);
-		group = cpu_to_core_group(i);
+		group = cpu_to_core_group(i, cpu_map);
 		*sd = SD_MC_INIT;
 		sd->span = cpu_coregroup_map(i);
 		cpus_and(sd->span, sd->span, *cpu_map);
@@ -6341,7 +6311,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
 		sd = &per_cpu(cpu_domains, i);
-		group = cpu_to_cpu_group(i);
+		group = cpu_to_cpu_group(i, cpu_map);
 		*sd = SD_SIBLING_INIT;
 		sd->span = cpu_sibling_map[i];
 		cpus_and(sd->span, sd->span, *cpu_map);
@@ -6359,7 +6329,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			continue;
 
 		init_sched_build_groups(sched_group_cpus, this_sibling_map,
-						&cpu_to_cpu_group);
+					cpu_map, &cpu_to_cpu_group);
 	}
 #endif
 
@@ -6371,7 +6341,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		if (i != first_cpu(this_core_map))
 			continue;
 		init_sched_build_groups(sched_group_core, this_core_map,
-					&cpu_to_core_group);
+					cpu_map, &cpu_to_core_group);
 	}
 #endif
 
@@ -6385,14 +6355,14 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			continue;
 
 		init_sched_build_groups(sched_group_phys, nodemask,
-						&cpu_to_phys_group);
+					cpu_map, &cpu_to_phys_group);
 	}
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
 	if (sched_group_allnodes)
 		init_sched_build_groups(sched_group_allnodes, *cpu_map,
-					&cpu_to_allnodes_group);
+					cpu_map, &cpu_to_allnodes_group);
 
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		/* Set up node groups */
@@ -6537,7 +6507,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		init_numa_sched_groups_power(sched_group_nodes[i]);
 
 	if (sched_group_allnodes) {
-		int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
+		int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map);
 		struct sched_group *sg = &sched_group_allnodes[group];
 
 		init_numa_sched_groups_power(sg);
@@ -6563,9 +6533,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 
 	return 0;
 
+#ifdef CONFIG_NUMA
 error:
 	free_sched_groups(cpu_map);
 	return -ENOMEM;
+#endif
 }
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
-- 
cgit v1.2.3


From 74732646431a1bb7e23e6b564127a8881cfef900 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Tue, 3 Oct 2006 01:14:07 -0700
Subject: [PATCH] sched: don't print migration cost when only 1 CPU

If only a single CPU is present, printing this doesn't make much sense.

Signed-off-by: Dave Jones <davej@redhat.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6b956bd9b49a..6d7bf55ec33d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5977,13 +5977,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
 #endif
 		);
 	if (system_state == SYSTEM_BOOTING) {
-		printk("migration_cost=");
-		for (distance = 0; distance <= max_distance; distance++) {
-			if (distance)
-				printk(",");
-			printk("%ld", (long)migration_cost[distance] / 1000);
+		if (num_online_cpus() > 1) {
+			printk("migration_cost=");
+			for (distance = 0; distance <= max_distance; distance++) {
+				if (distance)
+					printk(",");
+				printk("%ld", (long)migration_cost[distance] / 1000);
+			}
+			printk("\n");
 		}
-		printk("\n");
 	}
 	j1 = jiffies;
 	if (migration_debug)
-- 
cgit v1.2.3


From 1a84887080dc15f048db7c3a643e98f1435790d6 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Tue, 3 Oct 2006 01:14:08 -0700
Subject: [PATCH] sched: introduce child field in sched_domain

Introduce the child field in sched_domain struct and use it in
sched_balance_self().

We will also use this field in cleaning up the sched group cpu_power
setup(done in a different patch) code.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6d7bf55ec33d..0feeacb91497 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1286,21 +1286,29 @@ static int sched_balance_self(int cpu, int flag)
 	while (sd) {
 		cpumask_t span;
 		struct sched_group *group;
-		int new_cpu;
-		int weight;
+		int new_cpu, weight;
+
+		if (!(sd->flags & flag)) {
+			sd = sd->child;
+			continue;
+		}
 
 		span = sd->span;
 		group = find_idlest_group(sd, t, cpu);
-		if (!group)
-			goto nextlevel;
+		if (!group) {
+			sd = sd->child;
+			continue;
+		}
 
 		new_cpu = find_idlest_cpu(group, t, cpu);
-		if (new_cpu == -1 || new_cpu == cpu)
-			goto nextlevel;
+		if (new_cpu == -1 || new_cpu == cpu) {
+			/* Now try balancing at a lower domain level of cpu */
+			sd = sd->child;
+			continue;
+		}
 
-		/* Now try balancing at a lower domain level */
+		/* Now try balancing at a lower domain level of new_cpu */
 		cpu = new_cpu;
-nextlevel:
 		sd = NULL;
 		weight = cpus_weight(span);
 		for_each_domain(cpu, tmp) {
@@ -5448,12 +5456,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 		struct sched_domain *parent = tmp->parent;
 		if (!parent)
 			break;
-		if (sd_parent_degenerate(tmp, parent))
+		if (sd_parent_degenerate(tmp, parent)) {
 			tmp->parent = parent->parent;
+			if (parent->parent)
+				parent->parent->child = tmp;
+		}
 	}
 
-	if (sd && sd_degenerate(sd))
+	if (sd && sd_degenerate(sd)) {
 		sd = sd->parent;
+		if (sd)
+			sd->child = NULL;
+	}
 
 	sched_domain_debug(sd, cpu);
 
@@ -6288,6 +6302,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		*sd = SD_NODE_INIT;
 		sd->span = sched_domain_node_span(cpu_to_node(i));
 		sd->parent = p;
+		if (p)
+			p->child = sd;
 		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
@@ -6297,6 +6313,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		*sd = SD_CPU_INIT;
 		sd->span = nodemask;
 		sd->parent = p;
+		if (p)
+			p->child = sd;
 		sd->groups = &sched_group_phys[group];
 
 #ifdef CONFIG_SCHED_MC
@@ -6307,6 +6325,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		sd->span = cpu_coregroup_map(i);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
+		p->child = sd;
 		sd->groups = &sched_group_core[group];
 #endif
 
@@ -6318,6 +6337,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		sd->span = cpu_sibling_map[i];
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
+		p->child = sd;
 		sd->groups = &sched_group_cpus[group];
 #endif
 	}
-- 
cgit v1.2.3


From 89c4710ee9bbbefe6a4d469d9f36266a92c275c5 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Tue, 3 Oct 2006 01:14:09 -0700
Subject: [PATCH] sched: cleanup sched_group cpu_power setup

Up to now sched group's cpu_power for each sched domain is initialized
independently.  This made the setup code ugly as the new sched domains are
getting added.

Make the sched group cpu_power setup code generic, by using domain child
field and new domain flag in sched_domain.  For most of the sched
domains(except NUMA), sched group's cpu_power is now computed generically
using the domain properties of itself and of the child domain.

sched groups in NUMA domains are setup little differently and hence they
don't use this generic mechanism.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 145 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 82 insertions(+), 63 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0feeacb91497..0a5e814cc618 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2541,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	struct rq *busiest;
 	cpumask_t cpus = CPU_MASK_ALL;
 
+	/*
+	 * When power savings policy is enabled for the parent domain, idle
+	 * sibling can pick up load irrespective of busy siblings. In this case,
+	 * let the state of idle sibling percolate up as IDLE, instead of
+	 * portraying it as NOT_IDLE.
+	 */
 	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2638,7 +2644,7 @@ redo:
 	}
 
 	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return nr_moved;
 
@@ -2654,7 +2660,7 @@ out_one_pinned:
 		sd->balance_interval *= 2;
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-			!sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return 0;
 }
@@ -2676,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int sd_idle = 0;
 	cpumask_t cpus = CPU_MASK_ALL;
 
-	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+	/*
+	 * When power savings policy is enabled for the parent domain, idle
+	 * sibling can pick up load irrespective of busy siblings. In this case,
+	 * let the state of idle sibling percolate up as IDLE, instead of
+	 * portraying it as NOT_IDLE.
+	 */
+	if (sd->flags & SD_SHARE_CPUPOWER &&
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2717,7 +2730,8 @@ redo:
 
 	if (!nr_moved) {
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 			return -1;
 	} else
 		sd->nr_balance_failed = 0;
@@ -2727,7 +2741,7 @@ redo:
 out_balanced:
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-					!sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	sd->nr_balance_failed = 0;
 
@@ -5400,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd)
 	if (sd->flags & (SD_LOAD_BALANCE |
 			 SD_BALANCE_NEWIDLE |
 			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC)) {
+			 SD_BALANCE_EXEC |
+			 SD_SHARE_CPUPOWER |
+			 SD_SHARE_PKG_RESOURCES)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5434,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 		pflags &= ~(SD_LOAD_BALANCE |
 				SD_BALANCE_NEWIDLE |
 				SD_BALANCE_FORK |
-				SD_BALANCE_EXEC);
+				SD_BALANCE_EXEC |
+				SD_SHARE_CPUPOWER |
+				SD_SHARE_PKG_RESOURCES);
 	}
 	if (~cflags & pflags)
 		return 0;
@@ -6240,6 +6258,58 @@ static void free_sched_groups(const cpumask_t *cpu_map)
 }
 #endif
 
+/*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be same unless
+ * there are asymmetries in the topology. If there are asymmetries, group
+ * having more cpu_power will pickup more load compared to the group having
+ * less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+	struct sched_domain *child;
+	struct sched_group *group;
+
+	WARN_ON(!sd || !sd->groups);
+
+	if (cpu != first_cpu(sd->groups->cpumask))
+		return;
+
+	child = sd->child;
+
+	/*
+	 * For perf policy, if the groups in child domain share resources
+	 * (for example cores sharing some portions of the cache hierarchy
+	 * or SMT), then set this domain groups cpu_power such that each group
+	 * can handle only one task, when there are other idle groups in the
+	 * same sched domain.
+	 */
+	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+		       (child->flags &
+			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		return;
+	}
+
+	sd->groups->cpu_power = 0;
+
+	/*
+	 * add cpu_power of each child group to this groups cpu_power
+	 */
+	group = child->groups;
+	do {
+		sd->groups->cpu_power += group->cpu_power;
+		group = group->next;
+	} while (group != child->groups);
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
@@ -6247,6 +6317,7 @@ static void free_sched_groups(const cpumask_t *cpu_map)
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+	struct sched_domain *sd;
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
 	struct sched_group *sched_group_allnodes = NULL;
@@ -6456,72 +6527,20 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
 		sd = &per_cpu(cpu_domains, i);
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu_mask(i, *cpu_map) {
-		int power;
-		struct sched_domain *sd;
 		sd = &per_cpu(core_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
-					    * SCHED_LOAD_SCALE / 10;
-		sd->groups->cpu_power = power;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
-		sd = &per_cpu(phys_domains, i);
-		if (i != first_cpu(sd->groups->cpumask))
-			continue;
-
-		sd->groups->cpu_power = 0;
-		if (sched_mc_power_savings || sched_smt_power_savings) {
-			int j;
-
- 			for_each_cpu_mask(j, sd->groups->cpumask) {
-				struct sched_domain *sd1;
- 				sd1 = &per_cpu(core_domains, j);
- 				/*
- 			 	 * for each core we will add once
- 				 * to the group in physical domain
- 			 	 */
-  	 			if (j != first_cpu(sd1->groups->cpumask))
- 					continue;
-
- 				if (sched_smt_power_savings)
-   					sd->groups->cpu_power += sd1->groups->cpu_power;
- 				else
-   					sd->groups->cpu_power += SCHED_LOAD_SCALE;
-   			}
- 		} else
- 			/*
- 			 * This has to be < 2 * SCHED_LOAD_SCALE
- 			 * Lets keep it SCHED_LOAD_SCALE, so that
- 			 * while calculating NUMA group's cpu_power
- 			 * we can simply do
- 			 *  numa_group->cpu_power += phys_group->cpu_power;
- 			 *
- 			 * See "only add power once for each physical pkg"
- 			 * comment below
- 			 */
- 			sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
-		int power;
 		sd = &per_cpu(phys_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
-#endif
+		init_sched_groups_power(i, sd);
 	}
 
 #ifdef CONFIG_NUMA
-- 
cgit v1.2.3


From 0feaece97795c4c775a3c732c045706eda28d0e5 Mon Sep 17 00:00:00 2001
From: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com>
Date: Tue, 3 Oct 2006 01:14:10 -0700
Subject: [PATCH] sched: fixing wrong comment for find_idlest_cpu()

Fixing wrong comment for find_idlest_cpu().

Signed-off-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0a5e814cc618..fec97e4e196d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1232,7 +1232,7 @@ nextgroup:
 }
 
 /*
- * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
  */
 static int
 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
-- 
cgit v1.2.3


From ce164428c4cabfd284ca81913415cacd889aac33 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Tue, 3 Oct 2006 01:14:11 -0700
Subject: [PATCH] scheduler: NUMA aware placement of sched_group_allnodes

When the per cpu sched domains are build then they also need to be placed
on the node where the cpu resides otherwise we will have frequent off node
accesses which will slow down the system.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index fec97e4e196d..53608a59d6e3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6349,9 +6349,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
 			if (!sched_group_allnodes) {
 				sched_group_allnodes
-					= kmalloc(sizeof(struct sched_group)
-							* MAX_NUMNODES,
-						  GFP_KERNEL);
+					= kmalloc_node(sizeof(struct sched_group)
+						  	* MAX_NUMNODES,
+						  GFP_KERNEL,
+						  cpu_to_node(i));
 				if (!sched_group_allnodes) {
 					printk(KERN_WARNING
 					"Can not alloc allnodes sched group\n");
-- 
cgit v1.2.3


From f30c2269544bffc7bf1b0d7c0abe5be1be83b8cb Mon Sep 17 00:00:00 2001
From: Uwe Zeisberger <Uwe_Zeisberger@digi.com>
Date: Tue, 3 Oct 2006 23:01:26 +0200
Subject: fix file specification in comments

Many files include the filename at the beginning, serveral used a wrong one.

Signed-off-by: Uwe Zeisberger <Uwe_Zeisberger@digi.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 kernel/posix-timers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e5ebcc1ec3a0..9cbb5d1be06f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -1,5 +1,5 @@
 /*
- * linux/kernel/posix_timers.c
+ * linux/kernel/posix-timers.c
  *
  *
  * 2002-10-15  Posix Clocks & timers
-- 
cgit v1.2.3


From 9f5d785e93f7c68c37582ddea848de23689fdd76 Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eike-kernel@sf-tec.de>
Date: Tue, 3 Oct 2006 23:07:31 +0200
Subject: remove duplicate "until" from kernel/workqueue.c

s/until until/until/

Signed-off-by: Rolf Eike Beer <eike-kernel@sf-tec.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 835fe28b87a8..cfc737bffe6d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -34,7 +34,7 @@
  * possible cpu).
  *
  * The sequence counters are for flush_scheduled_work().  It wants to wait
- * until until all currently-scheduled works are completed, but it doesn't
+ * until all currently-scheduled works are completed, but it doesn't
  * want to be livelocked by new, incoming ones.  So it waits until
  * remove_sequence is >= the insert_sequence which pertained when
  * flush_scheduled_work() was called.
-- 
cgit v1.2.3


From 4802211cfd68e44c8401a8fe3657e9c2522ec517 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@freedesktop.org>
Date: Tue, 3 Oct 2006 23:26:16 +0200
Subject: rcutorture: Fix incorrect description of default for nreaders
 parameter

The comment for the nreaders parameter of rcutorture gives the default as
4*ncpus, but the value actually defaults to 2*ncpus; fix the comment.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 kernel/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 4f2c4272d59c..23446e91cded 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,7 @@
 
 MODULE_LICENSE("GPL");
 
-static int nreaders = -1;	/* # reader threads, defaults to 4*ncpus */
+static int nreaders = -1;	/* # reader threads, defaults to 2*ncpus */
 static int stat_interval;	/* Interval between stats, in seconds. */
 				/*  Defaults to "only at end of test". */
 static int verbose;		/* Print more debug info. */
-- 
cgit v1.2.3


From 038b0a6d8d32db934bba6a24e74e76e4e327a94f Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Wed, 4 Oct 2006 03:38:54 -0400
Subject: Remove all inclusions of <linux/config.h> kbuild explicitly includes
 this at build time.

Signed-off-by: Dave Jones <davej@redhat.com>
---
 kernel/rtmutex-debug.c  | 1 -
 kernel/rtmutex-tester.c | 1 -
 2 files changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 0c1faa950af7..da8d6bf46457 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -16,7 +16,6 @@
  *
  * See rt.c in preempt-rt for proper credits and further information
  */
-#include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/module.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 948bd8f643e2..6dcea9dd8c94 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,7 +6,6 @@
  *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  *
  */
-#include <linux/config.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/sched.h>
-- 
cgit v1.2.3


From 4b8a311bb161a3bd2ab44311f42c526b6dc76270 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Sep 2006 17:46:21 -0400
Subject: [PATCH] arch filter lists with < or > should not be accepted

Currently the kernel audit system represents arch's as numbers and will
gladly accept comparisons between archs using >, <, >=, <= when the only
thing that makes sense is = or !=.  I'm told that the next revision of
auditctl will do this checking but this will provide enforcement in the
kernel even for old userspace.  A simple command to show the issue would
be to run

auditctl -d entry,always -F arch>i686 -S chmod

with this patch the kernel will reject this with -EINVAL

Please comment/ack/nak as soon as possible.

-Eric

 kernel/auditfilter.c |    9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditfilter.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 1a58a81fb09d..4f40d923af8e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -411,7 +411,6 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		case AUDIT_FSGID:
 		case AUDIT_LOGINUID:
 		case AUDIT_PERS:
-		case AUDIT_ARCH:
 		case AUDIT_MSGTYPE:
 		case AUDIT_PPID:
 		case AUDIT_DEVMAJOR:
@@ -423,6 +422,14 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		case AUDIT_ARG2:
 		case AUDIT_ARG3:
 			break;
+		/* arch is only allowed to be = or != */
+		case AUDIT_ARCH:
+			if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL)
+					&& (f->op != AUDIT_NEGATE) && (f->op)) {
+				err = -EINVAL;
+				goto exit_free;
+			}
+			break;
 		case AUDIT_PERM:
 			if (f->val & ~15)
 				goto exit_free;
-- 
cgit v1.2.3


From 419c58f11fb732cc8bd1335fa43e0decb34e0be3 Mon Sep 17 00:00:00 2001
From: Alexander Viro <aviro@redhat.com>
Date: Fri, 29 Sep 2006 00:08:50 -0400
Subject: [PATCH] PPID filtering fix

On Thu, Sep 28, 2006 at 04:03:06PM -0400, Eric Paris wrote:
> After some looking I did not see a way to get into audit_log_exit
> without having set the ppid.  So I am dropping the set from there and
> only doing it at the beginning.
>
> Please comment/ack/nak as soon as possible.

Ehh...  That's one hell of an overhead to be had ;-/  Let's be lazy.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 105147631753..b61c0191f3da 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -278,8 +278,11 @@ static int audit_filter_rules(struct task_struct *tsk,
 			result = audit_comparator(tsk->pid, f->op, f->val);
 			break;
 		case AUDIT_PPID:
-			if (ctx)
+			if (ctx) {
+				if (!ctx->ppid)
+					ctx->ppid = sys_getppid();
 				result = audit_comparator(ctx->ppid, f->op, f->val);
+			}
 			break;
 		case AUDIT_UID:
 			result = audit_comparator(tsk->uid, f->op, f->val);
@@ -795,7 +798,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 	/* tsk == current */
 	context->pid = tsk->pid;
-	context->ppid = sys_getppid();	/* sic.  tsk == current in all cases */
+	if (!context->ppid)
+		context->ppid = sys_getppid();
 	context->uid = tsk->uid;
 	context->gid = tsk->gid;
 	context->euid = tsk->euid;
@@ -1137,6 +1141,7 @@ void audit_syscall_entry(int arch, int major,
 	context->ctime      = CURRENT_TIME;
 	context->in_syscall = 1;
 	context->auditable  = !!(state == AUDIT_RECORD_CONTEXT);
+	context->ppid       = 0;
 }
 
 /**
-- 
cgit v1.2.3


From ac9910ce017ff5f86f3a25e969b2c4f5d6ac438f Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Thu, 28 Sep 2006 14:31:32 -0400
Subject: [PATCH] name_count array overrun

Hi,

This patch removes the rdev logging from the previous patch

The below patch closes an unbounded use of name_count. This can lead to oopses
in some new file systems.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b61c0191f3da..42f2f1179711 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1357,7 +1357,13 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
 		}
 
 update_context:
-	idx = context->name_count++;
+	idx = context->name_count;
+	if (context->name_count == AUDIT_NAMES) {
+		printk(KERN_DEBUG "name_count maxed and losing %s\n",
+			found_name ?: "(null)");
+		return;
+	}
+	context->name_count++;
 #if AUDIT_DEBUG
 	context->ino_count++;
 #endif
@@ -1375,7 +1381,16 @@ update_context:
 	/* A parent was not found in audit_names, so copy the inode data for the
 	 * provided parent. */
 	if (!found_name) {
-		idx = context->name_count++;
+		idx = context->name_count;
+		if (context->name_count == AUDIT_NAMES) {
+			printk(KERN_DEBUG
+				"name_count maxed and losing parent inode data: dev=%02x:%02x, inode=%lu",
+				MAJOR(parent->i_sb->s_dev),
+				MINOR(parent->i_sb->s_dev),
+				parent->i_ino);
+			return;
+		}
+		context->name_count++;
 #if AUDIT_DEBUG
 		context->ino_count++;
 #endif
-- 
cgit v1.2.3


From a24ceab4f44f21749aa0b6bd38bee37c775e036f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 4 Oct 2006 02:16:27 -0700
Subject: [PATCH] genirq: irq: convert the move_irq flag from a 32bit word to a
 single bit

The primary aim of this patchset is to remove maintenances problems caused by
the irq infrastructure.  The two big issues I address are an artificially
small cap on the number of irqs, and that MSI assumes vector == irq.  My
primary focus is on x86_64 but I have touched other architectures where
necessary to keep them from breaking.

- To increase the number of irqs I modify the code to look at the (cpu,
  vector) pair instead of just looking at the vector.

  With a large number of irqs available systems with a large irq count no
  longer need to compress their irq numbers to fit.  Removing a lot of brittle
  special cases.

  For acpi guys the result is that irq == gsi.

- Addressing the fact that MSI assumes irq == vector takes a few more
  patches.  But suffice it to say when I am done none of the generic irq code
  even knows what a vector is.

In quick testing on a large Unisys x86_64 machine we stumbled over at least
one driver that assumed that NR_IRQS could always fit into an 8 bit number.
This driver is clearly buggy today.  But this has become a class of bugs that
it is now much easier to hit.

This patch:

This is a minor space optimization.  In practice I don't think this has any
affect because of our alignment constraints and the other fields but there is
not point in chewing up an uncessary word and since we already read the flag
field this should improve the cache hit ratio of the irq handler.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rajesh Shah <rajesh.shah@intel.com>
Cc: Andi Kleen <ak@muc.de>
Cc: "Protasevich, Natalie" <Natalie.Protasevich@UNISYS.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/migration.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index a57ebe9fa6f6..9b234df810d0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,7 +7,7 @@ void set_pending_irq(unsigned int irq, cpumask_t mask)
 	unsigned long flags;
 
 	spin_lock_irqsave(&desc->lock, flags);
-	desc->move_irq = 1;
+	desc->status |= IRQ_MOVE_PENDING;
 	irq_desc[irq].pending_mask = mask;
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
@@ -17,7 +17,7 @@ void move_native_irq(int irq)
 	struct irq_desc *desc = irq_desc + irq;
 	cpumask_t tmp;
 
-	if (likely(!desc->move_irq))
+	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
 
 	/*
@@ -28,7 +28,7 @@ void move_native_irq(int irq)
 		return;
 	}
 
-	desc->move_irq = 0;
+	desc->status &= ~IRQ_MOVE_PENDING;
 
 	if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
 		return;
-- 
cgit v1.2.3


From e7b946e98a456077dd6897f726f3d6197bd7e3b9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 4 Oct 2006 02:16:29 -0700
Subject: [PATCH] genirq: irq: add moved_masked_irq

Currently move_native_irq disables and renables the irq we are migrating to
ensure we don't take that irq when we are actually doing the migration
operation.  Disabling the irq needs to happen but sometimes doing the work is
move_native_irq is too late.

On x86 with ioapics the irq move sequences needs to be:
edge_triggered:
  mask irq.
  move irq.
  unmask irq.
  ack irq.
level_triggered:
  mask irq.
  ack irq.
  move irq.
  unmask irq.

We can easily perform the edge triggered sequence, with the current defintion
of move_native_irq.  However the level triggered case does not map well.  For
that I have added move_masked_irq, to allow me to disable the irqs around both
the ack and the move.

Q: Why have we not seen this problem earlier?

A: The only symptom I have been able to reproduce is that if we change
   the vector before acknowleding an irq the wrong irq is acknowledged.
   Since we currently are not reprogramming the irq vector during
   migration no problems show up.

   We have to mask the irq before we acknowledge the irq or else we could
   hit a window where an irq is asserted just before we acknowledge it.

   Edge triggered irqs do not have this problem because acknowledgements
   do not propogate in the same way.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rajesh Shah <rajesh.shah@intel.com>
Cc: Andi Kleen <ak@muc.de>
Cc: "Protasevich, Natalie" <Natalie.Protasevich@UNISYS.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/migration.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 9b234df810d0..4baa3bbcd25a 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -12,7 +12,7 @@ void set_pending_irq(unsigned int irq, cpumask_t mask)
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
 
-void move_native_irq(int irq)
+void move_masked_irq(int irq)
 {
 	struct irq_desc *desc = irq_desc + irq;
 	cpumask_t tmp;
@@ -48,15 +48,29 @@ void move_native_irq(int irq)
 	 * when an active trigger is comming in. This could
 	 * cause some ioapics to mal-function.
 	 * Being paranoid i guess!
+	 *
+	 * For correct operation this depends on the caller
+	 * masking the irqs.
 	 */
 	if (likely(!cpus_empty(tmp))) {
-		if (likely(!(desc->status & IRQ_DISABLED)))
-			desc->chip->disable(irq);
-
 		desc->chip->set_affinity(irq,tmp);
-
-		if (likely(!(desc->status & IRQ_DISABLED)))
-			desc->chip->enable(irq);
 	}
 	cpus_clear(irq_desc[irq].pending_mask);
 }
+
+void move_native_irq(int irq)
+{
+	struct irq_desc *desc = irq_desc + irq;
+
+	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+		return;
+
+	if (likely(!(desc->status & IRQ_DISABLED)))
+		desc->chip->disable(irq);
+
+	move_masked_irq(irq);
+
+	if (likely(!(desc->status & IRQ_DISABLED)))
+		desc->chip->enable(irq);
+}
+
-- 
cgit v1.2.3


From 3a16d713626735f3016da0521b7bf251cd78e836 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 4 Oct 2006 02:16:37 -0700
Subject: [PATCH] genirq: irq: add a dynamic irq creation API

With the msi support comes a new concept in irq handling, irqs that are
created dynamically at run time.

Currently the msi code allocates irqs backwards.  First it allocates a
platform dependent routing value for an interrupt the ``vector'' and then it
figures out from the vector which irq you are on.

This msi backwards allocator suffers from two basic problems.  The allocator
suffers because it is trying to do something that is architecture specific in
a generic way making it brittle, inflexible, and tied to tightly to the
architecture implementation.  The alloctor also suffers from it's very
backwards nature as it has tied things together that should have no
dependencies.

To solve the basic dynamic irq allocation problem two new architecture
specific functions are added: create_irq and destroy_irq.

create_irq takes no input and returns an unused irq number, that won't be
reused until it is returned to the free poll with destroy_irq.  The irq then
can be used for any purpose although the only initial consumer is the msi
code.

destroy_irq takes an irq number allocated with create_irq and returns it to
the free pool.

Making this functionality per architecture increases the simplicity of the irq
allocation code and increases it's flexibility.

dynamic_irq_init() and dynamic_irq_cleanup() are added to automate the
irq_desc initializtion that should happen for dynamic irqs.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rajesh Shah <rajesh.shah@intel.com>
Cc: Andi Kleen <ak@muc.de>
Cc: "Protasevich, Natalie" <Natalie.Protasevich@UNISYS.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/chip.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 736cb0bd498f..0dc24386dd99 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -17,6 +17,62 @@
 
 #include "internals.h"
 
+/**
+ *	dynamic_irq_init - initialize a dynamically allocated irq
+ *	@irq:	irq number to initialize
+ */
+void dynamic_irq_init(unsigned int irq)
+{
+	struct irq_desc *desc;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
+		WARN_ON(1);
+		return;
+	}
+
+	/* Ensure we don't have left over values from a previous use of this irq */
+	desc = irq_desc + irq;
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->status = IRQ_DISABLED;
+	desc->chip = &no_irq_chip;
+	desc->handle_irq = handle_bad_irq;
+	desc->depth = 1;
+	desc->handler_data = NULL;
+	desc->chip_data = NULL;
+	desc->action = NULL;
+	desc->irq_count = 0;
+	desc->irqs_unhandled = 0;
+#ifdef CONFIG_SMP
+	desc->affinity = CPU_MASK_ALL;
+#endif
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+/**
+ *	dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ *	@irq:	irq number to initialize
+ */
+void dynamic_irq_cleanup(unsigned int irq)
+{
+	struct irq_desc *desc;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
+		WARN_ON(1);
+		return;
+	}
+
+	desc = irq_desc + irq;
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->handle_irq = handle_bad_irq;
+	desc->chip = &no_irq_chip;
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+
 /**
  *	set_irq_chip - set the irq chip for an irq
  *	@irq:	irq number
-- 
cgit v1.2.3


From 1f80025e624bb14fefadfef7e80fbfb9740d4714 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 4 Oct 2006 02:16:56 -0700
Subject: [PATCH] msi: simplify msi sanity checks by adding with generic irq
 code

Currently msi.c is doing sanity checks that make certain before an irq is
destroyed it has no more users.

By adding irq_has_action I can perform the test is a generic way, instead of
relying on a msi specific data structure.

By performing the core check in dynamic_irq_cleanup I ensure every user of
dynamic irqs has a test present and we don't free resources that are in use.

In msi.c this allows me to kill the attrib.state member of msi_desc and all of
the assciated code to maintain it.

To keep from freeing data structures when irq cleanup code is called to soon
changing dyanamic_irq_cleanup is insufficient because there are msi specific
data structures that are also not safe to free.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg KH <greg@kroah.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/chip.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 0dc24386dd99..4cf65f5c6a74 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -67,6 +67,13 @@ void dynamic_irq_cleanup(unsigned int irq)
 
 	desc = irq_desc + irq;
 	spin_lock_irqsave(&desc->lock, flags);
+	if (desc->action) {
+		spin_unlock_irqrestore(&desc->lock, flags);
+		printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n",
+			irq);
+		WARN_ON(1);
+		return;
+	}
 	desc->handle_irq = handle_bad_irq;
 	desc->chip = &no_irq_chip;
 	spin_unlock_irqrestore(&desc->lock, flags);
-- 
cgit v1.2.3


From 621934ee7ed5b073c7fd638b347e632c53572761 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:02 -0700
Subject: [PATCH] srcu-3: RCU variant permitting read-side blocking

Updated patch adding a variant of RCU that permits sleeping in read-side
critical sections.  SRCU is as follows:

o	Each use of SRCU creates its own srcu_struct, and each
	srcu_struct has its own set of grace periods.  This is
	critical, as it prevents one subsystem with a blocking
	reader from holding up SRCU grace periods for other
	subsystems.

o	The SRCU primitives (srcu_read_lock(), srcu_read_unlock(),
	and synchronize_srcu()) all take a pointer to a srcu_struct.

o	The SRCU primitives must be called from process context.

o	srcu_read_lock() returns an int that must be passed to
	the matching srcu_read_unlock().  Realtime RCU avoids the
	need for this by storing the state in the task struct,
	but SRCU needs to allow a given code path to pass through
	multiple SRCU domains -- storing state in the task struct
	would therefore require either arbitrary space in the
	task struct or arbitrary limits on SRCU nesting.  So I
	kicked the state-storage problem up to the caller.

	Of course, it is not permitted to call synchronize_srcu()
	while in an SRCU read-side critical section.

o	There is no call_srcu().  It would not be hard to implement
	one, but it seems like too easy a way to OOM the system.
	(Hey, we have enough trouble with call_rcu(), which does
	-not- permit readers to sleep!!!)  So, if you want it,
	please tell me why...

[josht@us.ibm.com: sparse notation]
Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Josh Triplett <josh@freedesktop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/Makefile |   2 +-
 kernel/srcu.c   | 257 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 258 insertions(+), 1 deletion(-)
 create mode 100644 kernel/srcu.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index d948ca12acf0..5e3f3b75563a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o latency.o nsproxy.o
+	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
diff --git a/kernel/srcu.c b/kernel/srcu.c
new file mode 100644
index 000000000000..7e1979f624ba
--- /dev/null
+++ b/kernel/srcu.c
@@ -0,0 +1,257 @@
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Paul McKenney <paulmck@us.ibm.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU/ *.txt
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/srcu.h>
+
+/**
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @sp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function.  Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+void init_srcu_struct(struct srcu_struct *sp)
+{
+	sp->completed = 0;
+	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+	mutex_init(&sp->mutex);
+}
+
+/*
+ * srcu_readers_active_idx -- returns approximate number of readers
+ *	active on the specified rank of per-CPU counters.
+ */
+
+static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+{
+	int cpu;
+	int sum;
+
+	sum = 0;
+	for_each_possible_cpu(cpu)
+		sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
+	return sum;
+}
+
+/**
+ * srcu_readers_active - returns approximate number of readers.
+ * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
+ *
+ * Note that this is not an atomic primitive, and can therefore suffer
+ * severe errors when invoked on an active srcu_struct.  That said, it
+ * can be useful as an error check at cleanup time.
+ */
+int srcu_readers_active(struct srcu_struct *sp)
+{
+	return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
+}
+
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+	int sum;
+
+	sum = srcu_readers_active(sp);
+	WARN_ON(sum);  /* Leakage unless caller handles error. */
+	if (sum != 0)
+		return;
+	free_percpu(sp->per_cpu_ref);
+	sp->per_cpu_ref = NULL;
+}
+
+/**
+ * srcu_read_lock - register a new reader for an SRCU-protected structure.
+ * @sp: srcu_struct in which to register the new reader.
+ *
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct.  Must be called from process context.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
+ */
+int srcu_read_lock(struct srcu_struct *sp)
+{
+	int idx;
+
+	preempt_disable();
+	idx = sp->completed & 0x1;
+	barrier();  /* ensure compiler looks -once- at sp->completed. */
+	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
+	srcu_barrier();  /* ensure compiler won't misorder critical section. */
+	preempt_enable();
+	return idx;
+}
+
+/**
+ * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
+ * @sp: srcu_struct in which to unregister the old reader.
+ * @idx: return value from corresponding srcu_read_lock().
+ *
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct.  Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ * Must be called from process context.
+ */
+void srcu_read_unlock(struct srcu_struct *sp, int idx)
+{
+	preempt_disable();
+	srcu_barrier();  /* ensure compiler won't misorder critical section. */
+	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
+	preempt_enable();
+}
+
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Flip the completed counter, and wait for the old count to drain to zero.
+ * As with classic RCU, the updater must use some separate means of
+ * synchronizing concurrent updates.  Can block; must be called from
+ * process context.
+ *
+ * Note that it is illegal to call synchornize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+	int idx;
+
+	idx = sp->completed;
+	mutex_lock(&sp->mutex);
+
+	/*
+	 * Check to see if someone else did the work for us while we were
+	 * waiting to acquire the lock.  We need -two- advances of
+	 * the counter, not just one.  If there was but one, we might have
+	 * shown up -after- our helper's first synchronize_sched(), thus
+	 * having failed to prevent CPU-reordering races with concurrent
+	 * srcu_read_unlock()s on other CPUs (see comment below).  So we
+	 * either (1) wait for two or (2) supply the second ourselves.
+	 */
+
+	if ((sp->completed - idx) >= 2) {
+		mutex_unlock(&sp->mutex);
+		return;
+	}
+
+	synchronize_sched();  /* Force memory barrier on all CPUs. */
+
+	/*
+	 * The preceding synchronize_sched() ensures that any CPU that
+	 * sees the new value of sp->completed will also see any preceding
+	 * changes to data structures made by this CPU.  This prevents
+	 * some other CPU from reordering the accesses in its SRCU
+	 * read-side critical section to precede the corresponding
+	 * srcu_read_lock() -- ensuring that such references will in
+	 * fact be protected.
+	 *
+	 * So it is now safe to do the flip.
+	 */
+
+	idx = sp->completed & 0x1;
+	sp->completed++;
+
+	synchronize_sched();  /* Force memory barrier on all CPUs. */
+
+	/*
+	 * At this point, because of the preceding synchronize_sched(),
+	 * all srcu_read_lock() calls using the old counters have completed.
+	 * Their corresponding critical sections might well be still
+	 * executing, but the srcu_read_lock() primitives themselves
+	 * will have finished executing.
+	 */
+
+	while (srcu_readers_active_idx(sp, idx))
+		schedule_timeout_interruptible(1);
+
+	synchronize_sched();  /* Force memory barrier on all CPUs. */
+
+	/*
+	 * The preceding synchronize_sched() forces all srcu_read_unlock()
+	 * primitives that were executing concurrently with the preceding
+	 * for_each_possible_cpu() loop to have completed by this point.
+	 * More importantly, it also forces the corresponding SRCU read-side
+	 * critical sections to have also completed, and the corresponding
+	 * references to SRCU-protected data items to be dropped.
+	 *
+	 * Note:
+	 *
+	 *	Despite what you might think at first glance, the
+	 *	preceding synchronize_sched() -must- be within the
+	 *	critical section ended by the following mutex_unlock().
+	 *	Otherwise, a task taking the early exit can race
+	 *	with a srcu_read_unlock(), which might have executed
+	 *	just before the preceding srcu_readers_active() check,
+	 *	and whose CPU might have reordered the srcu_read_unlock()
+	 *	with the preceding critical section.  In this case, there
+	 *	is nothing preventing the synchronize_sched() task that is
+	 *	taking the early exit from freeing a data structure that
+	 *	is still being referenced (out of order) by the task
+	 *	doing the srcu_read_unlock().
+	 *
+	 *	Alternatively, the comparison with "2" on the early exit
+	 *	could be changed to "3", but this increases synchronize_srcu()
+	 *	latency for bulk loads.  So the current code is preferred.
+	 */
+
+	mutex_unlock(&sp->mutex);
+}
+
+/**
+ * srcu_batches_completed - return batches completed.
+ * @sp: srcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
+ */
+
+long srcu_batches_completed(struct srcu_struct *sp)
+{
+	return sp->completed;
+}
+
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(srcu_read_lock);
+EXPORT_SYMBOL_GPL(srcu_read_unlock);
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+EXPORT_SYMBOL_GPL(srcu_batches_completed);
+EXPORT_SYMBOL_GPL(srcu_readers_active);
-- 
cgit v1.2.3


From b2896d2e75c87ea6a842c088db730b03c91db737 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:03 -0700
Subject: [PATCH] srcu-3: add SRCU operations to rcutorture

Adds SRCU operations to rcutorture and updates rcutorture documentation.
Also increases the stress imposed by the rcutorture test.

[bunk@stusta.de: make needlessly global code static]
Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Cc: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 123 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 23446e91cded..e34d22bf2293 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -44,6 +44,7 @@
 #include <linux/delay.h>
 #include <linux/byteorder/swabb.h>
 #include <linux/stat.h>
+#include <linux/srcu.h>
 
 MODULE_LICENSE("GPL");
 
@@ -53,7 +54,7 @@ static int stat_interval;	/* Interval between stats, in seconds. */
 static int verbose;		/* Print more debug info. */
 static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
-static char *torture_type = "rcu"; /* What to torture. */
+static char *torture_type = "rcu"; /* What to torture: rcu, srcu. */
 
 module_param(nreaders, int, 0);
 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -66,7 +67,7 @@ MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
 module_param(shuffle_interval, int, 0);
 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
 module_param(torture_type, charp, 0);
-MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
 
 #define TORTURE_FLAG "-torture:"
 #define PRINTK_STRING(s) \
@@ -104,11 +105,11 @@ static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
 	{ 0 };
 static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
-atomic_t n_rcu_torture_alloc;
-atomic_t n_rcu_torture_alloc_fail;
-atomic_t n_rcu_torture_free;
-atomic_t n_rcu_torture_mberror;
-atomic_t n_rcu_torture_error;
+static atomic_t n_rcu_torture_alloc;
+static atomic_t n_rcu_torture_alloc_fail;
+static atomic_t n_rcu_torture_free;
+static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_error;
 
 /*
  * Allocate an element from the rcu_tortures pool.
@@ -180,6 +181,7 @@ struct rcu_torture_ops {
 	void (*init)(void);
 	void (*cleanup)(void);
 	int (*readlock)(void);
+	void (*readdelay)(struct rcu_random_state *rrsp);
 	void (*readunlock)(int idx);
 	int (*completed)(void);
 	void (*deferredfree)(struct rcu_torture *p);
@@ -198,6 +200,18 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
 	return 0;
 }
 
+static void rcu_read_delay(struct rcu_random_state *rrsp)
+{
+	long delay;
+	const long longdelay = 200;
+
+	/* We want there to be long-running readers, but not all the time. */
+
+	delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay);
+	if (!delay)
+		udelay(longdelay);
+}
+
 static void rcu_torture_read_unlock(int idx) __releases(RCU)
 {
 	rcu_read_unlock();
@@ -239,6 +253,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.init = NULL,
 	.cleanup = NULL,
 	.readlock = rcu_torture_read_lock,
+	.readdelay = rcu_read_delay,
 	.readunlock = rcu_torture_read_unlock,
 	.completed = rcu_torture_completed,
 	.deferredfree = rcu_torture_deferred_free,
@@ -275,6 +290,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
 	.init = NULL,
 	.cleanup = NULL,
 	.readlock = rcu_bh_torture_read_lock,
+	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
 	.readunlock = rcu_bh_torture_read_unlock,
 	.completed = rcu_bh_torture_completed,
 	.deferredfree = rcu_bh_torture_deferred_free,
@@ -282,8 +298,105 @@ static struct rcu_torture_ops rcu_bh_ops = {
 	.name = "rcu_bh"
 };
 
+/*
+ * Definitions for srcu torture testing.
+ */
+
+static struct srcu_struct srcu_ctl;
+static struct list_head srcu_removed;
+
+static void srcu_torture_init(void)
+{
+	init_srcu_struct(&srcu_ctl);
+	INIT_LIST_HEAD(&srcu_removed);
+}
+
+static void srcu_torture_cleanup(void)
+{
+	synchronize_srcu(&srcu_ctl);
+	cleanup_srcu_struct(&srcu_ctl);
+}
+
+static int srcu_torture_read_lock(void)
+{
+	return srcu_read_lock(&srcu_ctl);
+}
+
+static void srcu_read_delay(struct rcu_random_state *rrsp)
+{
+	long delay;
+	const long uspertick = 1000000 / HZ;
+	const long longdelay = 10;
+
+	/* We want there to be long-running readers, but not all the time. */
+
+	delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
+	if (!delay)
+		schedule_timeout_interruptible(longdelay);
+}
+
+static void srcu_torture_read_unlock(int idx)
+{
+	srcu_read_unlock(&srcu_ctl, idx);
+}
+
+static int srcu_torture_completed(void)
+{
+	return srcu_batches_completed(&srcu_ctl);
+}
+
+static void srcu_torture_deferred_free(struct rcu_torture *p)
+{
+	int i;
+	struct rcu_torture *rp;
+	struct rcu_torture *rp1;
+
+	synchronize_srcu(&srcu_ctl);
+	list_add(&p->rtort_free, &srcu_removed);
+	list_for_each_entry_safe(rp, rp1, &srcu_removed, rtort_free) {
+		i = rp->rtort_pipe_count;
+		if (i > RCU_TORTURE_PIPE_LEN)
+			i = RCU_TORTURE_PIPE_LEN;
+		atomic_inc(&rcu_torture_wcount[i]);
+		if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+			rp->rtort_mbtest = 0;
+			list_del(&rp->rtort_free);
+			rcu_torture_free(rp);
+		}
+	}
+}
+
+static int srcu_torture_stats(char *page)
+{
+	int cnt = 0;
+	int cpu;
+	int idx = srcu_ctl.completed & 0x1;
+
+	cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
+		       torture_type, TORTURE_FLAG, idx);
+	for_each_possible_cpu(cpu) {
+		cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
+			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
+			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
+	}
+	cnt += sprintf(&page[cnt], "\n");
+	return cnt;
+}
+
+static struct rcu_torture_ops srcu_ops = {
+	.init = srcu_torture_init,
+	.cleanup = srcu_torture_cleanup,
+	.readlock = srcu_torture_read_lock,
+	.readdelay = srcu_read_delay,
+	.readunlock = srcu_torture_read_unlock,
+	.completed = srcu_torture_completed,
+	.deferredfree = srcu_torture_deferred_free,
+	.stats = srcu_torture_stats,
+	.name = "srcu"
+};
+
 static struct rcu_torture_ops *torture_ops[] =
-	{ &rcu_ops, &rcu_bh_ops, NULL };
+	{ &rcu_ops, &rcu_bh_ops, &srcu_ops, NULL };
 
 /*
  * RCU torture writer kthread.  Repeatedly substitutes a new structure
@@ -359,7 +472,7 @@ rcu_torture_reader(void *arg)
 		}
 		if (p->rtort_mbtest == 0)
 			atomic_inc(&n_rcu_torture_mberror);
-		udelay(rcu_random(&rand) & 0x7f);
+		cur_ops->readdelay(&rand);
 		preempt_disable();
 		pipe_count = p->rtort_pipe_count;
 		if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -483,7 +596,7 @@ static int rcu_idle_cpu;	/* Force all torture tasks off this CPU */
 /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
  * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
  */
-void rcu_torture_shuffle_tasks(void)
+static void rcu_torture_shuffle_tasks(void)
 {
 	cpumask_t tmp_mask = CPU_MASK_ALL;
 	int i;
-- 
cgit v1.2.3


From eabc069401bcf45bcc3f19e643017bf761780aa8 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 4 Oct 2006 02:17:04 -0700
Subject: [PATCH] Add SRCU-based notifier chains

This patch (as751) adds a new type of notifier chain, based on the SRCU
(Sleepable Read-Copy Update) primitives recently added to the kernel.  An
SRCU notifier chain is much like a blocking notifier chain, in that it must
be called in process context and its callout routines are allowed to sleep.
 The difference is that the chain's links are protected by the SRCU
mechanism rather than by an rw-semaphore, so calling the chain has
extremely low overhead: no memory barriers and no cache-line bouncing.  On
the other hand, unregistering from the chain is expensive and the chain
head requires special runtime initialization (plus cleanup if it is to be
deallocated).

SRCU notifiers are appropriate for notifiers that will be called very
frequently and for which unregistration occurs very seldom.  The proposed
"task notifier" scheme qualifies, as may some of the network notifiers.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Acked-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 2314867ae34f..fd5c71006775 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -153,7 +153,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
 
 /*
  *	Atomic notifier chain routines.  Registration and unregistration
- *	use a mutex, and call_chain is synchronized by RCU (no locks).
+ *	use a spinlock, and call_chain is synchronized by RCU (no locks).
  */
 
 /**
@@ -401,6 +401,128 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
 
 EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
 
+/*
+ *	SRCU notifier chain routines.    Registration and unregistration
+ *	use a mutex, and call_chain is synchronized by SRCU (no locks).
+ */
+
+/**
+ *	srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
+ *	@nh: Pointer to head of the SRCU notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to an SRCU notifier chain.
+ *	Must be called in process context.
+ *
+ *	Currently always returns zero.
+ */
+
+int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
+		struct notifier_block *n)
+{
+	int ret;
+
+	/*
+	 * This code gets used during boot-up, when task switching is
+	 * not yet working and interrupts must remain disabled.  At
+	 * such times we must not call mutex_lock().
+	 */
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return notifier_chain_register(&nh->head, n);
+
+	mutex_lock(&nh->mutex);
+	ret = notifier_chain_register(&nh->head, n);
+	mutex_unlock(&nh->mutex);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
+
+/**
+ *	srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
+ *	@nh: Pointer to head of the SRCU notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from an SRCU notifier chain.
+ *	Must be called from process context.
+ *
+ *	Returns zero on success or %-ENOENT on failure.
+ */
+int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
+		struct notifier_block *n)
+{
+	int ret;
+
+	/*
+	 * This code gets used during boot-up, when task switching is
+	 * not yet working and interrupts must remain disabled.  At
+	 * such times we must not call mutex_lock().
+	 */
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return notifier_chain_unregister(&nh->head, n);
+
+	mutex_lock(&nh->mutex);
+	ret = notifier_chain_unregister(&nh->head, n);
+	mutex_unlock(&nh->mutex);
+	synchronize_srcu(&nh->srcu);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
+
+/**
+ *	srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ *	@nh: Pointer to head of the SRCU notifier chain
+ *	@val: Value passed unmodified to notifier function
+ *	@v: Pointer passed unmodified to notifier function
+ *
+ *	Calls each function in a notifier chain in turn.  The functions
+ *	run in a process context, so they are allowed to block.
+ *
+ *	If the return value of the notifier can be and'ed
+ *	with %NOTIFY_STOP_MASK then srcu_notifier_call_chain
+ *	will return immediately, with the return value of
+ *	the notifier function which halted execution.
+ *	Otherwise the return value is the return value
+ *	of the last notifier function called.
+ */
+
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+		unsigned long val, void *v)
+{
+	int ret;
+	int idx;
+
+	idx = srcu_read_lock(&nh->srcu);
+	ret = notifier_call_chain(&nh->head, val, v);
+	srcu_read_unlock(&nh->srcu, idx);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
+
+/**
+ *	srcu_init_notifier_head - Initialize an SRCU notifier head
+ *	@nh: Pointer to head of the srcu notifier chain
+ *
+ *	Unlike other sorts of notifier heads, SRCU notifier heads require
+ *	dynamic initialization.  Be sure to call this routine before
+ *	calling any of the other SRCU notifier routines for this head.
+ *
+ *	If an SRCU notifier head is deallocated, it must first be cleaned
+ *	up by calling srcu_cleanup_notifier_head().  Otherwise the head's
+ *	per-cpu data (used by the SRCU mechanism) will leak.
+ */
+
+void srcu_init_notifier_head(struct srcu_notifier_head *nh)
+{
+	mutex_init(&nh->mutex);
+	init_srcu_struct(&nh->srcu);
+	nh->head = NULL;
+}
+
+EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
+
 /**
  *	register_reboot_notifier - Register function to be called at reboot time
  *	@nb: Info about notifier function to be called
-- 
cgit v1.2.3


From e6a92013ba458804161c0c5b6d134d82204dc233 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 4 Oct 2006 02:17:05 -0700
Subject: [PATCH] SRCU: report out-of-memory errors

Currently the init_srcu_struct() routine has no way to report out-of-memory
errors.  This patch (as761) makes it return -ENOMEM when the per-cpu data
allocation fails.

The patch also makes srcu_init_notifier_head() report a BUG if a notifier
head can't be initialized.  Perhaps it should return -ENOMEM instead, but
in the most likely cases where this might occur I don't think any recovery
is possible.  Notifier chains generally are not created dynamically.

[akpm@osdl.org: avoid statement-with-side-effect in macro]
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/srcu.c | 5 +++--
 kernel/sys.c  | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/srcu.c b/kernel/srcu.c
index 7e1979f624ba..3507cabe963b 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -42,11 +42,12 @@
  * to any other function.  Each srcu_struct represents a separate domain
  * of SRCU protection.
  */
-void init_srcu_struct(struct srcu_struct *sp)
+int init_srcu_struct(struct srcu_struct *sp)
 {
 	sp->completed = 0;
-	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
 	mutex_init(&sp->mutex);
+	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+	return (sp->per_cpu_ref ? 0 : -ENOMEM);
 }
 
 /*
diff --git a/kernel/sys.c b/kernel/sys.c
index fd5c71006775..98489d82801b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -517,7 +517,8 @@ EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
 void srcu_init_notifier_head(struct srcu_notifier_head *nh)
 {
 	mutex_init(&nh->mutex);
-	init_srcu_struct(&nh->srcu);
+	if (init_srcu_struct(&nh->srcu) < 0)
+		BUG();
 	nh->head = NULL;
 }
 
-- 
cgit v1.2.3


From ff2c93a5373f12f86f3281705d11278a9f2334e2 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:09 -0700
Subject: [PATCH] rcu: Add MODULE_AUTHOR to rcutorture module

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e34d22bf2293..7b965341b1dc 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
 #include <linux/srcu.h>
 
 MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
 
 static int nreaders = -1;	/* # reader threads, defaults to 2*ncpus */
 static int stat_interval;	/* Interval between stats, in seconds. */
-- 
cgit v1.2.3


From 3c29e03d9121e07714fb9e5303d9b026800ffd5a Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:10 -0700
Subject: [PATCH] rcu: Mention rcu_bh in description of rcutorture's
 torture_type parameter

The comment for rcutorture's torture_type parameter only lists the RCU
variants rcu and srcu, but not rcu_bh; add rcu_bh to the list.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7b965341b1dc..021d3108bb6e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -55,7 +55,7 @@ static int stat_interval;	/* Interval between stats, in seconds. */
 static int verbose;		/* Print more debug info. */
 static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
-static char *torture_type = "rcu"; /* What to torture: rcu, srcu. */
+static char *torture_type = "rcu"; /* What to torture: rcu, rcu_bh, srcu. */
 
 module_param(nreaders, int, 0);
 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
-- 
cgit v1.2.3


From 2860aaba4dc87fa43c08724434b87a8650f3bff5 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:11 -0700
Subject: [PATCH] rcu: Avoid kthread_stop on invalid pointer if rcutorture
 reader startup fails

rcu_torture_init kmallocs the array of reader threads, then creates each
one with kthread_run, cleaning up with rcu_torture_cleanup if this fails.
rcu_torture_cleanup calls kthread_stop on any non-NULL pointer in the
array; however, any readers after the one that failed to start up will have
invalid pointers, not null pointers.  Avoid this by using kzalloc instead.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 021d3108bb6e..42e7f01e8003 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -780,7 +780,7 @@ rcu_torture_init(void)
 		writer_task = NULL;
 		goto unwind;
 	}
-	reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]),
+	reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
 			       GFP_KERNEL);
 	if (reader_tasks == NULL) {
 		VERBOSE_PRINTK_ERRSTRING("out of memory");
-- 
cgit v1.2.3


From 75cfef32f26d03f5d0a0833572d52f94ad858a36 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:12 -0700
Subject: [PATCH] rcu: Fix sign bug making rcu_random always return the same
 sequence

rcu_random uses a counter rrs_count to occasionally mix data from
get_random_bytes into the state of its pseudorandom generator.  However,
the rrs_counter gets declared as an unsigned long, and rcu_random checks
for --rrs_count < 0, so this code will never mix any real random data into
the state, and will thus always return the same sequence of random numbers.

Also, change the return value of rcu_random from long to unsigned long, to
avoid potential issues caused by the use of the % operator, which can
return negative values for negative left operands.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 42e7f01e8003..43d6d4f9ef09 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -147,7 +147,7 @@ rcu_torture_free(struct rcu_torture *p)
 
 struct rcu_random_state {
 	unsigned long rrs_state;
-	unsigned long rrs_count;
+	long rrs_count;
 };
 
 #define RCU_RANDOM_MULT 39916801  /* prime */
@@ -160,7 +160,7 @@ struct rcu_random_state {
  * Crude but fast random-number generator.  Uses a linear congruential
  * generator, with occasional help from get_random_bytes().
  */
-static long
+static unsigned long
 rcu_random(struct rcu_random_state *rrsp)
 {
 	long refresh;
-- 
cgit v1.2.3


From b772e1dd4b1e60a7a160f7bd4ea08e28394ceb54 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:13 -0700
Subject: [PATCH] RCU: add fake writers to rcutorture

rcutorture currently has one writer and an arbitrary number of readers.  To
better exercise some of the code paths in RCU implementations, add fake
writer threads which call the synchronize function for the RCU variant in a
loop, with a delay between calls to arrange for different numbers of
writers running in parallel.

[bunk@stusta.de: cleanup]
Acked-by: Paul McKenney <paulmck@us.ibm.com>
Cc: Dipkanar Sarma <dipankar@in.ibm.com>
Signed-off-by: Josh Triplett <josh@freedesktop.org>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 104 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 43d6d4f9ef09..e0450210ce4d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -15,9 +15,10 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
- * Copyright (C) IBM Corporation, 2005
+ * Copyright (C) IBM Corporation, 2005, 2006
  *
  * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ *          Josh Triplett <josh@freedesktop.org>
  *
  * See also:  Documentation/RCU/torture.txt
  */
@@ -47,9 +48,11 @@
 #include <linux/srcu.h>
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
+              "Josh Triplett <josh@freedesktop.org>");
 
 static int nreaders = -1;	/* # reader threads, defaults to 2*ncpus */
+static int nfakewriters = 4;	/* # fake writer threads */
 static int stat_interval;	/* Interval between stats, in seconds. */
 				/*  Defaults to "only at end of test". */
 static int verbose;		/* Print more debug info. */
@@ -59,6 +62,8 @@ static char *torture_type = "rcu"; /* What to torture: rcu, rcu_bh, srcu. */
 
 module_param(nreaders, int, 0);
 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
+module_param(nfakewriters, int, 0);
+MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
 module_param(stat_interval, int, 0);
 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
 module_param(verbose, bool, 0);
@@ -82,6 +87,7 @@ static char printk_buf[4096];
 
 static int nrealreaders;
 static struct task_struct *writer_task;
+static struct task_struct **fakewriter_tasks;
 static struct task_struct **reader_tasks;
 static struct task_struct *stats_task;
 static struct task_struct *shuffler_task;
@@ -186,6 +192,7 @@ struct rcu_torture_ops {
 	void (*readunlock)(int idx);
 	int (*completed)(void);
 	void (*deferredfree)(struct rcu_torture *p);
+	void (*sync)(void);
 	int (*stats)(char *page);
 	char *name;
 };
@@ -258,6 +265,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.readunlock = rcu_torture_read_unlock,
 	.completed = rcu_torture_completed,
 	.deferredfree = rcu_torture_deferred_free,
+	.sync = synchronize_rcu,
 	.stats = NULL,
 	.name = "rcu"
 };
@@ -287,6 +295,28 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
 	call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
 }
 
+struct rcu_bh_torture_synchronize {
+	struct rcu_head head;
+	struct completion completion;
+};
+
+static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
+{
+	struct rcu_bh_torture_synchronize *rcu;
+
+	rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
+	complete(&rcu->completion);
+}
+
+static void rcu_bh_torture_synchronize(void)
+{
+	struct rcu_bh_torture_synchronize rcu;
+
+	init_completion(&rcu.completion);
+	call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
+	wait_for_completion(&rcu.completion);
+}
+
 static struct rcu_torture_ops rcu_bh_ops = {
 	.init = NULL,
 	.cleanup = NULL,
@@ -295,6 +325,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
 	.readunlock = rcu_bh_torture_read_unlock,
 	.completed = rcu_bh_torture_completed,
 	.deferredfree = rcu_bh_torture_deferred_free,
+	.sync = rcu_bh_torture_synchronize,
 	.stats = NULL,
 	.name = "rcu_bh"
 };
@@ -367,6 +398,11 @@ static void srcu_torture_deferred_free(struct rcu_torture *p)
 	}
 }
 
+static void srcu_torture_synchronize(void)
+{
+	synchronize_srcu(&srcu_ctl);
+}
+
 static int srcu_torture_stats(char *page)
 {
 	int cnt = 0;
@@ -392,6 +428,7 @@ static struct rcu_torture_ops srcu_ops = {
 	.readunlock = srcu_torture_read_unlock,
 	.completed = srcu_torture_completed,
 	.deferredfree = srcu_torture_deferred_free,
+	.sync = srcu_torture_synchronize,
 	.stats = srcu_torture_stats,
 	.name = "srcu"
 };
@@ -443,6 +480,30 @@ rcu_torture_writer(void *arg)
 	return 0;
 }
 
+/*
+ * RCU torture fake writer kthread.  Repeatedly calls sync, with a random
+ * delay between calls.
+ */
+static int
+rcu_torture_fakewriter(void *arg)
+{
+	DEFINE_RCU_RANDOM(rand);
+
+	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
+	set_user_nice(current, 19);
+
+	do {
+		schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
+		udelay(rcu_random(&rand) & 0x3ff);
+		cur_ops->sync();
+	} while (!kthread_should_stop() && !fullstop);
+
+	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	return 0;
+}
+
 /*
  * RCU torture reader kthread.  Repeatedly dereferences rcu_torture_current,
  * incrementing the corresponding element of the pipeline array.  The
@@ -621,6 +682,12 @@ static void rcu_torture_shuffle_tasks(void)
 				set_cpus_allowed(reader_tasks[i], tmp_mask);
 	}
 
+	if (fakewriter_tasks != NULL) {
+		for (i = 0; i < nfakewriters; i++)
+			if (fakewriter_tasks[i])
+				set_cpus_allowed(fakewriter_tasks[i], tmp_mask);
+	}
+
 	if (writer_task)
 		set_cpus_allowed(writer_task, tmp_mask);
 
@@ -654,11 +721,12 @@ rcu_torture_shuffle(void *arg)
 static inline void
 rcu_torture_print_module_parms(char *tag)
 {
-	printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
+	printk(KERN_ALERT "%s" TORTURE_FLAG
+		"--- %s: nreaders=%d nfakewriters=%d "
 		"stat_interval=%d verbose=%d test_no_idle_hz=%d "
 		"shuffle_interval = %d\n",
-		torture_type, tag, nrealreaders, stat_interval, verbose,
-		test_no_idle_hz, shuffle_interval);
+		torture_type, tag, nrealreaders, nfakewriters,
+		stat_interval, verbose, test_no_idle_hz, shuffle_interval);
 }
 
 static void
@@ -693,6 +761,19 @@ rcu_torture_cleanup(void)
 	}
 	rcu_torture_current = NULL;
 
+	if (fakewriter_tasks != NULL) {
+		for (i = 0; i < nfakewriters; i++) {
+			if (fakewriter_tasks[i] != NULL) {
+				VERBOSE_PRINTK_STRING(
+					"Stopping rcu_torture_fakewriter task");
+				kthread_stop(fakewriter_tasks[i]);
+			}
+			fakewriter_tasks[i] = NULL;
+		}
+		kfree(fakewriter_tasks);
+		fakewriter_tasks = NULL;
+	}
+
 	if (stats_task != NULL) {
 		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
 		kthread_stop(stats_task);
@@ -780,6 +861,24 @@ rcu_torture_init(void)
 		writer_task = NULL;
 		goto unwind;
 	}
+	fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
+	                           GFP_KERNEL);
+	if (fakewriter_tasks == NULL) {
+		VERBOSE_PRINTK_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nfakewriters; i++) {
+		VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
+		fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
+		                                  "rcu_torture_fakewriter");
+		if (IS_ERR(fakewriter_tasks[i])) {
+			firsterr = PTR_ERR(fakewriter_tasks[i]);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
+			fakewriter_tasks[i] = NULL;
+			goto unwind;
+		}
+	}
 	reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
 			       GFP_KERNEL);
 	if (reader_tasks == NULL) {
-- 
cgit v1.2.3


From e3033736581f125ba5fd6c0760e0d430d54fb5c0 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:14 -0700
Subject: [PATCH] rcu: refactor srcu_torture_deferred_free to work for any
 implementation

Make srcu_torture_deferred_free use cur_ops->sync() so it will work for any
implementation.  Move and rename it in preparation for use in the ops of other
implementations.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 53 +++++++++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e0450210ce4d..6e2f0a8344c2 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -117,6 +117,7 @@ static atomic_t n_rcu_torture_alloc_fail;
 static atomic_t n_rcu_torture_free;
 static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
+static struct list_head rcu_torture_removed;
 
 /*
  * Allocate an element from the rcu_tortures pool.
@@ -270,6 +271,32 @@ static struct rcu_torture_ops rcu_ops = {
 	.name = "rcu"
 };
 
+static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
+{
+	int i;
+	struct rcu_torture *rp;
+	struct rcu_torture *rp1;
+
+	cur_ops->sync();
+	list_add(&p->rtort_free, &rcu_torture_removed);
+	list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
+		i = rp->rtort_pipe_count;
+		if (i > RCU_TORTURE_PIPE_LEN)
+			i = RCU_TORTURE_PIPE_LEN;
+		atomic_inc(&rcu_torture_wcount[i]);
+		if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+			rp->rtort_mbtest = 0;
+			list_del(&rp->rtort_free);
+			rcu_torture_free(rp);
+		}
+	}
+}
+
+static void rcu_sync_torture_init(void)
+{
+	INIT_LIST_HEAD(&rcu_torture_removed);
+}
+
 /*
  * Definitions for rcu_bh torture testing.
  */
@@ -335,12 +362,11 @@ static struct rcu_torture_ops rcu_bh_ops = {
  */
 
 static struct srcu_struct srcu_ctl;
-static struct list_head srcu_removed;
 
 static void srcu_torture_init(void)
 {
 	init_srcu_struct(&srcu_ctl);
-	INIT_LIST_HEAD(&srcu_removed);
+	rcu_sync_torture_init();
 }
 
 static void srcu_torture_cleanup(void)
@@ -377,27 +403,6 @@ static int srcu_torture_completed(void)
 	return srcu_batches_completed(&srcu_ctl);
 }
 
-static void srcu_torture_deferred_free(struct rcu_torture *p)
-{
-	int i;
-	struct rcu_torture *rp;
-	struct rcu_torture *rp1;
-
-	synchronize_srcu(&srcu_ctl);
-	list_add(&p->rtort_free, &srcu_removed);
-	list_for_each_entry_safe(rp, rp1, &srcu_removed, rtort_free) {
-		i = rp->rtort_pipe_count;
-		if (i > RCU_TORTURE_PIPE_LEN)
-			i = RCU_TORTURE_PIPE_LEN;
-		atomic_inc(&rcu_torture_wcount[i]);
-		if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
-			rp->rtort_mbtest = 0;
-			list_del(&rp->rtort_free);
-			rcu_torture_free(rp);
-		}
-	}
-}
-
 static void srcu_torture_synchronize(void)
 {
 	synchronize_srcu(&srcu_ctl);
@@ -427,7 +432,7 @@ static struct rcu_torture_ops srcu_ops = {
 	.readdelay = srcu_read_delay,
 	.readunlock = srcu_torture_read_unlock,
 	.completed = srcu_torture_completed,
-	.deferredfree = srcu_torture_deferred_free,
+	.deferredfree = rcu_sync_torture_deferred_free,
 	.sync = srcu_torture_synchronize,
 	.stats = srcu_torture_stats,
 	.name = "srcu"
-- 
cgit v1.2.3


From 20d2e4283a97665a3db78c60dfa342a0c7c1b180 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:15 -0700
Subject: [PATCH] rcu: add rcu_sync torture type to rcutorture

Use the newly-generic synchronous deferred free function to implement torture
testing for RCU using synchronize_rcu rather than the asynchronous call_rcu.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6e2f0a8344c2..1c329df60bcc 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -58,7 +58,7 @@ static int stat_interval;	/* Interval between stats, in seconds. */
 static int verbose;		/* Print more debug info. */
 static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
-static char *torture_type = "rcu"; /* What to torture: rcu, rcu_bh, srcu. */
+static char *torture_type = "rcu"; /* What RCU implementation to torture. */
 
 module_param(nreaders, int, 0);
 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -297,6 +297,19 @@ static void rcu_sync_torture_init(void)
 	INIT_LIST_HEAD(&rcu_torture_removed);
 }
 
+static struct rcu_torture_ops rcu_sync_ops = {
+	.init = rcu_sync_torture_init,
+	.cleanup = NULL,
+	.readlock = rcu_torture_read_lock,
+	.readdelay = rcu_read_delay,
+	.readunlock = rcu_torture_read_unlock,
+	.completed = rcu_torture_completed,
+	.deferredfree = rcu_sync_torture_deferred_free,
+	.sync = synchronize_rcu,
+	.stats = NULL,
+	.name = "rcu_sync"
+};
+
 /*
  * Definitions for rcu_bh torture testing.
  */
@@ -439,7 +452,7 @@ static struct rcu_torture_ops srcu_ops = {
 };
 
 static struct rcu_torture_ops *torture_ops[] =
-	{ &rcu_ops, &rcu_bh_ops, &srcu_ops, NULL };
+	{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &srcu_ops, NULL };
 
 /*
  * RCU torture writer kthread.  Repeatedly substitutes a new structure
-- 
cgit v1.2.3


From 11a147013e39ff4cb031395cb78a9d307c4799cd Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:16 -0700
Subject: [PATCH] rcu: add rcu_bh_sync torture type to rcutorture

Use the newly-generic synchronous deferred free function to implement torture
testing for rcu_bh using synchronize_rcu_bh rather than the asynchronous
call_rcu_bh.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 1c329df60bcc..0f0ff1556b5d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -370,6 +370,19 @@ static struct rcu_torture_ops rcu_bh_ops = {
 	.name = "rcu_bh"
 };
 
+static struct rcu_torture_ops rcu_bh_sync_ops = {
+	.init = rcu_sync_torture_init,
+	.cleanup = NULL,
+	.readlock = rcu_bh_torture_read_lock,
+	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock = rcu_bh_torture_read_unlock,
+	.completed = rcu_bh_torture_completed,
+	.deferredfree = rcu_sync_torture_deferred_free,
+	.sync = rcu_bh_torture_synchronize,
+	.stats = NULL,
+	.name = "rcu_bh_sync"
+};
+
 /*
  * Definitions for srcu torture testing.
  */
@@ -452,7 +465,8 @@ static struct rcu_torture_ops srcu_ops = {
 };
 
 static struct rcu_torture_ops *torture_ops[] =
-	{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &srcu_ops, NULL };
+	{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops,
+	  NULL };
 
 /*
  * RCU torture writer kthread.  Repeatedly substitutes a new structure
-- 
cgit v1.2.3


From 4b6c2cca6eef9cc4a15350bf1c61839e12e08b84 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Wed, 4 Oct 2006 02:17:16 -0700
Subject: [PATCH] rcu: add sched torture type to rcutorture

Implement torture testing for the "sched" variant of RCU, which uses
preempt_disable, preempt_enable, and synchronize_sched.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 0f0ff1556b5d..e2bda18f6f42 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -464,9 +464,47 @@ static struct rcu_torture_ops srcu_ops = {
 	.name = "srcu"
 };
 
+/*
+ * Definitions for sched torture testing.
+ */
+
+static int sched_torture_read_lock(void)
+{
+	preempt_disable();
+	return 0;
+}
+
+static void sched_torture_read_unlock(int idx)
+{
+	preempt_enable();
+}
+
+static int sched_torture_completed(void)
+{
+	return 0;
+}
+
+static void sched_torture_synchronize(void)
+{
+	synchronize_sched();
+}
+
+static struct rcu_torture_ops sched_ops = {
+	.init = rcu_sync_torture_init,
+	.cleanup = NULL,
+	.readlock = sched_torture_read_lock,
+	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock = sched_torture_read_unlock,
+	.completed = sched_torture_completed,
+	.deferredfree = rcu_sync_torture_deferred_free,
+	.sync = sched_torture_synchronize,
+	.stats = NULL,
+	.name = "sched"
+};
+
 static struct rcu_torture_ops *torture_ops[] =
 	{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops,
-	  NULL };
+	  &sched_ops, NULL };
 
 /*
  * RCU torture writer kthread.  Repeatedly substitutes a new structure
-- 
cgit v1.2.3


From 20e9751bd9dd6b832fd84ada27840360f7e877f1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 4 Oct 2006 02:17:17 -0700
Subject: [PATCH] rcu: simplify/improve batch tuning

Kill a hard-to-calculate 'rsinterval' boot parameter and per-cpu
rcu_data.last_rs_qlen.  Instead, it adds adds a flag rcu_ctrlblk.signaled,
which records the fact that one of CPUs has sent a resched IPI since the
last rcu_start_batch().

Roughly speaking, we need two rcu_start_batch()s in order to move callbacks
from ->nxtlist to ->donelist.  This means that when ->qlen exceeds qhimark
and continues to grow, we should send a resched IPI, and then do it again
after we gone through a quiescent state.

On the other hand, if it was already sent, we don't need to do it again
when another CPU detects overflow of the queue.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcupdate.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 523e46483b99..26bb5ffe1ef1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -71,9 +71,6 @@ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
 static int blimit = 10;
 static int qhimark = 10000;
 static int qlowmark = 100;
-#ifdef CONFIG_SMP
-static int rsinterval = 1000;
-#endif
 
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -86,8 +83,8 @@ static void force_quiescent_state(struct rcu_data *rdp,
 	int cpu;
 	cpumask_t cpumask;
 	set_need_resched();
-	if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
-		rdp->last_rs_qlen = rdp->qlen;
+	if (unlikely(!rcp->signaled)) {
+		rcp->signaled = 1;
 		/*
 		 * Don't send IPI to itself. With irqs disabled,
 		 * rdp->cpu is the current cpu.
@@ -301,6 +298,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 		smp_mb();
 		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
 
+		rcp->signaled = 0;
 	}
 }
 
@@ -628,9 +626,6 @@ void synchronize_rcu(void)
 module_param(blimit, int, 0);
 module_param(qhimark, int, 0);
 module_param(qlowmark, int, 0);
-#ifdef CONFIG_SMP
-module_param(rsinterval, int, 0);
-#endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 EXPORT_SYMBOL_GPL(call_rcu);
-- 
cgit v1.2.3


From 57a58a9435aef3e0342ba4b2c97e0ddfea6f2c7f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 5 Oct 2006 13:06:34 +0100
Subject: IRQ: Typedef the IRQ flow handler function type

Typedef the IRQ flow handler function type.

Signed-Off-By: David Howells <dhowells@redhat.com>
(cherry picked from 8e973fbdf5716b93a0a8c0365be33a31ca0fa351 commit)
---
 kernel/irq/chip.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 4cf65f5c6a74..53e9dce6c657 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -505,10 +505,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
 #endif /* CONFIG_SMP */
 
 void
-__set_irq_handler(unsigned int irq,
-		  void fastcall (*handle)(unsigned int, irq_desc_t *,
-					  struct pt_regs *),
-		  int is_chained)
+__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained)
 {
 	struct irq_desc *desc;
 	unsigned long flags;
@@ -561,9 +558,7 @@ __set_irq_handler(unsigned int irq,
 
 void
 set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
-			 void fastcall (*handle)(unsigned int,
-						 struct irq_desc *,
-						 struct pt_regs *))
+			 irq_flow_handler_t handle)
 {
 	set_irq_chip(irq, chip);
 	__set_irq_handler(irq, handle, 0);
@@ -574,8 +569,7 @@ set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
  * /proc/interrupts output:
  */
 const char *
-handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
-					struct pt_regs *))
+handle_irq_name(irq_flow_handler_t handle)
 {
 	if (handle == handle_level_irq)
 		return "level  ";
-- 
cgit v1.2.3


From da482792a6d1a3fbaaa25fae867b343fb4db3246 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 5 Oct 2006 13:06:34 +0100
Subject: IRQ: Typedef the IRQ handler function type

Typedef the IRQ handler function type.

Signed-Off-By: David Howells <dhowells@redhat.com>
(cherry picked from 1356d1e5fd256997e3d3dce0777ab787d0515c7a commit)
---
 kernel/irq/manage.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 92be519eff26..6879202afe9a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -427,8 +427,7 @@ EXPORT_SYMBOL(free_irq);
  *	IRQF_SAMPLE_RANDOM	The interrupt can be used for entropy
  *
  */
-int request_irq(unsigned int irq,
-		irqreturn_t (*handler)(int, void *, struct pt_regs *),
+int request_irq(unsigned int irq, irq_handler_t handler,
 		unsigned long irqflags, const char *devname, void *dev_id)
 {
 	struct irqaction *action;
@@ -475,4 +474,3 @@ int request_irq(unsigned int irq,
 	return retval;
 }
 EXPORT_SYMBOL(request_irq);
-
-- 
cgit v1.2.3


From 7d12e780e003f93433d49ce78cfedf4b4c52adc5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 5 Oct 2006 14:55:46 +0100
Subject: IRQ: Maintain regs pointer globally rather than passing to IRQ
 handlers

Maintain a per-CPU global "struct pt_regs *" variable which can be used instead
of passing regs around manually through all ~1800 interrupt handlers in the
Linux kernel.

The regs pointer is used in few places, but it potentially costs both stack
space and code to pass it around.  On the FRV arch, removing the regs parameter
from all the genirq function results in a 20% speed up of the IRQ exit path
(ie: from leaving timer_interrupt() to leaving do_IRQ()).

Where appropriate, an arch may override the generic storage facility and do
something different with the variable.  On FRV, for instance, the address is
maintained in GR28 at all times inside the kernel as part of general exception
handling.

Having looked over the code, it appears that the parameter may be handed down
through up to twenty or so layers of functions.  Consider a USB character
device attached to a USB hub, attached to a USB controller that posts its
interrupts through a cascaded auxiliary interrupt controller.  A character
device driver may want to pass regs to the sysrq handler through the input
layer which adds another few layers of parameter passing.

I've build this code with allyesconfig for x86_64 and i386.  I've runtested the
main part of the code on FRV and i386, though I can't test most of the drivers.
I've also done partial conversion for powerpc and MIPS - these at least compile
with minimal configurations.

This will affect all archs.  Mostly the changes should be relatively easy.
Take do_IRQ(), store the regs pointer at the beginning, saving the old one:

	struct pt_regs *old_regs = set_irq_regs(regs);

And put the old one back at the end:

	set_irq_regs(old_regs);

Don't pass regs through to generic_handle_irq() or __do_IRQ().

In timer_interrupt(), this sort of change will be necessary:

	-	update_process_times(user_mode(regs));
	-	profile_tick(CPU_PROFILING, regs);
	+	update_process_times(user_mode(get_irq_regs()));
	+	profile_tick(CPU_PROFILING);

I'd like to move update_process_times()'s use of get_irq_regs() into itself,
except that i386, alone of the archs, uses something other than user_mode().

Some notes on the interrupt handling in the drivers:

 (*) input_dev() is now gone entirely.  The regs pointer is no longer stored in
     the input_dev struct.

 (*) finish_unlinks() in drivers/usb/host/ohci-q.c needs checking.  It does
     something different depending on whether it's been supplied with a regs
     pointer or not.

 (*) Various IRQ handler function pointers have been moved to type
     irq_handler_t.

Signed-Off-By: David Howells <dhowells@redhat.com>
(cherry picked from 1b16e7ac850969f38b375e511e3fa2f474a33867 commit)
---
 kernel/irq/chip.c       | 36 +++++++++++++++---------------------
 kernel/irq/handle.c     | 19 ++++++++-----------
 kernel/irq/spurious.c   | 10 +++++-----
 kernel/power/poweroff.c |  3 +--
 kernel/profile.c        |  5 ++++-
 5 files changed, 33 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 53e9dce6c657..11c99697acfe 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -249,7 +249,6 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
  *	handle_simple_irq - Simple and software-decoded IRQs.
  *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
- *	@regs:	pointer to a register structure
  *
  *	Simple interrupts are either sent from a demultiplexing interrupt
  *	handler or come from hardware, where no interrupt hardware control
@@ -259,7 +258,7 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
  *	unmask issues if necessary.
  */
 void fastcall
-handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 {
 	struct irqaction *action;
 	irqreturn_t action_ret;
@@ -279,9 +278,9 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
 	desc->status |= IRQ_INPROGRESS;
 	spin_unlock(&desc->lock);
 
-	action_ret = handle_IRQ_event(irq, regs, action);
+	action_ret = handle_IRQ_event(irq, action);
 	if (!noirqdebug)
-		note_interrupt(irq, desc, action_ret, regs);
+		note_interrupt(irq, desc, action_ret);
 
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
@@ -293,7 +292,6 @@ out_unlock:
  *	handle_level_irq - Level type irq handler
  *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
- *	@regs:	pointer to a register structure
  *
  *	Level type interrupts are active as long as the hardware line has
  *	the active level. This may require to mask the interrupt and unmask
@@ -301,7 +299,7 @@ out_unlock:
  *	interrupt line is back to inactive.
  */
 void fastcall
-handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+handle_level_irq(unsigned int irq, struct irq_desc *desc)
 {
 	unsigned int cpu = smp_processor_id();
 	struct irqaction *action;
@@ -329,9 +327,9 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
 	desc->status &= ~IRQ_PENDING;
 	spin_unlock(&desc->lock);
 
-	action_ret = handle_IRQ_event(irq, regs, action);
+	action_ret = handle_IRQ_event(irq, action);
 	if (!noirqdebug)
-		note_interrupt(irq, desc, action_ret, regs);
+		note_interrupt(irq, desc, action_ret);
 
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
@@ -345,7 +343,6 @@ out_unlock:
  *	handle_fasteoi_irq - irq handler for transparent controllers
  *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
- *	@regs:	pointer to a register structure
  *
  *	Only a single callback will be issued to the chip: an ->eoi()
  *	call when the interrupt has been serviced. This enables support
@@ -353,8 +350,7 @@ out_unlock:
  *	details in hardware, transparently.
  */
 void fastcall
-handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
-		   struct pt_regs *regs)
+handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 {
 	unsigned int cpu = smp_processor_id();
 	struct irqaction *action;
@@ -382,9 +378,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
 	desc->status &= ~IRQ_PENDING;
 	spin_unlock(&desc->lock);
 
-	action_ret = handle_IRQ_event(irq, regs, action);
+	action_ret = handle_IRQ_event(irq, action);
 	if (!noirqdebug)
-		note_interrupt(irq, desc, action_ret, regs);
+		note_interrupt(irq, desc, action_ret);
 
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
@@ -398,7 +394,6 @@ out:
  *	handle_edge_irq - edge type IRQ handler
  *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
- *	@regs:	pointer to a register structure
  *
  *	Interrupt occures on the falling and/or rising edge of a hardware
  *	signal. The occurence is latched into the irq controller hardware
@@ -412,7 +407,7 @@ out:
  *	loop is left.
  */
 void fastcall
-handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 {
 	const unsigned int cpu = smp_processor_id();
 
@@ -463,9 +458,9 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
 
 		desc->status &= ~IRQ_PENDING;
 		spin_unlock(&desc->lock);
-		action_ret = handle_IRQ_event(irq, regs, action);
+		action_ret = handle_IRQ_event(irq, action);
 		if (!noirqdebug)
-			note_interrupt(irq, desc, action_ret, regs);
+			note_interrupt(irq, desc, action_ret);
 		spin_lock(&desc->lock);
 
 	} while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
@@ -480,12 +475,11 @@ out_unlock:
  *	handle_percpu_IRQ - Per CPU local irq handler
  *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
- *	@regs:	pointer to a register structure
  *
  *	Per CPU interrupts on SMP machines without locking requirements
  */
 void fastcall
-handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 {
 	irqreturn_t action_ret;
 
@@ -494,9 +488,9 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
 	if (desc->chip->ack)
 		desc->chip->ack(irq);
 
-	action_ret = handle_IRQ_event(irq, regs, desc->action);
+	action_ret = handle_IRQ_event(irq, desc->action);
 	if (!noirqdebug)
-		note_interrupt(irq, desc, action_ret, regs);
+		note_interrupt(irq, desc, action_ret);
 
 	if (desc->chip->eoi)
 		desc->chip->eoi(irq);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 4c6cdbaed661..42aa6f1a3f0f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -27,7 +27,7 @@
  * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
  */
 void fastcall
-handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 {
 	print_irq_desc(irq, desc);
 	kstat_this_cpu.irqs[irq]++;
@@ -115,7 +115,7 @@ struct irq_chip dummy_irq_chip = {
 /*
  * Special, empty irq handler:
  */
-irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
+irqreturn_t no_action(int cpl, void *dev_id)
 {
 	return IRQ_NONE;
 }
@@ -123,13 +123,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
 /**
  * handle_IRQ_event - irq action chain handler
  * @irq:	the interrupt number
- * @regs:	pointer to a register structure
  * @action:	the interrupt action chain for this irq
  *
  * Handles the action chain of an irq event
  */
-irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
-			     struct irqaction *action)
+irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 {
 	irqreturn_t ret, retval = IRQ_NONE;
 	unsigned int status = 0;
@@ -140,7 +138,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 		local_irq_enable_in_hardirq();
 
 	do {
-		ret = action->handler(irq, action->dev_id, regs);
+		ret = action->handler(irq, action->dev_id);
 		if (ret == IRQ_HANDLED)
 			status |= action->flags;
 		retval |= ret;
@@ -158,7 +156,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 /**
  * __do_IRQ - original all in one highlevel IRQ handler
  * @irq:	the interrupt number
- * @regs:	pointer to a register structure
  *
  * __do_IRQ handles all normal device IRQ's (the special
  * SMP cross-CPU interrupts have their own specific
@@ -167,7 +164,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
  * This is the original x86 implementation which is used for every
  * interrupt type.
  */
-fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
+fastcall unsigned int __do_IRQ(unsigned int irq)
 {
 	struct irq_desc *desc = irq_desc + irq;
 	struct irqaction *action;
@@ -182,7 +179,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 		 */
 		if (desc->chip->ack)
 			desc->chip->ack(irq);
-		action_ret = handle_IRQ_event(irq, regs, desc->action);
+		action_ret = handle_IRQ_event(irq, desc->action);
 		desc->chip->end(irq);
 		return 1;
 	}
@@ -233,11 +230,11 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 
 		spin_unlock(&desc->lock);
 
-		action_ret = handle_IRQ_event(irq, regs, action);
+		action_ret = handle_IRQ_event(irq, action);
 
 		spin_lock(&desc->lock);
 		if (!noirqdebug)
-			note_interrupt(irq, desc, action_ret, regs);
+			note_interrupt(irq, desc, action_ret);
 		if (likely(!(desc->status & IRQ_PENDING)))
 			break;
 		desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 417e98092cf2..543ea2e5ad93 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -16,7 +16,7 @@ static int irqfixup __read_mostly;
 /*
  * Recovery handler for misrouted interrupts.
  */
-static int misrouted_irq(int irq, struct pt_regs *regs)
+static int misrouted_irq(int irq)
 {
 	int i;
 	int ok = 0;
@@ -49,7 +49,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
 		while (action) {
 			/* Only shared IRQ handlers are safe to call */
 			if (action->flags & IRQF_SHARED) {
-				if (action->handler(i, action->dev_id, regs) ==
+				if (action->handler(i, action->dev_id) ==
 						IRQ_HANDLED)
 					ok = 1;
 			}
@@ -70,7 +70,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
 			 */
 			work = 1;
 			spin_unlock(&desc->lock);
-			handle_IRQ_event(i, regs, action);
+			handle_IRQ_event(i, action);
 			spin_lock(&desc->lock);
 			desc->status &= ~IRQ_PENDING;
 		}
@@ -136,7 +136,7 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
 }
 
 void note_interrupt(unsigned int irq, struct irq_desc *desc,
-		    irqreturn_t action_ret, struct pt_regs *regs)
+		    irqreturn_t action_ret)
 {
 	if (unlikely(action_ret != IRQ_HANDLED)) {
 		desc->irqs_unhandled++;
@@ -147,7 +147,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 	if (unlikely(irqfixup)) {
 		/* Don't punish working computers */
 		if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
-			int ok = misrouted_irq(irq, regs);
+			int ok = misrouted_irq(irq);
 			if (action_ret == IRQ_NONE)
 				desc->irqs_unhandled -= ok;
 		}
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 7a4144ba3afd..f1f900ac3164 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -23,8 +23,7 @@ static void do_poweroff(void *dummy)
 
 static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
 
-static void handle_poweroff(int key, struct pt_regs *pt_regs,
-				struct tty_struct *tty)
+static void handle_poweroff(int key, struct tty_struct *tty)
 {
 	schedule_work(&poweroff_work);
 }
diff --git a/kernel/profile.c b/kernel/profile.c
index fb660c7d35ba..857300a2afec 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -25,6 +25,7 @@
 #include <linux/mutex.h>
 #include <asm/sections.h>
 #include <asm/semaphore.h>
+#include <asm/irq_regs.h>
 
 struct profile_hit {
 	u32 pc, hits;
@@ -366,8 +367,10 @@ void profile_hit(int type, void *__pc)
 }
 #endif /* !CONFIG_SMP */
 
-void profile_tick(int type, struct pt_regs *regs)
+void profile_tick(int type)
 {
+	struct pt_regs *regs = get_irq_regs();
+
 	if (type == CPU_PROFILING && timer_hook)
 		timer_hook(regs);
 	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
-- 
cgit v1.2.3


From 4899b8b16b302299cc91289f7b5bac295e9ab387 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 6 Oct 2006 00:43:48 -0700
Subject: [PATCH] kauditd_thread warning fix

Squash this warning:

  kernel/audit.c: In function 'kauditd_thread':
  kernel/audit.c:367: warning: no return statement in function returning non-void

We might as test kthread_should_stop(), although it's not very pointful at
present.

The code which starts this thread looks racy - the kernel could start multiple
threads.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/audit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index f9889ee77825..98106f6078b0 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -340,7 +340,7 @@ static int kauditd_thread(void *dummy)
 {
 	struct sk_buff *skb;
 
-	while (1) {
+	while (!kthread_should_stop()) {
 		skb = skb_dequeue(&audit_skb_queue);
 		wake_up(&audit_backlog_wait);
 		if (skb) {
@@ -369,6 +369,7 @@ static int kauditd_thread(void *dummy)
 			remove_wait_queue(&kauditd_wait, &wait);
 		}
 	}
+	return 0;
 }
 
 int audit_send_list(void *_dest)
-- 
cgit v1.2.3


From e317c8ccaaf900abf39cc3240e4dc5ba82a3cc67 Mon Sep 17 00:00:00 2001
From: Frederik Deweerdt <deweerdt@free.fr>
Date: Fri, 6 Oct 2006 18:58:24 +0000
Subject: [PATCH] ixp4xxdefconfig arm fixes

With the following patch, the ixp4xxdefconfig builds correctly.  I'll
test some more configs if I get some time.

Signed-off-by: Frederik Deweerdt <frederik.deweerdt@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/resend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 35f10f7ff94a..5bfeaed7e487 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg)
 		clear_bit(irq, irqs_resend);
 		desc = irq_desc + irq;
 		local_irq_disable();
-		desc->handle_irq(irq, desc, NULL);
+		desc->handle_irq(irq, desc);
 		local_irq_enable();
 	}
 }
-- 
cgit v1.2.3


From 5c339d4541995df2fd3ca31a84c042e7afe9b3c1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Oct 2006 22:19:44 -0700
Subject: [PATCH] swsusp: Make userland suspend work on SMP again

Unfortunately one of the recent changes in swsusp has broken the userland
suspend on SMP.  Fix it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index 72825c853cd7..93b5dd283dea 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -145,10 +145,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			error = freeze_processes();
 			if (error) {
 				thaw_processes();
+				enable_nonboot_cpus();
 				error = -EBUSY;
 			}
 		}
-		enable_nonboot_cpus();
 		up(&pm_sem);
 		if (!error)
 			data->frozen = 1;
-- 
cgit v1.2.3


From ba46df984b8e8114c3cf19c51670fab084bd4196 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Tue, 10 Oct 2006 22:46:07 +0100
Subject: [PATCH] __user annotations: futex

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c        | 15 ++++++++-------
 kernel/futex_compat.c | 12 ++++++------
 2 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 4aaf91951a43..b364e0026191 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1612,10 +1612,10 @@ sys_set_robust_list(struct robust_list_head __user *head,
  * @len_ptr: pointer to a length field, the kernel fills in the header size
  */
 asmlinkage long
-sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
 		    size_t __user *len_ptr)
 {
-	struct robust_list_head *head;
+	struct robust_list_head __user *head;
 	unsigned long ret;
 
 	if (!pid)
@@ -1694,14 +1694,15 @@ retry:
  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
  */
 static inline int fetch_robust_entry(struct robust_list __user **entry,
-				     struct robust_list __user **head, int *pi)
+				     struct robust_list __user * __user *head,
+				     int *pi)
 {
 	unsigned long uentry;
 
-	if (get_user(uentry, (unsigned long *)head))
+	if (get_user(uentry, (unsigned long __user *)head))
 		return -EFAULT;
 
-	*entry = (void *)(uentry & ~1UL);
+	*entry = (void __user *)(uentry & ~1UL);
 	*pi = uentry & 1;
 
 	return 0;
@@ -1739,7 +1740,7 @@ void exit_robust_list(struct task_struct *curr)
 		return;
 
 	if (pending)
-		handle_futex_death((void *)pending + futex_offset, curr, pip);
+		handle_futex_death((void __user *)pending + futex_offset, curr, pip);
 
 	while (entry != &head->list) {
 		/*
@@ -1747,7 +1748,7 @@ void exit_robust_list(struct task_struct *curr)
 		 * don't process it twice:
 		 */
 		if (entry != pending)
-			if (handle_futex_death((void *)entry + futex_offset,
+			if (handle_futex_death((void __user *)entry + futex_offset,
 						curr, pi))
 				return;
 		/*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index c5cca3f65cb7..50f24eea6cd0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -18,7 +18,7 @@
  */
 static inline int
 fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
-		   compat_uptr_t *head, int *pi)
+		   compat_uptr_t __user *head, int *pi)
 {
 	if (get_user(*uentry, head))
 		return -EFAULT;
@@ -62,7 +62,7 @@ void compat_exit_robust_list(struct task_struct *curr)
 			       &head->list_op_pending, &pip))
 		return;
 	if (upending)
-		handle_futex_death((void *)pending + futex_offset, curr, pip);
+		handle_futex_death((void __user *)pending + futex_offset, curr, pip);
 
 	while (compat_ptr(uentry) != &head->list) {
 		/*
@@ -70,7 +70,7 @@ void compat_exit_robust_list(struct task_struct *curr)
 		 * dont process it twice:
 		 */
 		if (entry != pending)
-			if (handle_futex_death((void *)entry + futex_offset,
+			if (handle_futex_death((void __user *)entry + futex_offset,
 						curr, pi))
 				return;
 
@@ -78,7 +78,7 @@ void compat_exit_robust_list(struct task_struct *curr)
 		 * Fetch the next entry in the list:
 		 */
 		if (fetch_robust_entry(&uentry, &entry,
-				       (compat_uptr_t *)&entry->next, &pi))
+				       (compat_uptr_t __user *)&entry->next, &pi))
 			return;
 		/*
 		 * Avoid excessively long or circular lists:
@@ -103,10 +103,10 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
 }
 
 asmlinkage long
-compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
+compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 			   compat_size_t __user *len_ptr)
 {
-	struct compat_robust_list_head *head;
+	struct compat_robust_list_head __user *head;
 	unsigned long ret;
 
 	if (!pid)
-- 
cgit v1.2.3


From ba2397efe10ba728c7ff22b179e218c292227c13 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Tue, 10 Oct 2006 22:46:57 +0100
Subject: [PATCH] make kernel/relay.c __user-clean

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/relay.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 1d63ecddfa70..f04bbdb56ac2 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -887,7 +887,7 @@ static int subbuf_read_actor(size_t read_start,
 
 	from = buf->start + read_start;
 	ret = avail;
-	if (copy_to_user(desc->arg.data, from, avail)) {
+	if (copy_to_user(desc->arg.buf, from, avail)) {
 		desc->error = -EFAULT;
 		ret = 0;
 	}
@@ -946,24 +946,17 @@ typedef int (*subbuf_actor_t) (size_t read_start,
  */
 static inline ssize_t relay_file_read_subbufs(struct file *filp,
 					      loff_t *ppos,
-					      size_t count,
 					      subbuf_actor_t subbuf_actor,
 					      read_actor_t actor,
-					      void *target)
+					      read_descriptor_t *desc)
 {
 	struct rchan_buf *buf = filp->private_data;
 	size_t read_start, avail;
-	read_descriptor_t desc;
 	int ret;
 
-	if (!count)
+	if (!desc->count)
 		return 0;
 
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-
 	mutex_lock(&filp->f_dentry->d_inode->i_mutex);
 	do {
 		if (!relay_file_read_avail(buf, *ppos))
@@ -974,19 +967,19 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
 		if (!avail)
 			break;
 
-		avail = min(desc.count, avail);
-		ret = subbuf_actor(read_start, buf, avail, &desc, actor);
-		if (desc.error < 0)
+		avail = min(desc->count, avail);
+		ret = subbuf_actor(read_start, buf, avail, desc, actor);
+		if (desc->error < 0)
 			break;
 
 		if (ret) {
 			relay_file_read_consume(buf, read_start, ret);
 			*ppos = relay_file_read_end_pos(buf, read_start, ret);
 		}
-	} while (desc.count && ret);
+	} while (desc->count && ret);
 	mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
 
-	return desc.written;
+	return desc->written;
 }
 
 static ssize_t relay_file_read(struct file *filp,
@@ -994,8 +987,13 @@ static ssize_t relay_file_read(struct file *filp,
 			       size_t count,
 			       loff_t *ppos)
 {
-	return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor,
-				       NULL, buffer);
+	read_descriptor_t desc;
+	desc.written = 0;
+	desc.count = count;
+	desc.arg.buf = buffer;
+	desc.error = 0;
+	return relay_file_read_subbufs(filp, ppos, subbuf_read_actor,
+				       NULL, &desc);
 }
 
 static ssize_t relay_file_sendfile(struct file *filp,
@@ -1004,8 +1002,13 @@ static ssize_t relay_file_sendfile(struct file *filp,
 				   read_actor_t actor,
 				   void *target)
 {
-	return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor,
-				       actor, target);
+	read_descriptor_t desc;
+	desc.written = 0;
+	desc.count = count;
+	desc.arg.data = target;
+	desc.error = 0;
+	return relay_file_read_subbufs(filp, ppos, subbuf_send_actor,
+				       actor, &desc);
 }
 
 struct file_operations relay_file_operations = {
-- 
cgit v1.2.3


From 1af9892811ad3bf2abb31566bfb2ec0b5070a66a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Tue, 10 Oct 2006 22:48:57 +0100
Subject: [PATCH] cpuset ANSI prototype

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9d850ae13b1b..6313c38c930e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2137,7 +2137,7 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
  * See also the previous routine cpuset_handle_cpuhp().
  */
 
-void cpuset_track_online_nodes()
+void cpuset_track_online_nodes(void)
 {
 	common_cpu_mem_hotplug_unplug();
 }
-- 
cgit v1.2.3


From 4dfbb9d8c6cbfc32faa5c71145bd2a43e1f8237c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 11 Oct 2006 01:45:14 -0400
Subject: Lockdep: add lockdep_set_class_and_subclass() and
 lockdep_set_subclass()

This annotation makes it possible to assign a subclass on lock init. This
annotation is meant to reduce the _nested() annotations by assigning a
default subclass.

One could do without this annotation and rely on lockdep_set_class()
exclusively, but that would require a manual stack of struct lock_class_key
objects.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 kernel/lockdep.c     | 10 ++++++----
 kernel/mutex-debug.c |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 4c0553461000..ba7156ac70c1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1177,7 +1177,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
  * itself, so actual lookup of the hash should be once per lock object.
  */
 static inline struct lock_class *
-register_lock_class(struct lockdep_map *lock, unsigned int subclass)
+register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 {
 	struct lockdep_subclass_key *key;
 	struct list_head *hash_head;
@@ -1249,7 +1249,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
 out_unlock_set:
 	__raw_spin_unlock(&hash_lock);
 
-	if (!subclass)
+	if (!subclass || force)
 		lock->class_cache = class;
 
 	DEBUG_LOCKS_WARN_ON(class->subclass != subclass);
@@ -1937,7 +1937,7 @@ void trace_softirqs_off(unsigned long ip)
  * Initialize a lock instance's lock-class mapping info:
  */
 void lockdep_init_map(struct lockdep_map *lock, const char *name,
-		      struct lock_class_key *key)
+		      struct lock_class_key *key, int subclass)
 {
 	if (unlikely(!debug_locks))
 		return;
@@ -1957,6 +1957,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	lock->name = name;
 	lock->key = key;
 	lock->class_cache = NULL;
+	if (subclass)
+		register_lock_class(lock, subclass, 1);
 }
 
 EXPORT_SYMBOL_GPL(lockdep_init_map);
@@ -1995,7 +1997,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	 * Not cached yet or subclass?
 	 */
 	if (unlikely(!class)) {
-		class = register_lock_class(lock, subclass);
+		class = register_lock_class(lock, subclass, 0);
 		if (!class)
 			return 0;
 	}
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index e3203c654dda..18651641a7b5 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -91,7 +91,7 @@ void debug_mutex_init(struct mutex *lock, const char *name,
 	 * Make sure we are not reinitializing a held lock:
 	 */
 	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
-	lockdep_init_map(&lock->dep_map, name, key);
+	lockdep_init_map(&lock->dep_map, name, key, 0);
 #endif
 	lock->owner = NULL;
 	lock->magic = lock;
-- 
cgit v1.2.3


From 97c7801cd5b0bb6a38c16108a496235474dc6310 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 11 Oct 2006 01:20:45 -0700
Subject: [PATCH] swsusp: Use suspend_console

Add suspend_console() and resume_console() to the suspend-to-disk code paths
so that the users of netconsole can use swsusp with it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c | 8 ++++++++
 kernel/power/user.c | 8 +++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d72234942798..d3a158a60312 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/pm.h>
+#include <linux/console.h>
 #include <linux/cpu.h>
 
 #include "power.h"
@@ -119,8 +120,10 @@ int pm_suspend_disk(void)
 	if (error)
 		return error;
 
+	suspend_console();
 	error = device_suspend(PMSG_FREEZE);
 	if (error) {
+		resume_console();
 		printk("Some devices failed to suspend\n");
 		unprepare_processes();
 		return error;
@@ -133,6 +136,7 @@ int pm_suspend_disk(void)
 
 	if (in_suspend) {
 		device_resume();
+		resume_console();
 		pr_debug("PM: writing image.\n");
 		error = swsusp_write();
 		if (!error)
@@ -148,6 +152,7 @@ int pm_suspend_disk(void)
 	swsusp_free();
  Done:
 	device_resume();
+	resume_console();
 	unprepare_processes();
 	return error;
 }
@@ -212,7 +217,9 @@ static int software_resume(void)
 
 	pr_debug("PM: Preparing devices for restore.\n");
 
+	suspend_console();
 	if ((error = device_suspend(PMSG_PRETHAW))) {
+		resume_console();
 		printk("Some devices failed to suspend\n");
 		swsusp_free();
 		goto Thaw;
@@ -224,6 +231,7 @@ static int software_resume(void)
 	swsusp_resume();
 	pr_debug("PM: Restore failed, recovering.n");
 	device_resume();
+	resume_console();
  Thaw:
 	unprepare_processes();
  Done:
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 93b5dd283dea..d991d3b0e5a4 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
 #include <linux/swapops.h>
 #include <linux/pm.h>
 #include <linux/fs.h>
+#include <linux/console.h>
 #include <linux/cpu.h>
 
 #include <asm/uaccess.h>
@@ -173,12 +174,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		/* Free memory before shutting down devices. */
 		error = swsusp_shrink_memory();
 		if (!error) {
+			suspend_console();
 			error = device_suspend(PMSG_FREEZE);
 			if (!error) {
 				in_suspend = 1;
 				error = swsusp_suspend();
 				device_resume();
 			}
+			resume_console();
 		}
 		up(&pm_sem);
 		if (!error)
@@ -196,11 +199,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		snapshot_free_unused_memory(&data->handle);
 		down(&pm_sem);
 		pm_prepare_console();
+		suspend_console();
 		error = device_suspend(PMSG_PRETHAW);
 		if (!error) {
 			error = swsusp_resume();
 			device_resume();
 		}
+		resume_console();
 		pm_restore_console();
 		up(&pm_sem);
 		break;
@@ -289,6 +294,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		}
 
 		/* Put devices to sleep */
+		suspend_console();
 		error = device_suspend(PMSG_SUSPEND);
 		if (error) {
 			printk(KERN_ERR "Failed to suspend some devices.\n");
@@ -299,7 +305,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			/* Wake up devices */
 			device_resume();
 		}
-
+		resume_console();
 		if (pm_ops->finish)
 			pm_ops->finish(PM_SUSPEND_MEM);
 
-- 
cgit v1.2.3


From 469340236a7c9673df3e6a2425f559517f01464e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 11 Oct 2006 01:21:26 -0700
Subject: [PATCH] mm: kevent threads: use MPOL_DEFAULT

Switch the memory policy of the kevent threads to MPOL_DEFAULT while
leaving the kzalloc of the workqueue structure on interleave.  This means
that all code executed in the context of the kevent thread is allocating
node local.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Alok Kataria <alok.kataria@calsoftinc.com>
Cc: Andi Kleen <ak@suse.de>
Cc: <pj@sgi.com>
Cc: <shai@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index cfc737bffe6d..3df9bfc7ff78 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -28,6 +28,7 @@
 #include <linux/notifier.h>
 #include <linux/kthread.h>
 #include <linux/hardirq.h>
+#include <linux/mempolicy.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -245,6 +246,12 @@ static int worker_thread(void *__cwq)
 	sigprocmask(SIG_BLOCK, &blocked, NULL);
 	flush_signals(current);
 
+	/*
+	 * We inherited MPOL_INTERLEAVE from the booting kernel.
+	 * Set MPOL_DEFAULT to insure node local allocations.
+	 */
+	numa_default_policy();
+
 	/* SIG_IGN makes children autoreap: see do_notify_parent(). */
 	sa.sa.sa_handler = SIG_IGN;
 	sa.sa.sa_flags = 0;
-- 
cgit v1.2.3


From fa3ba2e81ea23416272a22009bba95954c81969c Mon Sep 17 00:00:00 2001
From: Florin Malita <fmalita@gmail.com>
Date: Wed, 11 Oct 2006 01:21:48 -0700
Subject: [PATCH] fix Module taint flags listing in Oops/panic

Module taint flags listing in Oops/panic has a couple of issues:

* taint_flags() doesn't null-terminate the buffer after printing the flags

* per-module taints are only set if the kernel is not already tainted
  (with that particular flag) => only the first offending module gets its
  taint info correctly updated

Some additional changes:

* 'license_gplok' is no longer needed - equivalent to !(taints &
  TAINT_PROPRIETARY_MODULE) - so we can drop it from struct module *
  exporting module taint info via /proc/module:

pwc 88576 0 - Live 0xf8c32000
evilmod 6784 1 pwc, Live 0xf8bbf000 (PF)

Signed-off-by: Florin Malita <fmalita@gmail.com>
Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 94 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 49 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 7f60e782de1e..67009bd56c52 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -87,6 +87,12 @@ static inline int strong_try_module_get(struct module *mod)
 	return try_module_get(mod);
 }
 
+static inline void add_taint_module(struct module *mod, unsigned flag)
+{
+	add_taint(flag);
+	mod->taints |= flag;
+}
+
 /* A thread that wants to hold a reference to a module only while it
  * is running can call ths to safely exit.
  * nfsd and lockd use this.
@@ -847,12 +853,10 @@ static int check_version(Elf_Shdr *sechdrs,
 		return 0;
 	}
 	/* Not in module's version table.  OK, but that taints the kernel. */
-	if (!(tainted & TAINT_FORCED_MODULE)) {
+	if (!(tainted & TAINT_FORCED_MODULE))
 		printk("%s: no version for \"%s\" found: kernel tainted.\n",
 		       mod->name, symname);
-		add_taint(TAINT_FORCED_MODULE);
-		mod->taints |= TAINT_FORCED_MODULE;
-	}
+	add_taint_module(mod, TAINT_FORCED_MODULE);
 	return 1;
 }
 
@@ -910,7 +914,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
 	unsigned long ret;
 	const unsigned long *crc;
 
-	ret = __find_symbol(name, &owner, &crc, mod->license_gplok);
+	ret = __find_symbol(name, &owner, &crc,
+			!(mod->taints & TAINT_PROPRIETARY_MODULE));
 	if (ret) {
 		/* use_module can fail due to OOM, or module unloading */
 		if (!check_version(sechdrs, versindex, name, mod, crc) ||
@@ -1335,12 +1340,11 @@ static void set_license(struct module *mod, const char *license)
 	if (!license)
 		license = "unspecified";
 
-	mod->license_gplok = license_is_gpl_compatible(license);
-	if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
-		printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
-		       mod->name, license);
-		add_taint(TAINT_PROPRIETARY_MODULE);
-		mod->taints |= TAINT_PROPRIETARY_MODULE;
+	if (!license_is_gpl_compatible(license)) {
+		if (!(tainted & TAINT_PROPRIETARY_MODULE))
+			printk(KERN_WARNING "%s: module license '%s' taints"
+				"kernel.\n", mod->name, license);
+		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
 	}
 }
 
@@ -1619,8 +1623,7 @@ static struct module *load_module(void __user *umod,
 	modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
 	/* This is allowed: modprobe --force will invalidate it. */
 	if (!modmagic) {
-		add_taint(TAINT_FORCED_MODULE);
-		mod->taints |= TAINT_FORCED_MODULE;
+		add_taint_module(mod, TAINT_FORCED_MODULE);
 		printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
 		       mod->name);
 	} else if (!same_magic(modmagic, vermagic)) {
@@ -1714,14 +1717,10 @@ static struct module *load_module(void __user *umod,
 	/* Set up license info based on the info section */
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
 
-	if (strcmp(mod->name, "ndiswrapper") == 0) {
-		add_taint(TAINT_PROPRIETARY_MODULE);
-		mod->taints |= TAINT_PROPRIETARY_MODULE;
-	}
-	if (strcmp(mod->name, "driverloader") == 0) {
-		add_taint(TAINT_PROPRIETARY_MODULE);
-		mod->taints |= TAINT_PROPRIETARY_MODULE;
-	}
+	if (strcmp(mod->name, "ndiswrapper") == 0)
+		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+	if (strcmp(mod->name, "driverloader") == 0)
+		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
 
 	/* Set up MODINFO_ATTR fields */
 	setup_modinfo(mod, sechdrs, infoindex);
@@ -1766,8 +1765,7 @@ static struct module *load_module(void __user *umod,
 	    (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
 		printk(KERN_WARNING "%s: No versions for exported symbols."
 		       " Tainting kernel.\n", mod->name);
-		add_taint(TAINT_FORCED_MODULE);
-		mod->taints |= TAINT_FORCED_MODULE;
+		add_taint_module(mod, TAINT_FORCED_MODULE);
 	}
 #endif
 
@@ -2132,9 +2130,33 @@ static void m_stop(struct seq_file *m, void *p)
 	mutex_unlock(&module_mutex);
 }
 
+static char *taint_flags(unsigned int taints, char *buf)
+{
+	int bx = 0;
+
+	if (taints) {
+		buf[bx++] = '(';
+		if (taints & TAINT_PROPRIETARY_MODULE)
+			buf[bx++] = 'P';
+		if (taints & TAINT_FORCED_MODULE)
+			buf[bx++] = 'F';
+		/*
+		 * TAINT_FORCED_RMMOD: could be added.
+		 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+		 * apply to modules.
+		 */
+		buf[bx++] = ')';
+	}
+	buf[bx] = '\0';
+
+	return buf;
+}
+
 static int m_show(struct seq_file *m, void *p)
 {
 	struct module *mod = list_entry(p, struct module, list);
+	char buf[8];
+
 	seq_printf(m, "%s %lu",
 		   mod->name, mod->init_size + mod->core_size);
 	print_unload_info(m, mod);
@@ -2147,6 +2169,10 @@ static int m_show(struct seq_file *m, void *p)
 	/* Used by oprofile and other similar tools. */
 	seq_printf(m, " 0x%p", mod->module_core);
 
+	/* Taints info */
+	if (mod->taints)
+		seq_printf(m, " %s", taint_flags(mod->taints, buf));
+
 	seq_printf(m, "\n");
 	return 0;
 }
@@ -2235,28 +2261,6 @@ struct module *module_text_address(unsigned long addr)
 	return mod;
 }
 
-static char *taint_flags(unsigned int taints, char *buf)
-{
-	*buf = '\0';
-	if (taints) {
-		int bx;
-
-		buf[0] = '(';
-		bx = 1;
-		if (taints & TAINT_PROPRIETARY_MODULE)
-			buf[bx++] = 'P';
-		if (taints & TAINT_FORCED_MODULE)
-			buf[bx++] = 'F';
-		/*
-		 * TAINT_FORCED_RMMOD: could be added.
-		 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
-		 * apply to modules.
-		 */
-		buf[bx] = ')';
-	}
-	return buf;
-}
-
 /* Don't grab lock, we're oopsing. */
 void print_modules(void)
 {
-- 
cgit v1.2.3


From beed33a816204cb402c69266475b6a60a2433ceb Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 11 Oct 2006 01:21:52 -0700
Subject: [PATCH] sched: likely profiling

This likely profiling is pretty fun. I found a few possible problems
in sched.c.

This patch may be not measurable, but when I did measure long ago,
nooping (un)likely cost a couple of % on scheduler heavy benchmarks, so
it all adds up.

Tweak some branch hints:

- the 2nd 64 bits in the bitmask is likely to be populated, because it
  contains the first 28 bits (nearly 3/4) of the normal priorities.
  (ratio of 669669:691 ~= 1000:1).

- it isn't unlikely that context switching switches to another process. it
  might be very rapidly switching to and from the idle process (ratio of
  475815:419004 and 471330:423544). Let the branch predictor decide.

- preempt_enable seems to be very often called in a nested preempt_disable
  or with interrupts disabled (ratio of 3567760:87965 ~= 40:1)

Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Daniel Walker <dwalker@mvista.com>
Cc: Hua Zhong <hzhong@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 53608a59d6e3..094b5687eef6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1822,14 +1822,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm = next->mm;
 	struct mm_struct *oldmm = prev->active_mm;
 
-	if (unlikely(!mm)) {
+	if (!mm) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm(oldmm, mm, next);
 
-	if (unlikely(!prev->mm)) {
+	if (!prev->mm) {
 		prev->active_mm = NULL;
 		WARN_ON(rq->prev_mm);
 		rq->prev_mm = oldmm;
@@ -3491,7 +3491,7 @@ asmlinkage void __sched preempt_schedule(void)
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task.  Just return..
 	 */
-	if (unlikely(ti->preempt_count || irqs_disabled()))
+	if (likely(ti->preempt_count || irqs_disabled()))
 		return;
 
 need_resched:
-- 
cgit v1.2.3


From 01a3ee2b203e511e20f98b85a9172fd32c53e87c Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@linux.intel.com>
Date: Wed, 11 Oct 2006 01:21:55 -0700
Subject: [PATCH] bitmap: parse input from kernel and user buffers

lib/bitmap.c:bitmap_parse() is a library function that received as input a
user buffer.  This seemed to have originated from the way the write_proc
function of the /proc filesystem operates.

This has been reworked to not use kmalloc and eliminates a lot of
get_user() overhead by performing one access_ok before using __get_user().

We need to test if we are in kernel or user space (is_user) and access the
buffer differently.  We cannot use __get_user() to access kernel addresses
in all cases, for example in architectures with separate address space for
kernel and user.

This function will be useful for other uses as well; for example, taking
input for /sysfs instead of /proc, so it was changed to accept kernel
buffers.  We have this use for the Linux UWB project, as part as the
upcoming bandwidth allocator code.

Only a few routines used this function and they were changed too.

Signed-off-by: Reinette Chatre <reinette.chatre@linux.intel.com>
Signed-off-by: Inaky Perez-Gonzalez <inaky@linux.intel.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Joe Korty <joe.korty@ccur.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/proc.c | 2 +-
 kernel/profile.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 607c7809ad01..9a352667007c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -57,7 +57,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
 		return -EIO;
 
-	err = cpumask_parse(buffer, count, new_value);
+	err = cpumask_parse_user(buffer, count, new_value);
 	if (err)
 		return err;
 
diff --git a/kernel/profile.c b/kernel/profile.c
index 857300a2afec..f940b462eec9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -399,7 +399,7 @@ static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffe
 	unsigned long full_count = count, err;
 	cpumask_t new_value;
 
-	err = cpumask_parse(buffer, count, new_value);
+	err = cpumask_parse_user(buffer, count, new_value);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 3dc3099a9b2c346b16383597fadaa79a05a52388 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 11 Oct 2006 01:22:06 -0700
Subject: [PATCH] lockdep: use BUILD_BUG_ON

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 4c0553461000..805a322a5655 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1114,8 +1114,6 @@ static int count_matching_names(struct lock_class *new_class)
 	return count + 1;
 }
 
-extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void);
-
 /*
  * Register a lock's class in the hash-table, if the class is not present
  * yet. Otherwise we look it up. We cache the result in the lock object
@@ -1153,8 +1151,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	 * (or spin_lock_init()) call - which acts as the key. For static
 	 * locks we use the lock object itself as the key.
 	 */
-	if (sizeof(struct lock_class_key) > sizeof(struct lock_class))
-		__error_too_big_MAX_LOCKDEP_SUBCLASSES();
+	BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class));
 
 	key = lock->key->subkeys + subclass;
 
-- 
cgit v1.2.3


From 256a6b41365e17cebe5c2fc91ddff716c9aa055a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 11 Oct 2006 01:22:08 -0700
Subject: [PATCH] lockdep: fix printk recursion logic

Bug reported and fixed by Tilman Schmidt <tilman@imap.cc>: if lockdep is
enabled then log messages make it to /var/log/messages belatedly.  The
reason is a missed wakeup of klogd.

Initially there was only a lockdep_internal() protection against lockdep
recursion within vprintk() - it grew the 'outer' lockdep_off()/on()
protection only later on.  But that lockdep_off() made the
release_console_sem() within vprintk() always happen under the
lockdep_internal() condition, causing the bug.

The right solution to remove the inner protection against recursion here -
the outer one is enough.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Tilman Schmidt <tilman@imap.cc>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 771f5e861bcd..f7d427ef5038 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -820,15 +820,8 @@ void release_console_sem(void)
 	console_locked = 0;
 	up(&console_sem);
 	spin_unlock_irqrestore(&logbuf_lock, flags);
-	if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) {
-		/*
-		 * If we printk from within the lock dependency code,
-		 * from within the scheduler code, then do not lock
-		 * up due to self-recursion:
-		 */
-		if (!lockdep_internal())
-			wake_up_interruptible(&log_wait);
-	}
+	if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
+		wake_up_interruptible(&log_wait);
 }
 EXPORT_SYMBOL(release_console_sem);
 
-- 
cgit v1.2.3


From 39af114377bf80d2a88e47be33d578d1fa9b0775 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 16 Oct 2006 09:01:46 -0700
Subject: [PATCH] fix epoll_pwait when EPOLL=n

Fixes http://bugzilla.kernel.org/show_bug.cgi?id=7371

sys_epoll_pwait needs to be listed as a conditional (weak)
entry point for CONFIG_EPOLL=n.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys_ni.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7a3b2e75f040..0e53314b14de 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -49,6 +49,7 @@ cond_syscall(compat_sys_get_robust_list);
 cond_syscall(sys_epoll_create);
 cond_syscall(sys_epoll_ctl);
 cond_syscall(sys_epoll_wait);
+cond_syscall(sys_epoll_pwait);
 cond_syscall(sys_semget);
 cond_syscall(sys_semop);
 cond_syscall(sys_semtimedop);
-- 
cgit v1.2.3


From ca268c691de95612981b93e58899c1d73fdb6b47 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 17 Oct 2006 00:09:28 -0700
Subject: [PATCH] lockdep: increase max allowed recursion depth

In general, lockdep warnings are intended to be non-fatal, so I have put in
various practical limits on internal data structure failure modes.  We haven't
had a /single/ lockdep-internal crash ever since lockdep went upstream [the
unwinder crashes are outside of lockdep], and that's largely due to the good
internal checks it does.

Recursion within the dependency graph is currently limited to 20, that's
probably not enough on some many-CPU boxes - this patch doubles it to 40.  I
have written the lockdep functions to have as small stackframes as possible,
so 40 should be OK too.  (The practical recursion limit should be somewhere
between 100 and 200 entries.  If we hit that then I'll change the algorithm to
be iteration-based.  Graph walking logic is so easy to program via recursion,
so i'd like to keep recursion as long as possible.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 805a322a5655..d1a3b2cfe4a9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -575,6 +575,8 @@ static noinline int print_circular_bug_tail(void)
 	return 0;
 }
 
+#define RECURSION_LIMIT 40
+
 static int noinline print_infinite_recursion_bug(void)
 {
 	__raw_spin_unlock(&hash_lock);
@@ -595,7 +597,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
 	debug_atomic_inc(&nr_cyclic_check_recursions);
 	if (depth > max_recursion_depth)
 		max_recursion_depth = depth;
-	if (depth >= 20)
+	if (depth >= RECURSION_LIMIT)
 		return print_infinite_recursion_bug();
 	/*
 	 * Check this lock's dependency list:
@@ -645,7 +647,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
 
 	if (depth > max_recursion_depth)
 		max_recursion_depth = depth;
-	if (depth >= 20)
+	if (depth >= RECURSION_LIMIT)
 		return print_infinite_recursion_bug();
 
 	debug_atomic_inc(&nr_find_usage_forwards_checks);
@@ -684,7 +686,7 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
 
 	if (depth > max_recursion_depth)
 		max_recursion_depth = depth;
-	if (depth >= 20)
+	if (depth >= RECURSION_LIMIT)
 		return print_infinite_recursion_bug();
 
 	debug_atomic_inc(&nr_find_usage_backwards_checks);
-- 
cgit v1.2.3


From 3f4a0b917ce72ef47e438d354c433eb645218e87 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Tue, 17 Oct 2006 00:09:32 -0700
Subject: [PATCH] i386 Time: Avoid PIT SMP lockups

Avoid possible PIT livelock issues seen on SMP systems (and reported by
Andi), by not allowing it as a clocksource on SMP boxes.

However, since the PIT may no longer be present, we have to properly handle
the cases where SMP systems have TSC skew and fall back from the TSC.
Since the PIT isn't there, it would "fall back" to the TSC again.  So this
changes the jiffies rating to 1, and the TSC-bad rating value to 0.

Thus you will get the following behavior priority on i386 systems:

tsc		[if present & stable]
hpet		[if present]
cyclone		[if present]
acpi_pm		[if present]
pit		[if UP]
jiffies

Rather then the current more complicated:
tsc		[if present & stable]
hpet		[if present]
cyclone		[if present]
acpi_pm		[if present]
pit		[if cpus < 4]
tsc		[if present & unstable]
jiffies

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/jiffies.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 126bb30c4afe..a99b2a6e6a07 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -57,7 +57,7 @@ static cycle_t jiffies_read(void)
 
 struct clocksource clocksource_jiffies = {
 	.name		= "jiffies",
-	.rating		= 0, /* lowest rating*/
+	.rating		= 1, /* lowest valid rating*/
 	.read		= jiffies_read,
 	.mask		= 0xffffffff, /*32bits*/
 	.mult		= NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
-- 
cgit v1.2.3


From ac08c26492a0ad4d94a25bd47d5630cd38337069 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 17 Oct 2006 00:09:39 -0700
Subject: [PATCH] posix-cpu-timers: prevent signal delivery starvation

The integer divisions in the timer accounting code can round the result
down to 0.  Adding 0 is without effect and the signal delivery stops.

Clamp the division result to minimum 1 to avoid this.

Problem was reported by Seongbae Park <spark@google.com>, who provided
also an inital patch.

Roland sayeth:

  I have had some more time to think about the problem, and to reproduce it
  using Toyo's test case.  For the record, if my understanding of the problem
  is correct, this happens only in one very particular case.  First, the
  expiry time has to be so soon that in cputime_t units (usually 1s/HZ ticks)
  it's < nthreads so the division yields zero.  Second, it only affects each
  thread that is so new that its CPU time accumulation is zero so now+0 is
  still zero and ->it_*_expires winds up staying zero.  For the VIRT and PROF
  clocks when cputime_t is tick granularity (or the SCHED clock on
  configurations where sched_clock's value only advances on clock ticks), this
  is not hard to arrange with new threads starting up and blocking before they
  accumulate a whole tick of CPU time.  That's what happens in Toyo's test
  case.

  Note that in general it is fine for that division to round down to zero,
  and set each thread's expiry time to its "now" time.  The problem only
  arises with thread's whose "now" value is still zero, so that now+0 winds up
  0 and is interpreted as "not set" instead of ">= now".  So it would be a
  sufficient and more precise fix to just use max(ticks, 1) inside the loop
  when setting each it_*_expires value.

  But, it does no harm to round the division up to one and always advance
  every thread's expiry time.  If the thread didn't already fire timers for
  the expiry time of "now", there is no expectation that it will do so before
  the next tick anyway.  So I followed Thomas's patch in lifting the max out
  of the loops.

  This patch also covers the reload cases, which are harder to write a test
  for (and I didn't try).  I've tested it with Toyo's case and it fixes that.

[toyoa@mvista.com: fix: min_t -> max_t]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Daniel Walker <dwalker@mvista.com>
Cc: Toyo Abe <toyoa@mvista.com>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Seongbae Park <spark@google.com>
Cc: Peter Mattis <pmattis@google.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Martin Bligh <mbligh@google.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 479b16b44f79..7c3e1e6dfb5b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -87,6 +87,19 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
 	return a;
 }
 
+/*
+ * Divide and limit the result to res >= 1
+ *
+ * This is necessary to prevent signal delivery starvation, when the result of
+ * the division would be rounded down to 0.
+ */
+static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
+{
+	cputime_t res = cputime_div(time, div);
+
+	return max_t(cputime_t, res, 1);
+}
+
 /*
  * Update expiry time from increment, and increase overrun count,
  * given the current clock sample.
@@ -483,8 +496,8 @@ static void process_timer_rebalance(struct task_struct *p,
 		BUG();
 		break;
 	case CPUCLOCK_PROF:
-		left = cputime_div(cputime_sub(expires.cpu, val.cpu),
-				   nthreads);
+		left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
+				       nthreads);
 		do {
 			if (likely(!(t->flags & PF_EXITING))) {
 				ticks = cputime_add(prof_ticks(t), left);
@@ -498,8 +511,8 @@ static void process_timer_rebalance(struct task_struct *p,
 		} while (t != p);
 		break;
 	case CPUCLOCK_VIRT:
-		left = cputime_div(cputime_sub(expires.cpu, val.cpu),
-				   nthreads);
+		left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
+				       nthreads);
 		do {
 			if (likely(!(t->flags & PF_EXITING))) {
 				ticks = cputime_add(virt_ticks(t), left);
@@ -515,6 +528,7 @@ static void process_timer_rebalance(struct task_struct *p,
 	case CPUCLOCK_SCHED:
 		nsleft = expires.sched - val.sched;
 		do_div(nsleft, nthreads);
+		nsleft = max_t(unsigned long long, nsleft, 1);
 		do {
 			if (likely(!(t->flags & PF_EXITING))) {
 				ns = t->sched_time + nsleft;
@@ -1159,12 +1173,13 @@ static void check_process_timers(struct task_struct *tsk,
 
 		prof_left = cputime_sub(prof_expires, utime);
 		prof_left = cputime_sub(prof_left, stime);
-		prof_left = cputime_div(prof_left, nthreads);
+		prof_left = cputime_div_non_zero(prof_left, nthreads);
 		virt_left = cputime_sub(virt_expires, utime);
-		virt_left = cputime_div(virt_left, nthreads);
+		virt_left = cputime_div_non_zero(virt_left, nthreads);
 		if (sched_expires) {
 			sched_left = sched_expires - sched_time;
 			do_div(sched_left, nthreads);
+			sched_left = max_t(unsigned long long, sched_left, 1);
 		} else {
 			sched_left = 0;
 		}
-- 
cgit v1.2.3


From c60099bfe3a5e6fa22a930627689b3769c52153f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 17 Oct 2006 00:09:59 -0700
Subject: [PATCH] swsusp: fix memory leaks

My fancy new swsusp IO code had a big memory leak.  It's somewhat invisible
because the whole mem_map[] gets overwritten after resume, but it can cause us
to get low on memory during the actual suspend process.

Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 9b2ee5344dee..1a3b0dd2c3fc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -425,7 +425,8 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
 			bio_set_pages_dirty(bio);
 		bio_put(bio);
 	} else {
-		get_page(page);
+		if (rw == READ)
+			get_page(page);	/* These pages are freed later */
 		bio->bi_private = *bio_chain;
 		*bio_chain = bio;
 		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-- 
cgit v1.2.3


From a460e745e8f9c75a0525ff94154a0629f9d3e05d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 17 Oct 2006 00:10:03 -0700
Subject: [PATCH] genirq: clean up irq-flow-type naming

Introduce desc->name and eliminate the handle_irq_name() hack.  Add
set_irq_chip_and_handler_name() to set the flow type and name at once.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Matthew Wilcox <willy@debian.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/chip.c | 33 +++++++++------------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 11c99697acfe..2d0dc3efe813 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -499,7 +499,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 #endif /* CONFIG_SMP */
 
 void
-__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained)
+__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+		  const char *name)
 {
 	struct irq_desc *desc;
 	unsigned long flags;
@@ -540,6 +541,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained)
 		desc->depth = 1;
 	}
 	desc->handle_irq = handle;
+	desc->name = name;
 
 	if (handle != handle_bad_irq && is_chained) {
 		desc->status &= ~IRQ_DISABLED;
@@ -555,30 +557,13 @@ set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
 			 irq_flow_handler_t handle)
 {
 	set_irq_chip(irq, chip);
-	__set_irq_handler(irq, handle, 0);
+	__set_irq_handler(irq, handle, 0, NULL);
 }
 
-/*
- * Get a descriptive string for the highlevel handler, for
- * /proc/interrupts output:
- */
-const char *
-handle_irq_name(irq_flow_handler_t handle)
+void
+set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+			      irq_flow_handler_t handle, const char *name)
 {
-	if (handle == handle_level_irq)
-		return "level  ";
-	if (handle == handle_fasteoi_irq)
-		return "fasteoi";
-	if (handle == handle_edge_irq)
-		return "edge   ";
-	if (handle == handle_simple_irq)
-		return "simple ";
-#ifdef CONFIG_SMP
-	if (handle == handle_percpu_irq)
-		return "percpu ";
-#endif
-	if (handle == handle_bad_irq)
-		return "bad    ";
-
-	return NULL;
+	set_irq_chip(irq, chip);
+	__set_irq_handler(irq, handle, 0, name);
 }
-- 
cgit v1.2.3


From bea493a031fe3337f4fe5479e8e865513828ea76 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 17 Oct 2006 00:10:33 -0700
Subject: [PATCH] rt-mutex: fixup rt-mutex debug code

BUG: warning at kernel/rtmutex-debug.c:125/rt_mutex_debug_task_free() (Not tainted)
 [<c04051e3>] show_trace_log_lvl+0x58/0x16a
 [<c04057f0>] show_trace+0xd/0x10
 [<c0405900>] dump_stack+0x19/0x1b
 [<c043f03d>] rt_mutex_debug_task_free+0x35/0x6a
 [<c04224c0>] free_task+0x15/0x24
 [<c042378c>] copy_process+0x12bd/0x1324
 [<c0423835>] do_fork+0x42/0x113
 [<c04021dd>] sys_fork+0x19/0x1b
 [<c0403fb7>] syscall_call+0x7/0xb

In copy_process(), dup_task_struct() also duplicates the ->pi_lock,
->pi_waiters and ->pi_blocked_on members.  rt_mutex_debug_task_free()
called from free_task() validates these members.  However free_task() can
be invoked before these members are reset for the new task.

Move the initialization code before the first bail that can hit free_task().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7dc6140baac6..29ebb30850ed 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -984,6 +984,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (!p)
 		goto fork_out;
 
+	rt_mutex_init_task(p);
+
 #ifdef CONFIG_TRACE_IRQFLAGS
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
@@ -1088,8 +1090,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->lockdep_recursion = 0;
 #endif
 
-	rt_mutex_init_task(p);
-
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
-- 
cgit v1.2.3


From bd5349cfd2b9bbb10a3dbcd3fe5cbaabe0b2ab9e Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Tue, 17 Oct 2006 00:10:35 -0700
Subject: [PATCH] Convert cpu hotplug notifiers to use raw_notifier instead of
 blocking_notifier

The use of blocking notifier by _cpu_up and _cpu_down in cpu.c has two
problem.

1/ An interaction with the workqueue notifier causes lockdep to spit a
   warning.

2/ A notifier could conceivable be added or removed while _cpu_up or
   _cpu_down are in process.  As each notifier is called twice (prepare
   then commit/abort) this could be unhealthy.

To fix to we simply take cpu_add_remove_lock while adding or removing
notifiers to/from the list.

This makes the 'blocking' usage unnecessary as all accesses to cpu_chain
are now protected by cpu_add_remove_lock.  So change "blocking" to "raw" in
all relevant places.  This fixes 1.

Credit: Andrew Morton
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Michal Piotrowski <michal.k.k.piotrowski@gmail.com> (reporter)
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpu.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 32c96628463e..27dd3ee47099 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -19,7 +19,7 @@
 static DEFINE_MUTEX(cpu_add_remove_lock);
 static DEFINE_MUTEX(cpu_bitmask_lock);
 
-static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
+static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
 
 /* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
  * Should always be manipulated under cpu_add_remove_lock
@@ -68,7 +68,11 @@ EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
 /* Need to know about CPUs going up/down? */
 int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 {
-	return blocking_notifier_chain_register(&cpu_chain, nb);
+	int ret;
+	mutex_lock(&cpu_add_remove_lock);
+	ret = raw_notifier_chain_register(&cpu_chain, nb);
+	mutex_unlock(&cpu_add_remove_lock);
+	return ret;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -77,7 +81,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
 
 void unregister_cpu_notifier(struct notifier_block *nb)
 {
-	blocking_notifier_chain_unregister(&cpu_chain, nb);
+	mutex_lock(&cpu_add_remove_lock);
+	raw_notifier_chain_unregister(&cpu_chain, nb);
+	mutex_unlock(&cpu_add_remove_lock);
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
@@ -126,7 +132,7 @@ static int _cpu_down(unsigned int cpu)
 	if (!cpu_online(cpu))
 		return -EINVAL;
 
-	err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+	err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -146,7 +152,7 @@ static int _cpu_down(unsigned int cpu)
 
 	if (IS_ERR(p)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
-		if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
 				(void *)(long)cpu) == NOTIFY_BAD)
 			BUG();
 
@@ -169,7 +175,7 @@ static int _cpu_down(unsigned int cpu)
 	put_cpu();
 
 	/* CPU is completely dead: tell everyone.  Too late to complain. */
-	if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
+	if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD,
 			(void *)(long)cpu) == NOTIFY_BAD)
 		BUG();
 
@@ -206,7 +212,7 @@ static int __devinit _cpu_up(unsigned int cpu)
 	if (cpu_online(cpu) || !cpu_present(cpu))
 		return -EINVAL;
 
-	ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+	ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
 	if (ret == NOTIFY_BAD) {
 		printk("%s: attempt to bring up CPU %u failed\n",
 				__FUNCTION__, cpu);
@@ -223,11 +229,11 @@ static int __devinit _cpu_up(unsigned int cpu)
 	BUG_ON(!cpu_online(cpu));
 
 	/* Now call notifier in preparation. */
-	blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
 
 out_notify:
 	if (ret != 0)
-		blocking_notifier_call_chain(&cpu_chain,
+		raw_notifier_call_chain(&cpu_chain,
 				CPU_UP_CANCELED, hcpu);
 
 	return ret;
-- 
cgit v1.2.3


From 91fcdd4e0314145d7d4fa52dba2f9c2da25346fd Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bbpetkov@yahoo.de>
Date: Thu, 19 Oct 2006 23:28:29 -0700
Subject: [PATCH] readjust comments of task_timeslice for kernel doc

Signed-off-by: Borislav Petkov <petkov@math.uni-muenster.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 094b5687eef6..3399701c680e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -160,15 +160,6 @@
 #define TASK_PREEMPTS_CURR(p, rq) \
 	((p)->prio < (rq)->curr->prio)
 
-/*
- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
- * to time slice values: [800ms ... 100ms ... 5ms]
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- */
-
 #define SCALE_PRIO(x, prio) \
 	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 
@@ -180,6 +171,15 @@ static unsigned int static_prio_timeslice(int static_prio)
 		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 }
 
+/*
+ * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
+ * to time slice values: [800ms ... 100ms ... 5ms]
+ *
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
+ */
+
 static inline unsigned int task_timeslice(struct task_struct *p)
 {
 	return static_prio_timeslice(p->static_prio);
-- 
cgit v1.2.3


From d6f8ff7381501887233666b508b9eac70143303d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 19 Oct 2006 23:28:34 -0700
Subject: [PATCH] cad_pid sysctl with PROC_FS=n

If CONFIG_PROC_FS=n:

kernel/sysctl.c:148: warning: 'proc_do_cad_pid' used but never defined
kernel/built-in.o:(.data+0x1228): undefined reference to `proc_do_cad_pid'
make: *** [.tmp_vmlinux1] Error 1

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8020fb273c4f..8bff2c18fb5a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,8 +136,10 @@ static int parse_table(int __user *, int, void __user *, size_t __user *,
 static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 
+#ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
 
 static ctl_table root_table[];
 static struct ctl_table_header root_table_header =
@@ -542,6 +544,7 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_PROC_SYSCTL
 	{
 		.ctl_name	= KERN_CADPID,
 		.procname	= "cad_pid",
@@ -550,6 +553,7 @@ static ctl_table kern_table[] = {
 		.mode		= 0600,
 		.proc_handler	= &proc_do_cad_pid,
 	},
+#endif
 	{
 		.ctl_name	= KERN_MAX_THREADS,
 		.procname	= "threads-max",
-- 
cgit v1.2.3


From e05d722e4555cd54677b4c8431d9e81fd047ef7a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 19 Oct 2006 23:29:12 -0700
Subject: [PATCH] kernel/nsproxy.c: use kmemdup()

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/nsproxy.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 6ebdb82a0ce4..674aceb7335a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -44,11 +44,9 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
 {
 	struct nsproxy *ns;
 
-	ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL);
-	if (ns) {
-		memcpy(ns, orig, sizeof(struct nsproxy));
+	ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
+	if (ns)
 		atomic_set(&ns->count, 1);
-	}
 	return ns;
 }
 
-- 
cgit v1.2.3


From 690a973f48b6ba2954465992c08e65059c8374fe Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Sat, 21 Oct 2006 18:37:01 +0200
Subject: [PATCH] x86-64: Speed up dwarf2 unwinder

This changes the dwarf2 unwinder to do a binary search for CIEs
instead of a linear work. The linker is unfortunately not
able to build a proper lookup table at link time, instead it creates
one at runtime as soon as the bootmem allocator is usable (so you'll continue
using the linear lookup for the first [hopefully] few calls).
The code should be ready to utilize a build-time created table once
a fixed linker becomes available.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/unwind.c | 318 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 279 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/unwind.c b/kernel/unwind.c
index 2e2368607aab..f7e50d16dbf6 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -11,13 +11,15 @@
 
 #include <linux/unwind.h>
 #include <linux/module.h>
-#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/sort.h>
 #include <linux/stop_machine.h>
 #include <asm/sections.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 
 extern char __start_unwind[], __end_unwind[];
+extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
 
 #define MAX_STACK_DEPTH 8
 
@@ -100,6 +102,8 @@ static struct unwind_table {
 	} core, init;
 	const void *address;
 	unsigned long size;
+	const unsigned char *header;
+	unsigned long hdrsz;
 	struct unwind_table *link;
 	const char *name;
 } root_table;
@@ -145,6 +149,10 @@ static struct unwind_table *find_table(unsigned long pc)
 	return table;
 }
 
+static unsigned long read_pointer(const u8 **pLoc,
+                                  const void *end,
+                                  signed ptrType);
+
 static void init_unwind_table(struct unwind_table *table,
                               const char *name,
                               const void *core_start,
@@ -152,14 +160,30 @@ static void init_unwind_table(struct unwind_table *table,
                               const void *init_start,
                               unsigned long init_size,
                               const void *table_start,
-                              unsigned long table_size)
+                              unsigned long table_size,
+                              const u8 *header_start,
+                              unsigned long header_size)
 {
+	const u8 *ptr = header_start + 4;
+	const u8 *end = header_start + header_size;
+
 	table->core.pc = (unsigned long)core_start;
 	table->core.range = core_size;
 	table->init.pc = (unsigned long)init_start;
 	table->init.range = init_size;
 	table->address = table_start;
 	table->size = table_size;
+	/* See if the linker provided table looks valid. */
+	if (header_size <= 4
+	    || header_start[0] != 1
+	    || (void *)read_pointer(&ptr, end, header_start[1]) != table_start
+	    || header_start[2] == DW_EH_PE_omit
+	    || read_pointer(&ptr, end, header_start[2]) <= 0
+	    || header_start[3] == DW_EH_PE_omit)
+		header_start = NULL;
+	table->hdrsz = header_size;
+	smp_wmb();
+	table->header = header_start;
 	table->link = NULL;
 	table->name = name;
 }
@@ -169,7 +193,143 @@ void __init unwind_init(void)
 	init_unwind_table(&root_table, "kernel",
 	                  _text, _end - _text,
 	                  NULL, 0,
-	                  __start_unwind, __end_unwind - __start_unwind);
+	                  __start_unwind, __end_unwind - __start_unwind,
+	                  __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr);
+}
+
+static const u32 bad_cie, not_fde;
+static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *);
+static signed fde_pointer_type(const u32 *cie);
+
+struct eh_frame_hdr_table_entry {
+	unsigned long start, fde;
+};
+
+static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2)
+{
+	const struct eh_frame_hdr_table_entry *e1 = p1;
+	const struct eh_frame_hdr_table_entry *e2 = p2;
+
+	return (e1->start > e2->start) - (e1->start < e2->start);
+}
+
+static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size)
+{
+	struct eh_frame_hdr_table_entry *e1 = p1;
+	struct eh_frame_hdr_table_entry *e2 = p2;
+	unsigned long v;
+
+	v = e1->start;
+	e1->start = e2->start;
+	e2->start = v;
+	v = e1->fde;
+	e1->fde = e2->fde;
+	e2->fde = v;
+}
+
+static void __init setup_unwind_table(struct unwind_table *table,
+					void *(*alloc)(unsigned long))
+{
+	const u8 *ptr;
+	unsigned long tableSize = table->size, hdrSize;
+	unsigned n;
+	const u32 *fde;
+	struct {
+		u8 version;
+		u8 eh_frame_ptr_enc;
+		u8 fde_count_enc;
+		u8 table_enc;
+		unsigned long eh_frame_ptr;
+		unsigned int fde_count;
+		struct eh_frame_hdr_table_entry table[];
+	} __attribute__((__packed__)) *header;
+
+	if (table->header)
+		return;
+
+	if (table->hdrsz)
+		printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n",
+		       table->name);
+
+	if (tableSize & (sizeof(*fde) - 1))
+		return;
+
+	for (fde = table->address, n = 0;
+	     tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
+	     tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) {
+		const u32 *cie = cie_for_fde(fde, table);
+		signed ptrType;
+
+		if (cie == &not_fde)
+			continue;
+		if (cie == NULL
+		    || cie == &bad_cie
+		    || (ptrType = fde_pointer_type(cie)) < 0)
+			return;
+		ptr = (const u8 *)(fde + 2);
+		if (!read_pointer(&ptr,
+		                  (const u8 *)(fde + 1) + *fde,
+		                  ptrType))
+			return;
+		++n;
+	}
+
+	if (tableSize || !n)
+		return;
+
+	hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
+	        + 2 * n * sizeof(unsigned long);
+	header = alloc(hdrSize);
+	if (!header)
+		return;
+	header->version          = 1;
+	header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native;
+	header->fde_count_enc    = DW_EH_PE_abs|DW_EH_PE_data4;
+	header->table_enc        = DW_EH_PE_abs|DW_EH_PE_native;
+	put_unaligned((unsigned long)table->address, &header->eh_frame_ptr);
+	BUILD_BUG_ON(offsetof(typeof(*header), fde_count)
+	             % __alignof(typeof(header->fde_count)));
+	header->fde_count        = n;
+
+	BUILD_BUG_ON(offsetof(typeof(*header), table)
+	             % __alignof(typeof(*header->table)));
+	for (fde = table->address, tableSize = table->size, n = 0;
+	     tableSize;
+	     tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) {
+		const u32 *cie = fde + 1 - fde[1] / sizeof(*fde);
+
+		if (!fde[1])
+			continue; /* this is a CIE */
+		ptr = (const u8 *)(fde + 2);
+		header->table[n].start = read_pointer(&ptr,
+		                                      (const u8 *)(fde + 1) + *fde,
+		                                      fde_pointer_type(cie));
+		header->table[n].fde = (unsigned long)fde;
+		++n;
+	}
+	WARN_ON(n != header->fde_count);
+
+	sort(header->table,
+	     n,
+	     sizeof(*header->table),
+	     cmp_eh_frame_hdr_table_entries,
+	     swap_eh_frame_hdr_table_entries);
+
+	table->hdrsz = hdrSize;
+	smp_wmb();
+	table->header = (const void *)header;
+}
+
+static void *__init balloc(unsigned long sz)
+{
+	return __alloc_bootmem_nopanic(sz,
+	                               sizeof(unsigned int),
+	                               __pa(MAX_DMA_ADDRESS));
+}
+
+void __init unwind_setup(void)
+{
+	setup_unwind_table(&root_table, balloc);
 }
 
 #ifdef CONFIG_MODULES
@@ -193,7 +353,8 @@ void *unwind_add_table(struct module *module,
 	init_unwind_table(table, module->name,
 	                  module->module_core, module->core_size,
 	                  module->module_init, module->init_size,
-	                  table_start, table_size);
+	                  table_start, table_size,
+	                  NULL, 0);
 
 	if (last_table)
 		last_table->link = table;
@@ -303,6 +464,26 @@ static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
 	return value;
 }
 
+static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
+{
+	const u32 *cie;
+
+	if (!*fde || (*fde & (sizeof(*fde) - 1)))
+		return &bad_cie;
+	if (!fde[1])
+		return &not_fde; /* this is a CIE */
+	if ((fde[1] & (sizeof(*fde) - 1))
+	    || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address)
+		return NULL; /* this is not a valid FDE */
+	cie = fde + 1 - fde[1] / sizeof(*fde);
+	if (*cie <= sizeof(*cie) + 4
+	    || *cie >= fde[1] - sizeof(*fde)
+	    || (*cie & (sizeof(*cie) - 1))
+	    || cie[1])
+		return NULL; /* this is not a (valid) CIE */
+	return cie;
+}
+
 static unsigned long read_pointer(const u8 **pLoc,
                                   const void *end,
                                   signed ptrType)
@@ -610,49 +791,108 @@ int unwind(struct unwind_frame_info *frame)
 	unsigned i;
 	signed ptrType = -1;
 	uleb128_t retAddrReg = 0;
-	struct unwind_table *table;
+	const struct unwind_table *table;
 	struct unwind_state state;
 
 	if (UNW_PC(frame) == 0)
 		return -EINVAL;
 	if ((table = find_table(pc)) != NULL
 	    && !(table->size & (sizeof(*fde) - 1))) {
-		unsigned long tableSize = table->size;
-
-		for (fde = table->address;
-		     tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
-		     tableSize -= sizeof(*fde) + *fde,
-		     fde += 1 + *fde / sizeof(*fde)) {
-			if (!*fde || (*fde & (sizeof(*fde) - 1)))
-				break;
-			if (!fde[1])
-				continue; /* this is a CIE */
-			if ((fde[1] & (sizeof(*fde) - 1))
-			    || fde[1] > (unsigned long)(fde + 1)
-			                - (unsigned long)table->address)
-				continue; /* this is not a valid FDE */
-			cie = fde + 1 - fde[1] / sizeof(*fde);
-			if (*cie <= sizeof(*cie) + 4
-			    || *cie >= fde[1] - sizeof(*fde)
-			    || (*cie & (sizeof(*cie) - 1))
-			    || cie[1]
-			    || (ptrType = fde_pointer_type(cie)) < 0) {
-				cie = NULL; /* this is not a (valid) CIE */
-				continue;
+		const u8 *hdr = table->header;
+		unsigned long tableSize;
+
+		smp_rmb();
+		if (hdr && hdr[0] == 1) {
+			switch(hdr[3] & DW_EH_PE_FORM) {
+			case DW_EH_PE_native: tableSize = sizeof(unsigned long); break;
+			case DW_EH_PE_data2: tableSize = 2; break;
+			case DW_EH_PE_data4: tableSize = 4; break;
+			case DW_EH_PE_data8: tableSize = 8; break;
+			default: tableSize = 0; break;
+			}
+			ptr = hdr + 4;
+			end = hdr + table->hdrsz;
+			if (tableSize
+			    && read_pointer(&ptr, end, hdr[1])
+			       == (unsigned long)table->address
+			    && (i = read_pointer(&ptr, end, hdr[2])) > 0
+			    && i == (end - ptr) / (2 * tableSize)
+			    && !((end - ptr) % (2 * tableSize))) {
+				do {
+					const u8 *cur = ptr + (i / 2) * (2 * tableSize);
+
+					startLoc = read_pointer(&cur,
+					                        cur + tableSize,
+					                        hdr[3]);
+					if (pc < startLoc)
+						i /= 2;
+					else {
+						ptr = cur - tableSize;
+						i = (i + 1) / 2;
+					}
+				} while (startLoc && i > 1);
+				if (i == 1
+				    && (startLoc = read_pointer(&ptr,
+				                                ptr + tableSize,
+				                                hdr[3])) != 0
+				    && pc >= startLoc)
+					fde = (void *)read_pointer(&ptr,
+					                           ptr + tableSize,
+					                           hdr[3]);
 			}
+		}
+
+		if (fde != NULL) {
+			cie = cie_for_fde(fde, table);
 			ptr = (const u8 *)(fde + 2);
-			startLoc = read_pointer(&ptr,
-			                        (const u8 *)(fde + 1) + *fde,
-			                        ptrType);
-			endLoc = startLoc
-			         + read_pointer(&ptr,
-			                        (const u8 *)(fde + 1) + *fde,
-			                        ptrType & DW_EH_PE_indirect
-			                        ? ptrType
-			                        : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
-			if (pc >= startLoc && pc < endLoc)
-				break;
-			cie = NULL;
+			if(cie != NULL
+			   && cie != &bad_cie
+			   && cie != &not_fde
+			   && (ptrType = fde_pointer_type(cie)) >= 0
+			   && read_pointer(&ptr,
+			                   (const u8 *)(fde + 1) + *fde,
+			                   ptrType) == startLoc) {
+				if (!(ptrType & DW_EH_PE_indirect))
+					ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
+				endLoc = startLoc
+				         + read_pointer(&ptr,
+				                        (const u8 *)(fde + 1) + *fde,
+				                        ptrType);
+				if(pc >= endLoc)
+					fde = NULL;
+			} else
+				fde = NULL;
+		}
+		if (fde == NULL) {
+			for (fde = table->address, tableSize = table->size;
+			     cie = NULL, tableSize > sizeof(*fde)
+			     && tableSize - sizeof(*fde) >= *fde;
+			     tableSize -= sizeof(*fde) + *fde,
+			     fde += 1 + *fde / sizeof(*fde)) {
+				cie = cie_for_fde(fde, table);
+				if (cie == &bad_cie) {
+					cie = NULL;
+					break;
+				}
+				if (cie == NULL
+				    || cie == &not_fde
+				    || (ptrType = fde_pointer_type(cie)) < 0)
+					continue;
+				ptr = (const u8 *)(fde + 2);
+				startLoc = read_pointer(&ptr,
+				                        (const u8 *)(fde + 1) + *fde,
+				                        ptrType);
+				if (!startLoc)
+					continue;
+				if (!(ptrType & DW_EH_PE_indirect))
+					ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
+				endLoc = startLoc
+				         + read_pointer(&ptr,
+				                        (const u8 *)(fde + 1) + *fde,
+				                        ptrType);
+				if (pc >= startLoc && pc < endLoc)
+					break;
+			}
 		}
 	}
 	if (cie != NULL) {
-- 
cgit v1.2.3


From 1d4d262769cd1894a0306b9c57e72f005cd9e75a Mon Sep 17 00:00:00 2001
From: Jan Dittmer <jdi@l4x.org>
Date: Sat, 28 Oct 2006 10:38:38 -0700
Subject: [PATCH] Add missing space in module.c for taintskernel

Obvious fix.

Signed-off-by: Jan Dittmer <jdi@l4x.org>
Acked-by: Florin Malita <fmalita@gmail.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 67009bd56c52..5072a943fe35 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1342,7 +1342,7 @@ static void set_license(struct module *mod, const char *license)
 
 	if (!license_is_gpl_compatible(license)) {
 		if (!(tainted & TAINT_PROPRIETARY_MODULE))
-			printk(KERN_WARNING "%s: module license '%s' taints"
+			printk(KERN_WARNING "%s: module license '%s' taints "
 				"kernel.\n", mod->name, license);
 		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
 	}
-- 
cgit v1.2.3


From 5fa3839a64203b2ab727dcb37da9b2d7079fca28 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Sat, 28 Oct 2006 10:38:46 -0700
Subject: [PATCH] Constify compat_get_bitmap argument

This means we can call it when the bitmap we want to fetch is declared
const.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/compat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 75573e5d27b0..d4898aad6cfa 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -678,7 +678,7 @@ int get_compat_sigevent(struct sigevent *event,
 		? -EFAULT : 0;
 }
 
-long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
+long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
 		       unsigned long bitmap_size)
 {
 	int i, j;
-- 
cgit v1.2.3


From fca178c0c6e8d52a1875be36b070f30884ebfae9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:49 -0700
Subject: [PATCH] fill_tgid: fix task_struct leak and possible oops

1. fill_tgid() forgets to do put_task_struct(first).

2. release_task(first) can happen after fill_tgid() drops tasklist_lock,
   it is unsafe to dereference first->signal.

This is a temporary fix, imho the locking should be reworked.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 5d6a8c54ee85..9aeee511a463 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -237,14 +237,17 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 	} else
 		get_task_struct(first);
 
-	/* Start with stats from dead tasks */
-	spin_lock_irqsave(&first->signal->stats_lock, flags);
-	if (first->signal->stats)
-		memcpy(stats, first->signal->stats, sizeof(*stats));
-	spin_unlock_irqrestore(&first->signal->stats_lock, flags);
 
 	tsk = first;
 	read_lock(&tasklist_lock);
+	/* Start with stats from dead tasks */
+	if (first->signal) {
+		spin_lock_irqsave(&first->signal->stats_lock, flags);
+		if (first->signal->stats)
+			memcpy(stats, first->signal->stats, sizeof(*stats));
+		spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+	}
+
 	do {
 		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
 			continue;
@@ -264,7 +267,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 	 * Accounting subsytems can also add calls here to modify
 	 * fields of taskstats.
 	 */
-
+	put_task_struct(first);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 05d5bcd60e8202e5c7b28cf61186043a4d612623 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:50 -0700
Subject: [PATCH] bacct_add_tsk: fix unsafe and wrong parent/group_leader
 dereference

1. ts = timespec_sub(uptime, current->group_leader->start_time);

   It is possible that current != tsk. Probably it was supposed
   to be 'tsk->group_leader->start_time. But why we are reading
   group_leader's start_time ? This accounting is per thread,
   not per procees, I changed this to 'tsk->start_time.
   Please corect me.

2. stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0;

   tsk->parent never == NULL, and it is unsafe to dereference it.
   Both the task and it's parent may exit after the caller unlocks
   tasklist_lock, the memory could be unmapped (DEBUG_SLAB).
   (And we should use ->real_parent->tgid in fact).

Q: I don't understand the 'if (thread_group_leader(tsk))' check.
Why it is needed ?

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Acked-by: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/tsacct.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index db443221ba5b..65a5036a3d95 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -36,7 +36,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 
 	/* calculate task elapsed time in timespec */
 	do_posix_clock_monotonic_gettime(&uptime);
-	ts = timespec_sub(uptime, current->group_leader->start_time);
+	ts = timespec_sub(uptime, tsk->start_time);
 	/* rebase elapsed time to usec */
 	ac_etime = timespec_to_ns(&ts);
 	do_div(ac_etime, NSEC_PER_USEC);
@@ -58,7 +58,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 	stats->ac_uid	 = tsk->uid;
 	stats->ac_gid	 = tsk->gid;
 	stats->ac_pid	 = tsk->pid;
-	stats->ac_ppid	 = (tsk->parent) ? tsk->parent->pid : 0;
+	rcu_read_lock();
+	stats->ac_ppid	 = pid_alive(tsk) ?
+				rcu_dereference(tsk->real_parent)->tgid : 0;
+	rcu_read_unlock();
 	stats->ac_utime	 = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
 	stats->ac_stime	 = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
 	stats->ac_minflt = tsk->min_flt;
-- 
cgit v1.2.3


From 093a8e8aecd77b2799934996a55a6838e1e2b8f3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:51 -0700
Subject: [PATCH] taskstats_tgid_free: fix usage

taskstats_tgid_free() is called on copy_process's error path. This is wrong.

	IF (clone_flags & CLONE_THREAD)
		We should not clear ->signal->taskstats, current uses it,
		it probably has a valid accumulated info.
	ELSE
		taskstats_tgid_init() set ->signal->taskstats = NULL,
		there is nothing to free.

Move the callsite to __exit_signal(). We don't need any locking, entire
thread group is exiting, nobody should have a reference to soon to be
released ->signal.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 1 +
 kernel/fork.c | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index f250a5e3e281..06de6c4e8ca3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -128,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk)
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
 		flush_sigqueue(&sig->shared_pending);
+		taskstats_tgid_free(sig);
 		__cleanup_signal(sig);
 	}
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 29ebb30850ed..213326609bac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -897,7 +897,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 void __cleanup_signal(struct signal_struct *sig)
 {
 	exit_thread_group_keys(sig);
-	taskstats_tgid_free(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
-- 
cgit v1.2.3


From b8534d7bd89df0cd41cd47bcd6733a05ea9a691a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:53 -0700
Subject: [PATCH] taskstats: kill ->taskstats_lock in favor of ->siglock

signal_struct is (mostly) protected by ->sighand->siglock, I think we don't
need ->taskstats_lock to protect ->stats.  This also allows us to simplify the
locking in fill_tgid().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c      |  2 +-
 kernel/taskstats.c | 16 ++++++----------
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 213326609bac..3da978eec791 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -830,7 +830,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
-		taskstats_tgid_alloc(current->signal);
+		taskstats_tgid_alloc(current);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 9aeee511a463..b2efda94615a 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -241,11 +241,11 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 	tsk = first;
 	read_lock(&tasklist_lock);
 	/* Start with stats from dead tasks */
-	if (first->signal) {
-		spin_lock_irqsave(&first->signal->stats_lock, flags);
+	if (first->sighand) {
+		spin_lock_irqsave(&first->sighand->siglock, flags);
 		if (first->signal->stats)
 			memcpy(stats, first->signal->stats, sizeof(*stats));
-		spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+		spin_unlock_irqrestore(&first->sighand->siglock, flags);
 	}
 
 	do {
@@ -276,7 +276,7 @@ static void fill_tgid_exit(struct task_struct *tsk)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
 	if (!tsk->signal->stats)
 		goto ret;
 
@@ -288,7 +288,7 @@ static void fill_tgid_exit(struct task_struct *tsk)
 	 */
 	delayacct_add_tsk(tsk->signal->stats, tsk);
 ret:
-	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
 	return;
 }
 
@@ -464,15 +464,10 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	size_t size;
 	int is_thread_group;
 	struct nlattr *na;
-	unsigned long flags;
 
 	if (!family_registered || !tidstats)
 		return;
 
-	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
-	is_thread_group = tsk->signal->stats ? 1 : 0;
-	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
-
 	rc = 0;
 	/*
 	 * Size includes space for nested attributes
@@ -480,6 +475,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
+	is_thread_group = (tsk->signal->stats != NULL);
 	if (is_thread_group)
 		size = 2 * size;	/* PID + STATS + TGID + STATS */
 
-- 
cgit v1.2.3


From a98b6094261c0112e9c455c96995972181bff049 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:54 -0700
Subject: [PATCH] taskstats: don't use tasklist_lock

Remove tasklist_lock from taskstats.c. find_task_by_pid() is rcu-safe.
->siglock allows us to traverse subthread without tasklist.

Q: delay accounting looks wrong to me.  If sub-thread has already called
taskstats_exit_send() but didn't call release_task(self) yet it will be
accounted twice.  The window is big.  No?

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 59 ++++++++++++++++++++++--------------------------------
 1 file changed, 24 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b2efda94615a..b724aeea5443 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -174,21 +174,19 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 	up_write(&listeners->sem);
 }
 
-static int fill_pid(pid_t pid, struct task_struct *pidtsk,
+static int fill_pid(pid_t pid, struct task_struct *tsk,
 		struct taskstats *stats)
 {
 	int rc = 0;
-	struct task_struct *tsk = pidtsk;
 
-	if (!pidtsk) {
-		read_lock(&tasklist_lock);
+	if (!tsk) {
+		rcu_read_lock();
 		tsk = find_task_by_pid(pid);
-		if (!tsk) {
-			read_unlock(&tasklist_lock);
+		if (tsk)
+			get_task_struct(tsk);
+		rcu_read_unlock();
+		if (!tsk)
 			return -ESRCH;
-		}
-		get_task_struct(tsk);
-		read_unlock(&tasklist_lock);
 	} else
 		get_task_struct(tsk);
 
@@ -214,40 +212,28 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 
 }
 
-static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
+static int fill_tgid(pid_t tgid, struct task_struct *first,
 		struct taskstats *stats)
 {
-	struct task_struct *tsk, *first;
+	struct task_struct *tsk;
 	unsigned long flags;
+	int rc = -ESRCH;
 
 	/*
 	 * Add additional stats from live tasks except zombie thread group
 	 * leaders who are already counted with the dead tasks
 	 */
-	first = tgidtsk;
-	if (!first) {
-		read_lock(&tasklist_lock);
+	rcu_read_lock();
+	if (!first)
 		first = find_task_by_pid(tgid);
-		if (!first) {
-			read_unlock(&tasklist_lock);
-			return -ESRCH;
-		}
-		get_task_struct(first);
-		read_unlock(&tasklist_lock);
-	} else
-		get_task_struct(first);
 
+	if (!first || !lock_task_sighand(first, &flags))
+		goto out;
 
-	tsk = first;
-	read_lock(&tasklist_lock);
-	/* Start with stats from dead tasks */
-	if (first->sighand) {
-		spin_lock_irqsave(&first->sighand->siglock, flags);
-		if (first->signal->stats)
-			memcpy(stats, first->signal->stats, sizeof(*stats));
-		spin_unlock_irqrestore(&first->sighand->siglock, flags);
-	}
+	if (first->signal->stats)
+		memcpy(stats, first->signal->stats, sizeof(*stats));
 
+	tsk = first;
 	do {
 		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
 			continue;
@@ -260,15 +246,18 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 		delayacct_add_tsk(stats, tsk);
 
 	} while_each_thread(first, tsk);
-	read_unlock(&tasklist_lock);
-	stats->version = TASKSTATS_VERSION;
 
+	unlock_task_sighand(first, &flags);
+	rc = 0;
+out:
+	rcu_read_unlock();
+
+	stats->version = TASKSTATS_VERSION;
 	/*
 	 * Accounting subsytems can also add calls here to modify
 	 * fields of taskstats.
 	 */
-	put_task_struct(first);
-	return 0;
+	return rc;
 }
 
 
-- 
cgit v1.2.3


From d7c3f5f231c60d7e6ada5770b536df2b3ec1bd08 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:54 -0700
Subject: [PATCH] fill_tgid: cleanup delays accounting

fill_tgid() should skip not only an already exited group leader.  If the
task has ->exit_state != 0 it already did exit_notify(), so it also did
fill_tgid_exit()->delayacct_add_tsk(->signal->stats) and we should skip it
to avoid a double accounting.

This patch doesn't close the race completely, but it cleanups the code.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b724aeea5443..8adfb8069c6d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -235,7 +235,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
 
 	tsk = first;
 	do {
-		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
+		if (tsk->exit_state)
 			continue;
 		/*
 		 * Accounting subsystem can call its functions here to
-- 
cgit v1.2.3


From bb1d860551c4307b1a7ee9a21b120319075e987e Mon Sep 17 00:00:00 2001
From: Jim Houston <jim.houston@comcast.net>
Date: Sat, 28 Oct 2006 10:38:56 -0700
Subject: [PATCH] time_adjust cleared before use

I notice that the code which implements adjtime clears the time_adjust
value before using it.  The attached patch makes the obvious fix.

Acked-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Jim Houston <jim.houston@ccur.com>
Cc: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 47195fa0ec4f..3afeaa3a73f9 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -161,9 +161,9 @@ void second_overflow(void)
 			time_adjust += MAX_TICKADJ;
 			tick_length -= MAX_TICKADJ_SCALED;
 		} else {
-			time_adjust = 0;
 			tick_length += (s64)(time_adjust * NSEC_PER_USEC /
 					     HZ) << TICK_LENGTH_SHIFT;
+			time_adjust = 0;
 		}
 	}
 }
-- 
cgit v1.2.3


From 8fa1d7d3b2c51594c0f3aa151983dd51f605e07d Mon Sep 17 00:00:00 2001
From: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com>
Date: Sat, 28 Oct 2006 10:38:57 -0700
Subject: [PATCH] cpu-hotplug: release `workqueue_mutex' properly on CPU
 hot-remove

_cpu_down() acquires `workqueue_mutex' on its process, but doen't release it
if __cpu_disable() fails.

Signed-off-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpu.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 27dd3ee47099..663c920b2234 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -150,18 +150,18 @@ static int _cpu_down(unsigned int cpu)
 	p = __stop_machine_run(take_cpu_down, NULL, cpu);
 	mutex_unlock(&cpu_bitmask_lock);
 
-	if (IS_ERR(p)) {
+	if (IS_ERR(p) || cpu_online(cpu)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
 				(void *)(long)cpu) == NOTIFY_BAD)
 			BUG();
 
-		err = PTR_ERR(p);
-		goto out_allowed;
-	}
-
-	if (cpu_online(cpu))
+		if (IS_ERR(p)) {
+			err = PTR_ERR(p);
+			goto out_allowed;
+		}
 		goto out_thread;
+	}
 
 	/* Wait for it to sleep (leaving idle task). */
 	while (!idle_cpu(cpu))
-- 
cgit v1.2.3


From 057647fc47b3a5fbcfa997041db3f483d506603c Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Sat, 28 Oct 2006 10:38:58 -0700
Subject: [PATCH] workqueue: update kerneldoc

This patch (as812) changes the kerneldoc comments explaining the return
values from queue_work(), queue_delayed_work(), and
queue_delayed_work_on().  The updated comments explain more accurately the
meaning of the return code and avoid suggesting that a 0 value means the
routine was unsuccessful.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3df9bfc7ff78..17c2f03d2c27 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -99,7 +99,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
  * @wq: workqueue to use
  * @work: work to queue
  *
- * Returns non-zero if it was successfully added.
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
  *
  * We queue the work to the CPU it was submitted, but there is no
  * guarantee that it will be processed by that CPU.
@@ -138,7 +138,7 @@ static void delayed_work_timer_fn(unsigned long __data)
  * @work: work to queue
  * @delay: number of jiffies to wait before queueing
  *
- * Returns non-zero if it was successfully added.
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
  */
 int fastcall queue_delayed_work(struct workqueue_struct *wq,
 			struct work_struct *work, unsigned long delay)
@@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
  * @work: work to queue
  * @delay: number of jiffies to wait before queueing
  *
- * Returns non-zero if it was successfully added.
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
  */
 int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work, unsigned long delay)
-- 
cgit v1.2.3


From d46a3d0d07ba539aea5b0e1ad30e568f0cb03576 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 29 Oct 2006 16:45:58 +0300
Subject: [PATCH] taskstats: fix sk_buff leak

'return genlmsg_cancel()' in taskstats_user_cmd/taskstats_exit_send
potentially leaks a skb.  Unless we pass 'rep_skb' to the netlink layer
we own sk_buff.  This means we should always do kfree_skb() on failure.

[ Thomas acked and pointed out missing return value in original version ]

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Thomas Graf <tgraf@suug.ch>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 8adfb8069c6d..f3c3e9d43d2c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -411,7 +411,7 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	return send_reply(rep_skb, info->snd_pid);
 
 nla_put_failure:
-	return genlmsg_cancel(rep_skb, reply);
+	rc = genlmsg_cancel(rep_skb, reply);
 err:
 	nlmsg_free(rep_skb);
 	return rc;
@@ -507,7 +507,6 @@ send:
 
 nla_put_failure:
 	genlmsg_cancel(rep_skb, reply);
-	goto ret;
 err_skb:
 	nlmsg_free(rep_skb);
 ret:
-- 
cgit v1.2.3


From 3d8334def5cf831d2ed438aae021696a2faa4ddd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 29 Oct 2006 18:57:16 +0300
Subject: [PATCH] taskstats: fix sk_buff size calculation

prepare_reply() adds GENL_HDRLEN to the payload (genlmsg_total_size()),
but then it does genlmsg_put()->nlmsg_put().  This means we forget to
reserve a room for 'struct nlmsghdr'.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f3c3e9d43d2c..2039585ec5e1 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -77,7 +77,8 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	/*
 	 * If new attributes are added, please revisit this allocation
 	 */
-	skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL);
+	size = nlmsg_total_size(genlmsg_total_size(size));
+	skb = nlmsg_new(size, GFP_KERNEL);
 	if (!skb)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 0e2d57fc6e7dabdbfdd4f26c861e7e6c75d5bdcf Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 29 Oct 2006 22:46:34 -0800
Subject: [PATCH] ndiswrapper: don't set the module->taints flags

For ndiswrapper, don't set the module->taints flags, just set the kernel
global tainted flag.  This should allow ndiswrapper to continue to use GPL
symbols.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Florin Malita <fmalita@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5072a943fe35..f0166563c602 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1718,7 +1718,7 @@ static struct module *load_module(void __user *umod,
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
 
 	if (strcmp(mod->name, "ndiswrapper") == 0)
-		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+		add_taint(TAINT_PROPRIETARY_MODULE);
 	if (strcmp(mod->name, "driverloader") == 0)
 		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
 
-- 
cgit v1.2.3


From f0ec1aaf54caddd21c259aea8b2ecfbde4ee4fb9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 29 Oct 2006 22:46:43 -0800
Subject: [PATCH] xacct_add_tsk: fix pure theoretical ->mm use-after-free

Paranoid fix. The task can free its ->mm after the 'if (p->mm)' check.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/tsacct.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 65a5036a3d95..96f77013d3f0 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -80,13 +80,17 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
  */
 void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
+	struct mm_struct *mm;
+
 	/* convert pages-jiffies to Mbyte-usec */
 	stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
 	stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
-	if (p->mm) {
+	mm = get_task_mm(p);
+	if (mm) {
 		/* adjust to KB unit */
-		stats->hiwater_rss   = p->mm->hiwater_rss * PAGE_SIZE / KB;
-		stats->hiwater_vm    = p->mm->hiwater_vm * PAGE_SIZE / KB;
+		stats->hiwater_rss   = mm->hiwater_rss * PAGE_SIZE / KB;
+		stats->hiwater_vm    = mm->hiwater_vm * PAGE_SIZE / KB;
+		mmput(mm);
 	}
 	stats->read_char	= p->rchar;
 	stats->write_char	= p->wchar;
-- 
cgit v1.2.3


From 4a279ff1ea1cf325775ada983035123fcdc8e986 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 30 Oct 2006 22:07:15 -0800
Subject: [PATCH] taskstats: fix sub-threads accounting

If there are no listeners, taskstats_exit_send() just returns because
taskstats_exit_alloc() didn't allocate *tidstats.  This is wrong, each
sub-thread should do fill_tgid_exit() on exit, otherwise its ->delays is
not recorded in ->signal->stats and lost.

Q: We don't send TASKSTATS_TYPE_AGGR_TGID when single-threaded process
exits.  Is it good?  How can the listener figure out that it was actually a
process exit, not sub-thread?

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Balbir Singh <balbir@in.ibm.com>
Acked-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2039585ec5e1..f45c5e70773c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -455,10 +455,9 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	int is_thread_group;
 	struct nlattr *na;
 
-	if (!family_registered || !tidstats)
+	if (!family_registered)
 		return;
 
-	rc = 0;
 	/*
 	 * Size includes space for nested attributes
 	 */
@@ -466,8 +465,15 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
 	is_thread_group = (tsk->signal->stats != NULL);
-	if (is_thread_group)
-		size = 2 * size;	/* PID + STATS + TGID + STATS */
+	if (is_thread_group) {
+		/* PID + STATS + TGID + STATS */
+		size = 2 * size;
+		/* fill the tsk->signal->stats structure */
+		fill_tgid_exit(tsk);
+	}
+
+	if (!tidstats)
+		return;
 
 	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
 	if (rc < 0)
@@ -487,11 +493,8 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 		goto send;
 
 	/*
-	 * tsk has/had a thread group so fill the tsk->signal->stats structure
 	 * Doesn't matter if tsk is the leader or the last group member leaving
 	 */
-
-	fill_tgid_exit(tsk);
 	if (!group_dead)
 		goto send;
 
-- 
cgit v1.2.3


From f46c483357c2d87606bbefb511321e3efd4baae0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 2 Nov 2006 22:07:16 -0800
Subject: [PATCH] Add printk_timed_ratelimit()

printk_ratelimit() has global state which makes it not useful for callers
which wish to perform ratelimiting at a particular frequency.

Add a printk_timed_ratelimit() which utilises caller-provided state storage to
permit more flexibility.

This function can in fact be used for things other than printk ratelimiting
and is perhaps poorly named.

Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index f7d427ef5038..66426552fbfe 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
 #include <linux/security.h>
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
+#include <linux/jiffies.h>
 
 #include <asm/uaccess.h>
 
@@ -1101,3 +1102,23 @@ int printk_ratelimit(void)
 				printk_ratelimit_burst);
 }
 EXPORT_SYMBOL(printk_ratelimit);
+
+/**
+ * printk_timed_ratelimit - caller-controlled printk ratelimiting
+ * @caller_jiffies: pointer to caller's state
+ * @interval_msecs: minimum interval between prints
+ *
+ * printk_timed_ratelimit() returns true if more than @interval_msecs
+ * milliseconds have elapsed since the last time printk_timed_ratelimit()
+ * returned true.
+ */
+bool printk_timed_ratelimit(unsigned long *caller_jiffies,
+			unsigned int interval_msecs)
+{
+	if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
+		*caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL(printk_timed_ratelimit);
-- 
cgit v1.2.3


From 19c6b6ed3f597a583f58e3fc99256cc01ae8c394 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 2 Nov 2006 22:07:17 -0800
Subject: [PATCH] schedule removal of FUTEX_FD

Apparently FUTEX_FD is unfixably racy and nothing uses it (or if it does, it
shouldn't).

Add a warning printk, give any remaining users six months to migrate off it.

Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index b364e0026191..93ef30ba209f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1507,6 +1507,13 @@ static int futex_fd(u32 __user *uaddr, int signal)
 	struct futex_q *q;
 	struct file *filp;
 	int ret, err;
+	static unsigned long printk_interval;
+
+	if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
+		printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
+		    	"will be removed from the kernel in June 2007\n",
+			current->comm);
+	}
 
 	ret = -EINVAL;
 	if (!valid_signal(signal))
-- 
cgit v1.2.3


From b918f6e62cd46774f9fc0a3fbba6bd10ad85ee14 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 2 Nov 2006 22:07:19 -0800
Subject: [PATCH] swsusp: debugging

Add a swsusp debugging mode.  This does everything that's needed for a suspend
except for actually suspending.  So we can look in the log messages and work
out a) what code is being slow and b) which drivers are misbehaving.

(1)
# echo testproc > /sys/power/disk
# echo disk > /sys/power/state

This should turn off the non-boot CPU, freeze all processes, wait for 5
seconds and then thaw the processes and the CPU.

(2)
# echo test > /sys/power/disk
# echo disk > /sys/power/state

This should turn off the non-boot CPU, freeze all processes, shrink
memory, suspend all devices, wait for 5 seconds, resume the devices etc.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: Stefan Seyfried <seife@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d3a158a60312..b1fb7866b0b3 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -71,7 +71,7 @@ static inline void platform_finish(void)
 
 static int prepare_processes(void)
 {
-	int error;
+	int error = 0;
 
 	pm_prepare_console();
 
@@ -84,6 +84,12 @@ static int prepare_processes(void)
 		goto thaw;
 	}
 
+	if (pm_disk_mode == PM_DISK_TESTPROC) {
+		printk("swsusp debug: Waiting for 5 seconds.\n");
+		mdelay(5000);
+		goto thaw;
+	}
+
 	/* Free memory before shutting down devices. */
 	if (!(error = swsusp_shrink_memory()))
 		return 0;
@@ -120,13 +126,21 @@ int pm_suspend_disk(void)
 	if (error)
 		return error;
 
+	if (pm_disk_mode == PM_DISK_TESTPROC)
+		goto Thaw;
+
 	suspend_console();
 	error = device_suspend(PMSG_FREEZE);
 	if (error) {
 		resume_console();
 		printk("Some devices failed to suspend\n");
-		unprepare_processes();
-		return error;
+		goto Thaw;
+	}
+
+	if (pm_disk_mode == PM_DISK_TEST) {
+		printk("swsusp debug: Waiting for 5 seconds.\n");
+		mdelay(5000);
+		goto Done;
 	}
 
 	pr_debug("PM: snapshotting memory.\n");
@@ -143,16 +157,17 @@ int pm_suspend_disk(void)
 			power_down(pm_disk_mode);
 		else {
 			swsusp_free();
-			unprepare_processes();
-			return error;
+			goto Thaw;
 		}
-	} else
+	} else {
 		pr_debug("PM: Image restored successfully.\n");
+	}
 
 	swsusp_free();
  Done:
 	device_resume();
 	resume_console();
+ Thaw:
 	unprepare_processes();
 	return error;
 }
@@ -249,6 +264,8 @@ static const char * const pm_disk_modes[] = {
 	[PM_DISK_PLATFORM]	= "platform",
 	[PM_DISK_SHUTDOWN]	= "shutdown",
 	[PM_DISK_REBOOT]	= "reboot",
+	[PM_DISK_TEST]		= "test",
+	[PM_DISK_TESTPROC]	= "testproc",
 };
 
 /**
@@ -303,17 +320,19 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 		}
 	}
 	if (mode) {
-		if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT)
+		if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT ||
+		     mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) {
 			pm_disk_mode = mode;
-		else {
+		} else {
 			if (pm_ops && pm_ops->enter &&
 			    (mode == pm_ops->pm_disk_mode))
 				pm_disk_mode = mode;
 			else
 				error = -EINVAL;
 		}
-	} else
+	} else {
 		error = -EINVAL;
+	}
 
 	pr_debug("PM: suspend-to-disk mode set to '%s'\n",
 		 pm_disk_modes[mode]);
-- 
cgit v1.2.3


From 3fd593979802f81ff6452596ac61e3840f917589 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 2 Nov 2006 22:07:24 -0800
Subject: [PATCH] Create compat_sys_migrate_pages

This is needed on bigendian 64bit architectures.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/compat.c | 33 +++++++++++++++++++++++++++++++++
 kernel/sys_ni.c |  1 +
 2 files changed, 34 insertions(+)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index d4898aad6cfa..6952dd057300 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
 	}
 	return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
 }
+
+asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
+			compat_ulong_t maxnode,
+			const compat_ulong_t __user *old_nodes,
+			const compat_ulong_t __user *new_nodes)
+{
+	unsigned long __user *old = NULL;
+	unsigned long __user *new = NULL;
+	nodemask_t tmp_mask;
+	unsigned long nr_bits;
+	unsigned long size;
+
+	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
+	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+	if (old_nodes) {
+		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
+			return -EFAULT;
+		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
+		if (new_nodes)
+			new = old + size / sizeof(unsigned long);
+		if (copy_to_user(old, nodes_addr(tmp_mask), size))
+			return -EFAULT;
+	}
+	if (new_nodes) {
+		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
+			return -EFAULT;
+		if (new == NULL)
+			new = compat_alloc_user_space(size);
+		if (copy_to_user(new, nodes_addr(tmp_mask), size))
+			return -EFAULT;
+	}
+	return sys_migrate_pages(pid, nr_bits + 1, old, new);
+}
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0e53314b14de..d7306d0f3dfc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,7 @@ cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
 cond_syscall(compat_sys_move_pages);
+cond_syscall(compat_sys_migrate_pages);
 
 /* block-layer dependent */
 cond_syscall(sys_bdflush);
-- 
cgit v1.2.3


From 45c18b0bb579b5c1b89f8c99f1b6ffa4c586ba08 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sat, 4 Nov 2006 10:06:02 -0800
Subject: Fix unlikely (but possible) race condition on task->user access

There's a possible race condition when doing a "switch_uid()" from one
user to another, which could race with another thread doing a signal
allocation and looking at the old thread ->user pointer as it is freed.

This explains an oops reported by Lukasz Trabinski:
	http://permalink.gmane.org/gmane.linux.kernel/462241

We fix this by delaying the (reference-counted) freeing of the user
structure until the thread signal handler lock has been released, so
that we know that the signal allocation has either seen the new value or
has properly incremented the reference count of the old one.

Race identified by Oleg Nesterov.

Cc: Lukasz Trabinski <lukasz@wsisiz.edu.pl>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/user.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 6408c0424291..220e586127a0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user)
 	atomic_dec(&old_user->processes);
 	switch_uid_keyring(new_user);
 	current->user = new_user;
+
+	/*
+	 * We need to synchronize with __sigqueue_alloc()
+	 * doing a get_uid(p->user).. If that saw the old
+	 * user value, we need to wait until it has exited
+	 * its critical region before we can free the old
+	 * structure.
+	 */
+	smp_mb();
+	spin_unlock_wait(&current->sighand->siglock);
+
 	free_uid(old_user);
 	suid_keys(current);
 }
-- 
cgit v1.2.3


From 10b1fbdb0a0ca91847a534ad26d0bc250c25b74f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sat, 4 Nov 2006 13:03:00 -0800
Subject: Make sure "user->sigpending" count is in sync

The previous commit (45c18b0bb579b5c1b89f8c99f1b6ffa4c586ba08, aka "Fix
unlikely (but possible) race condition on task->user access") fixed a
potential oops due to __sigqueue_alloc() getting its "user" pointer out
of sync with switch_user(), and accessing a user pointer that had been
de-allocated on another CPU.

It still left another (much less serious) problem, where a concurrent
__sigqueue_alloc and swich_user could cause sigqueue_alloc to do signal
pending reference counting for a _different_ user than the one it then
actually ended up using.  No oops, but we'd end up with the wrong signal
accounting.

Another case of Oleg's eagle-eyes picking up the problem.

This is trivially fixed by just making sure we load whichever "user"
structure we decide to use (it doesn't matter _which_ one we pick, we
just need to pick one) just once.

Acked-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 7ed8d5304bec..df18c167a2a7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -267,18 +267,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 					 int override_rlimit)
 {
 	struct sigqueue *q = NULL;
+	struct user_struct *user;
 
-	atomic_inc(&t->user->sigpending);
+	/*
+	 * In order to avoid problems with "switch_user()", we want to make
+	 * sure that the compiler doesn't re-load "t->user"
+	 */
+	user = t->user;
+	barrier();
+	atomic_inc(&user->sigpending);
 	if (override_rlimit ||
-	    atomic_read(&t->user->sigpending) <=
+	    atomic_read(&user->sigpending) <=
 			t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
 	if (unlikely(q == NULL)) {
-		atomic_dec(&t->user->sigpending);
+		atomic_dec(&user->sigpending);
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
-		q->user = get_uid(t->user);
+		q->user = get_uid(user);
 	}
 	return(q);
 }
-- 
cgit v1.2.3


From 4b96b1a10cb00c867103b21f0f2a6c91b705db11 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Sun, 5 Nov 2006 23:52:04 -0800
Subject: [PATCH] Fix the spurious unlock_cpu_hotplug false warnings

Cpu-hotplug locking has a minor race case caused because of setting the
variable "recursive" to NULL *after* releasing the cpu_bitmask_lock in the
function unlock_cpu_hotplug,instead of doing so before releasing the
cpu_bitmask_lock.

This was the cause of most of the recent false spurious lock_cpu_unlock
warnings.

This should fix the problem reported by Martin Lorenz reported in
http://lkml.org/lkml/2006/10/29/127.

Thanks to Srinivasa DS for pointing it out.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 663c920b2234..272254f20d97 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void)
 		recursive_depth--;
 		return;
 	}
-	mutex_unlock(&cpu_bitmask_lock);
 	recursive = NULL;
+	mutex_unlock(&cpu_bitmask_lock);
 }
 EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
 
-- 
cgit v1.2.3


From 64efade11cddc4237c1b95ea4ca18af122a7e19e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sun, 5 Nov 2006 23:52:10 -0800
Subject: [PATCH] lockdep: fix delayacct locking bug

Make the delayacct lock irqsave; this avoids the possible deadlock where
an interrupt is taken while holding the delayacct lock which needs to
take the delayacct lock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/delayacct.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 36752f124c6a..66a0ea48751d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -66,6 +66,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
 {
 	struct timespec ts;
 	s64 ns;
+	unsigned long flags;
 
 	do_posix_clock_monotonic_gettime(end);
 	ts = timespec_sub(*end, *start);
@@ -73,10 +74,10 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
 	if (ns < 0)
 		return;
 
-	spin_lock(&current->delays->lock);
+	spin_lock_irqsave(&current->delays->lock, flags);
 	*total += ns;
 	(*count)++;
-	spin_unlock(&current->delays->lock);
+	spin_unlock_irqrestore(&current->delays->lock, flags);
 }
 
 void __delayacct_blkio_start(void)
@@ -104,6 +105,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	s64 tmp;
 	struct timespec ts;
 	unsigned long t1,t2,t3;
+	unsigned long flags;
 
 	/* Though tsk->delays accessed later, early exit avoids
 	 * unnecessary returning of other data
@@ -136,14 +138,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 
 	/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
 
-	spin_lock(&tsk->delays->lock);
+	spin_lock_irqsave(&tsk->delays->lock, flags);
 	tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
 	d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
 	tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
 	d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
 	d->blkio_count += tsk->delays->blkio_count;
 	d->swapin_count += tsk->delays->swapin_count;
-	spin_unlock(&tsk->delays->lock);
+	spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 done:
 	return 0;
@@ -152,11 +154,12 @@ done:
 __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
 {
 	__u64 ret;
+	unsigned long flags;
 
-	spin_lock(&tsk->delays->lock);
+	spin_lock_irqsave(&tsk->delays->lock, flags);
 	ret = nsec_to_clock_t(tsk->delays->blkio_delay +
 				tsk->delays->swapin_delay);
-	spin_unlock(&tsk->delays->lock);
+	spin_unlock_irqrestore(&tsk->delays->lock, flags);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 0e009be8a0c2309f3696df70f72ef0075aa34c9c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 5 Nov 2006 23:52:11 -0800
Subject: [PATCH] Improve the removed sysctl warnings

Don't warn about libpthread's access to kernel.version.  When it receives
-ENOSYS it will read /proc/sys/kernel/version.

If anything else shows up print the sysctl number string.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Cal Peake <cp@absolutedigital.net>
Cc: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bff2c18fb5a..0c8e805bbd6f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2680,13 +2680,33 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
 {
 	static int msg_count;
+	struct __sysctl_args tmp;
+	int name[CTL_MAXNAME];
+	int i;
+
+	/* Read in the sysctl name for better debug message logging */
+	if (copy_from_user(&tmp, args, sizeof(tmp)))
+		return -EFAULT;
+	if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME)
+		return -ENOTDIR;
+	for (i = 0; i < tmp.nlen; i++)
+		if (get_user(name[i], tmp.name + i))
+			return -EFAULT;
+
+	/* Ignore accesses to kernel.version */
+	if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
+		goto out;
 
 	if (msg_count < 5) {
 		msg_count++;
 		printk(KERN_INFO
 			"warning: process `%s' used the removed sysctl "
-			"system call\n", current->comm);
+			"system call with ", current->comm);
+		for (i = 0; i < tmp.nlen; i++)
+			printk("%d.", name[i]);
+		printk("\n");
 	}
+out:
 	return -ENOSYS;
 }
 
-- 
cgit v1.2.3


From d99f160ac53e51090f015a8f0617cea25f81a191 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 5 Nov 2006 23:52:12 -0800
Subject: [PATCH] sysctl: allow a zero ctl_name in the middle of a sysctl table

Since it is becoming clear that there are just enough users of the binary
sysctl interface that completely removing the binary interface from the kernel
will not be an option for foreseeable future, we need to find a way to address
the sysctl maintenance issues.

The basic problem is that sysctl requires one central authority to allocate
sysctl numbers, or else conflicts and ABI breakage occur.  The proc interface
to sysctl does not have that problem, as names are not densely allocated.

By not terminating a sysctl table until I have neither a ctl_name nor a
procname, it becomes simple to add sysctl entries that don't show up in the
binary sysctl interface.  Which allows people to avoid allocating a binary
sysctl value when not needed.

I have audited the kernel code and in my reading I have not found a single
sysctl table that wasn't terminated by a completely zero filled entry.  So
this change in behavior should not affect anything.

I think this mechanism eases the pain enough that combined with a little
disciple we can solve the reoccurring sysctl ABI breakage.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0c8e805bbd6f..09e569f4792b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1315,7 +1315,9 @@ repeat:
 		return -ENOTDIR;
 	if (get_user(n, name))
 		return -EFAULT;
-	for ( ; table->ctl_name; table++) {
+	for ( ; table->ctl_name || table->procname; table++) {
+		if (!table->ctl_name)
+			continue;
 		if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
 			int error;
 			if (table->child) {
@@ -1532,7 +1534,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
 	int len;
 	mode_t mode;
 	
-	for (; table->ctl_name; table++) {
+	for (; table->ctl_name || table->procname; table++) {
 		/* Can't do anything without a proc name. */
 		if (!table->procname)
 			continue;
@@ -1579,7 +1581,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
 static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
 {
 	struct proc_dir_entry *de;
-	for (; table->ctl_name; table++) {
+	for (; table->ctl_name || table->procname; table++) {
 		if (!(de = table->de))
 			continue;
 		if (de->mode & S_IFDIR) {
-- 
cgit v1.2.3


From 0130b0b32ee53dc7add773fcea984f6a26ef1da3 Mon Sep 17 00:00:00 2001
From: Sharyathi Nagesh <sharyath@in.ibm.com>
Date: Fri, 10 Nov 2006 12:27:54 -0800
Subject: [PATCH] fix Data Acess error in dup_fd

On running the Stress Test on machine for more than 72 hours following
error message was observed.

0:mon> e
cpu 0x0: Vector: 300 (Data Access) at [c00000007ce2f7f0]
    pc: c000000000060d90: .dup_fd+0x240/0x39c
    lr: c000000000060d6c: .dup_fd+0x21c/0x39c
    sp: c00000007ce2fa70
   msr: 800000000000b032
   dar: ffffffff00000028
 dsisr: 40000000
  current = 0xc000000074950980
  paca    = 0xc000000000454500
    pid   = 27330, comm = bash

0:mon> t
[c00000007ce2fa70] c000000000060d28 .dup_fd+0x1d8/0x39c (unreliable)
[c00000007ce2fb30] c000000000060f48 .copy_files+0x5c/0x88
[c00000007ce2fbd0] c000000000061f5c .copy_process+0x574/0x1520
[c00000007ce2fcd0] c000000000062f88 .do_fork+0x80/0x1c4
[c00000007ce2fdc0] c000000000011790 .sys_clone+0x5c/0x74
[c00000007ce2fe30] c000000000008950 .ppc_clone+0x8/0xc

The problem is because of race window.  When if(expand) block is executed in
dup_fd unlocking of oldf->file_lock give a window for fdtable in oldf to be
modified.  So actual open_files in oldf may not match with open_files
variable.

Cc: Vadim Lobanov <vlobanov@speakeasy.net>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 3da978eec791..4b4eab2a3161 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -687,6 +687,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		 * the latest pointer.
 		 */
 		spin_lock(&oldf->file_lock);
+		open_files = count_open_files(old_fdt);
 		old_fdt = files_fdtable(oldf);
 	}
 
-- 
cgit v1.2.3


From f72fa707604c015a6625e80f269506032d5430dc Mon Sep 17 00:00:00 2001
From: Pavel Emelianov <xemul@openvz.org>
Date: Fri, 10 Nov 2006 12:27:56 -0800
Subject: [PATCH] Fix misrouted interrupts deadlocks

While testing kernel on machine with "irqpoll" option I've caught such a
lockup:

	__do_IRQ()
	   spin_lock(&desc->lock);
           desc->chip->ack(); /* IRQ is ACKed */
	note_interrupt()
	misrouted_irq()
	handle_IRQ_event()
           if (...)
	      local_irq_enable_in_hardirq();
	/* interrupts are enabled from now */
	...
	__do_IRQ() /* same IRQ we've started from */
	   spin_lock(&desc->lock); /* LOCKUP */

Looking at misrouted_irq() code I've found that a potential deadlock like
this can also take place:

1CPU:
__do_IRQ()
   spin_lock(&desc->lock); /* irq = A */
misrouted_irq()
   for (i = 1; i < NR_IRQS; i++) {
      spin_lock(&desc->lock); /* irq = B */
      if (desc->status & IRQ_INPROGRESS) {

2CPU:
__do_IRQ()
   spin_lock(&desc->lock); /* irq = B */
misrouted_irq()
   for (i = 1; i < NR_IRQS; i++) {
      spin_lock(&desc->lock); /* irq = A */
      if (desc->status & IRQ_INPROGRESS) {

As the second lock on both CPUs is taken before checking that this irq is
being handled in another processor this may cause a deadlock.  This issue
is only theoretical.

I propose the attached patch to fix booth problems: when trying to handle
misrouted IRQ active desc->lock may be unlocked.

Acked-by: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/spurious.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 543ea2e5ad93..9c7e2e4c1fe7 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -147,7 +147,11 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 	if (unlikely(irqfixup)) {
 		/* Don't punish working computers */
 		if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
-			int ok = misrouted_irq(irq);
+			int ok;
+
+			spin_unlock(&desc->lock);
+			ok = misrouted_irq(irq);
+			spin_lock(&desc->lock);
 			if (action_ret == IRQ_NONE)
 				desc->irqs_unhandled -= ok;
 		}
-- 
cgit v1.2.3


From 8b126b77536186eef69d408eb7959ce7f558f251 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 14 Nov 2006 02:03:23 -0800
Subject: [PATCH] setup_irq(): better mismatch debugging

When we get a mismatch between handlers on the same IRQ, all we get is "IRQ
handler type mismatch for IRQ n".  Let's print the name of the
presently-registered handler with which we got the mismatch.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/manage.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6879202afe9a..b385878c6e80 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,6 +216,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 {
 	struct irq_desc *desc = irq_desc + irq;
 	struct irqaction *old, **p;
+	const char *old_name = NULL;
 	unsigned long flags;
 	int shared = 0;
 
@@ -255,8 +256,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		 * set the trigger type must match.
 		 */
 		if (!((old->flags & new->flags) & IRQF_SHARED) ||
-		    ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK))
+		    ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
+			old_name = old->name;
 			goto mismatch;
+		}
 
 #if defined(CONFIG_IRQ_PER_CPU)
 		/* All handlers must agree on per-cpuness */
@@ -322,11 +325,13 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 	return 0;
 
 mismatch:
-	spin_unlock_irqrestore(&desc->lock, flags);
 	if (!(new->flags & IRQF_PROBE_SHARED)) {
 		printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
+		if (old_name)
+			printk(KERN_ERR "current handler: %s\n", old_name);
 		dump_stack();
 	}
+	spin_unlock_irqrestore(&desc->lock, flags);
 	return -EBUSY;
 }
 
-- 
cgit v1.2.3


From 9a3a04ac386f44175b6a4142eaeab3d4170a57f3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.osdl.org>
Date: Tue, 14 Nov 2006 15:20:51 -0800
Subject: Revert "[PATCH] fix Data Acess error in dup_fd"

This reverts commit 0130b0b32ee53dc7add773fcea984f6a26ef1da3.

Sergey Vlasov points out (and Vadim Lobanov concurs) that the bug it was
supposed to fix must be some unrelated memory corruption, and the "fix"
actually causes more problems:

  "However, the new code does not look safe in all cases.  If some other
   task has opened more files while dup_fd() released oldf->file_lock, the
   new code will update open_files to the new larger value.  But newf was
   allocated with the old smaller value of open_files, therefore subsequent
   accesses to newf may try to write into unallocated memory."

so revert it.

Cc: Sharyathi Nagesh <sharyath@in.ibm.com>
Cc: Sergey Vlasov <vsu@altlinux.ru>
Cc: Vadim Lobanov <vlobanov@speakeasy.net>
Cc: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 4b4eab2a3161..3da978eec791 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -687,7 +687,6 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		 * the latest pointer.
 		 */
 		spin_lock(&oldf->file_lock);
-		open_files = count_open_files(old_fdt);
 		old_fdt = files_fdtable(oldf);
 	}
 
-- 
cgit v1.2.3


From b86432b42eba5671969a9e6483ee219674b7ee25 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Thu, 16 Nov 2006 01:19:10 -0800
Subject: [PATCH] some irq_chip variables point to NULL

I got an oops when booting 2.6.19-rc5-mm1 on my ia64 machine.

Below is the log.

Oops 11012296146944 [1]
Modules linked in: binfmt_misc dm_mirror dm_multipath dm_mod thermal processor f
an container button sg eepro100 e100 mii

Pid: 0, CPU 0, comm:              swapper
psr : 0000121008022038 ifs : 800000000000040b ip  : [<a0000001000e1411>]    Not
tainted
ip is at __do_IRQ+0x371/0x3e0
unat: 0000000000000000 pfs : 000000000000040b rsc : 0000000000000003
rnat: 656960155aa56aa5 bsps: a00000010058b890 pr  : 656960155aa55a65
ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c0270033f
csd : 0000000000000000 ssd : 0000000000000000
b0  : a0000001000e1390 b6  : a0000001005beac0 b7  : e00000007f01aa00
f6  : 000000000000000000000 f7  : 0ffe69090000000000000
f8  : 1000a9090000000000000 f9  : 0ffff8000000000000000
f10 : 1000a908ffffff6f70000 f11 : 1003e0000000000000909
r1  : a000000100fbbff0 r2  : 0000000000010002 r3  : 0000000000010001
r8  : fffffffffffbffff r9  : a000000100bd8060 r10 : a000000100dd83b8
r11 : fffffffffffeffff r12 : a000000100bcbbb0 r13 : a000000100bc4000
r14 : 0000000000010000 r15 : 0000000000010000 r16 : a000000100c01aa8
r17 : a000000100d2c350 r18 : 0000000000000000 r19 : a000000100d2c300
r20 : a000000100c01a88 r21 : 0000000080010100 r22 : a000000100c01ac0
r23 : a0000001000108e0 r24 : e000000477980004 r25 : 0000000000000000
r26 : 0000000000000000 r27 : e00000000913400c r28 : e0000004799ee51c
r29 : e0000004778b87f0 r30 : a000000100d2c300 r31 : a00000010005c7e0

Call Trace:
 [<a000000100014600>] show_stack+0x40/0xa0
                                sp=a000000100bcb760 bsp=a000000100bc4f40
 [<a000000100014f00>] show_regs+0x840/0x880
                                sp=a000000100bcb930 bsp=a000000100bc4ee8
 [<a000000100037fb0>] die+0x250/0x320
                                sp=a000000100bcb930 bsp=a000000100bc4ea0
 [<a00000010005e5f0>] ia64_do_page_fault+0x8d0/0xa20
                                sp=a000000100bcb950 bsp=a000000100bc4e50
 [<a00000010000caa0>] ia64_leave_kernel+0x0/0x290
                                sp=a000000100bcb9e0 bsp=a000000100bc4e50
 [<a0000001000e1410>] __do_IRQ+0x370/0x3e0
                                sp=a000000100bcbbb0 bsp=a000000100bc4df0
 [<a000000100011f50>] ia64_handle_irq+0x170/0x220
                                sp=a000000100bcbbb0 bsp=a000000100bc4dc0
 [<a00000010000caa0>] ia64_leave_kernel+0x0/0x290
                                sp=a000000100bcbbb0 bsp=a000000100bc4dc0
 [<a000000100012390>] ia64_pal_call_static+0x90/0xc0
                                sp=a000000100bcbd80 bsp=a000000100bc4d78
 [<a000000100015630>] default_idle+0x90/0x160
                                sp=a000000100bcbd80 bsp=a000000100bc4d58
 [<a000000100014290>] cpu_idle+0x1f0/0x440
                                sp=a000000100bcbe20 bsp=a000000100bc4d18
 [<a000000100009980>] rest_init+0xc0/0xe0
                                sp=a000000100bcbe20 bsp=a000000100bc4d00
 [<a0000001009f8ea0>] start_kernel+0x6a0/0x6c0
                                sp=a000000100bcbe20 bsp=a000000100bc4ca0
 [<a0000001000089f0>] __end_ivt_text+0x6d0/0x6f0
                                sp=a000000100bcbe30 bsp=a000000100bc4c00
 <0>Kernel panic - not syncing: Aiee, killing interrupt handler!

The root cause is that some irq_chip variables, especially ia64_msi_chip,
initiate their memeber end to point to NULL. __do_IRQ doesn't check
if irq_chip->end is null and just calls it after processing the interrupt.

As irq_chip->end is called at many places, so I fix it by reinitiating
irq_chip->end to dummy_irq_chip.end, e.g., a noop function.

Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/chip.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2d0dc3efe813..ebfd24a41858 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -233,6 +233,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
 		chip->shutdown = chip->disable;
 	if (!chip->name)
 		chip->name = chip->typename;
+	if (!chip->end)
+		chip->end = dummy_irq_chip.end;
 }
 
 static inline void mask_ack_irq(struct irq_desc *desc, int irq)
-- 
cgit v1.2.3


From 1ff5683043196b9ad628a5de6bf8eeca52ee8bfd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 17 Nov 2006 19:57:22 +0100
Subject: [PATCH] lockdep: fix static keys in module-allocated percpu areas

lockdep got confused by certain locks in modules:

 INFO: trying to register non-static key.
 the code is fine but needs lockdep annotation.
 turning off the locking correctness validator.

 Call Trace:
  [<ffffffff8026f40d>] dump_trace+0xaa/0x3f2
  [<ffffffff8026f78f>] show_trace+0x3a/0x60
  [<ffffffff8026f9d1>] dump_stack+0x15/0x17
  [<ffffffff802abfe8>] __lock_acquire+0x724/0x9bb
  [<ffffffff802ac52b>] lock_acquire+0x4d/0x67
  [<ffffffff80267139>] rt_spin_lock+0x3d/0x41
  [<ffffffff8839ed3f>] :ip_conntrack:__ip_ct_refresh_acct+0x131/0x174
  [<ffffffff883a1334>] :ip_conntrack:udp_packet+0xbf/0xcf
  [<ffffffff8839f9af>] :ip_conntrack:ip_conntrack_in+0x394/0x4a7
  [<ffffffff8023551f>] nf_iterate+0x41/0x7f
  [<ffffffff8025946a>] nf_hook_slow+0x64/0xd5
  [<ffffffff802369a2>] ip_rcv+0x24e/0x506
  [...]

Steven Rostedt found the bug: static_obj() check did not take
PERCPU_ENOUGH_ROOM into account, so in-module DEFINE_PER_CPU-area locks
were triggering this message.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b739be2a6dc9..c9fefdb1a7db 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1081,7 +1081,8 @@ static int static_obj(void *obj)
 	 */
 	for_each_possible_cpu(i) {
 		start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
-		end   = (unsigned long) &__per_cpu_end   + per_cpu_offset(i);
+		end   = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
+					+ per_cpu_offset(i);
 
 		if ((addr >= start) && (addr < end))
 			return 1;
-- 
cgit v1.2.3


From 52bad64d95bd89e08c49ec5a071fa6dcbe5a1a9c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 22 Nov 2006 14:54:01 +0000
Subject: WorkStruct: Separate delayable and non-delayable events.

Separate delayable work items from non-delayable work items be splitting them
into a separate structure (delayed_work), which incorporates a work_struct and
the timer_list removed from work_struct.

The work_struct struct is huge, and this limits it's usefulness.  On a 64-bit
architecture it's nearly 100 bytes in size.  This reduces that by half for the
non-delayable type of event.

Signed-Off-By: David Howells <dhowells@redhat.com>
---
 kernel/workqueue.c | 51 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 17c2f03d2c27..44fc54b7decf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -122,29 +122,33 @@ EXPORT_SYMBOL_GPL(queue_work);
 
 static void delayed_work_timer_fn(unsigned long __data)
 {
-	struct work_struct *work = (struct work_struct *)__data;
-	struct workqueue_struct *wq = work->wq_data;
+	struct delayed_work *dwork = (struct delayed_work *)__data;
+	struct workqueue_struct *wq = dwork->work.wq_data;
 	int cpu = smp_processor_id();
 
 	if (unlikely(is_single_threaded(wq)))
 		cpu = singlethread_cpu;
 
-	__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+	__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
 }
 
 /**
  * queue_delayed_work - queue work on a workqueue after delay
  * @wq: workqueue to use
- * @work: work to queue
+ * @work: delayable work to queue
  * @delay: number of jiffies to wait before queueing
  *
  * Returns 0 if @work was already on a queue, non-zero otherwise.
  */
 int fastcall queue_delayed_work(struct workqueue_struct *wq,
-			struct work_struct *work, unsigned long delay)
+			struct delayed_work *dwork, unsigned long delay)
 {
 	int ret = 0;
-	struct timer_list *timer = &work->timer;
+	struct timer_list *timer = &dwork->timer;
+	struct work_struct *work = &dwork->work;
+
+	if (delay == 0)
+		return queue_work(wq, work);
 
 	if (!test_and_set_bit(0, &work->pending)) {
 		BUG_ON(timer_pending(timer));
@@ -153,7 +157,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
 		/* This stores wq for the moment, for the timer_fn */
 		work->wq_data = wq;
 		timer->expires = jiffies + delay;
-		timer->data = (unsigned long)work;
+		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
 		add_timer(timer);
 		ret = 1;
@@ -172,10 +176,11 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
  * Returns 0 if @work was already on a queue, non-zero otherwise.
  */
 int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
-			struct work_struct *work, unsigned long delay)
+			struct delayed_work *dwork, unsigned long delay)
 {
 	int ret = 0;
-	struct timer_list *timer = &work->timer;
+	struct timer_list *timer = &dwork->timer;
+	struct work_struct *work = &dwork->work;
 
 	if (!test_and_set_bit(0, &work->pending)) {
 		BUG_ON(timer_pending(timer));
@@ -184,7 +189,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 		/* This stores wq for the moment, for the timer_fn */
 		work->wq_data = wq;
 		timer->expires = jiffies + delay;
-		timer->data = (unsigned long)work;
+		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
 		add_timer_on(timer, cpu);
 		ret = 1;
@@ -468,31 +473,31 @@ EXPORT_SYMBOL(schedule_work);
 
 /**
  * schedule_delayed_work - put work task in global workqueue after delay
- * @work: job to be done
- * @delay: number of jiffies to wait
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait or 0 for immediate execution
  *
  * After waiting for a given time this puts a job in the kernel-global
  * workqueue.
  */
-int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
+int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
 {
-	return queue_delayed_work(keventd_wq, work, delay);
+	return queue_delayed_work(keventd_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work);
 
 /**
  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
  * @cpu: cpu to use
- * @work: job to be done
+ * @dwork: job to be done
  * @delay: number of jiffies to wait
  *
  * After waiting for a given time this puts a job in the kernel-global
  * workqueue on the specified CPU.
  */
 int schedule_delayed_work_on(int cpu,
-			struct work_struct *work, unsigned long delay)
+			struct delayed_work *dwork, unsigned long delay)
 {
-	return queue_delayed_work_on(cpu, keventd_wq, work, delay);
+	return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work_on);
 
@@ -539,12 +544,12 @@ EXPORT_SYMBOL(flush_scheduled_work);
  * cancel_rearming_delayed_workqueue - reliably kill off a delayed
  *			work whose handler rearms the delayed work.
  * @wq:   the controlling workqueue structure
- * @work: the delayed work struct
+ * @dwork: the delayed work struct
  */
 void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
-				       struct work_struct *work)
+				       struct delayed_work *dwork)
 {
-	while (!cancel_delayed_work(work))
+	while (!cancel_delayed_work(dwork))
 		flush_workqueue(wq);
 }
 EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
@@ -552,11 +557,11 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
 /**
  * cancel_rearming_delayed_work - reliably kill off a delayed keventd
  *			work whose handler rearms the delayed work.
- * @work: the delayed work struct
+ * @dwork: the delayed work struct
  */
-void cancel_rearming_delayed_work(struct work_struct *work)
+void cancel_rearming_delayed_work(struct delayed_work *dwork)
 {
-	cancel_rearming_delayed_workqueue(keventd_wq, work);
+	cancel_rearming_delayed_workqueue(keventd_wq, dwork);
 }
 EXPORT_SYMBOL(cancel_rearming_delayed_work);
 
-- 
cgit v1.2.3


From 6bb49e5965c1fc399b4d3cd2b5cf2da535b330c0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 22 Nov 2006 14:54:45 +0000
Subject: WorkStruct: Typedef the work function prototype

Define a type for the work function prototype.  It's not only kept in the
work_struct struct, it's also passed as an argument to several functions.

This makes it easier to change it.

Signed-Off-By: David Howells <dhowells@redhat.com>
---
 kernel/workqueue.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 44fc54b7decf..1e9d61ecf762 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -217,7 +217,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 	while (!list_empty(&cwq->worklist)) {
 		struct work_struct *work = list_entry(cwq->worklist.next,
 						struct work_struct, entry);
-		void (*f) (void *) = work->func;
+		work_func_t f = work->func;
 		void *data = work->data;
 
 		list_del_init(cwq->worklist.next);
@@ -513,7 +513,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
  *
  * schedule_on_each_cpu() is very slow.
  */
-int schedule_on_each_cpu(void (*func)(void *info), void *info)
+int schedule_on_each_cpu(work_func_t func, void *info)
 {
 	int cpu;
 	struct work_struct *works;
@@ -578,7 +578,7 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
  * Returns:	0 - function was executed
  *		1 - function was scheduled for execution
  */
-int execute_in_process_context(void (*fn)(void *data), void *data,
+int execute_in_process_context(work_func_t fn, void *data,
 			       struct execute_work *ew)
 {
 	if (!in_interrupt()) {
-- 
cgit v1.2.3


From 365970a1ea76d81cb1ad2f652acb605f06dae256 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 22 Nov 2006 14:54:49 +0000
Subject: WorkStruct: Merge the pending bit into the wq_data pointer

Reclaim a word from the size of the work_struct by folding the pending bit and
the wq_data pointer together.  This shouldn't cause misalignment problems as
all pointers should be at least 4-byte aligned.

Signed-Off-By: David Howells <dhowells@redhat.com>
---
 kernel/workqueue.c | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1e9d61ecf762..967479756511 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -80,6 +80,29 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
 	return list_empty(&wq->list);
 }
 
+static inline void set_wq_data(struct work_struct *work, void *wq)
+{
+	unsigned long new, old, res;
+
+	/* assume the pending flag is already set and that the task has already
+	 * been queued on this workqueue */
+	new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
+	res = work->management;
+	if (res != new) {
+		do {
+			old = res;
+			new = (unsigned long) wq;
+			new |= (old & WORK_STRUCT_FLAG_MASK);
+			res = cmpxchg(&work->management, old, new);
+		} while (res != old);
+	}
+}
+
+static inline void *get_wq_data(struct work_struct *work)
+{
+	return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
+}
+
 /* Preempt must be disabled. */
 static void __queue_work(struct cpu_workqueue_struct *cwq,
 			 struct work_struct *work)
@@ -87,7 +110,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 	unsigned long flags;
 
 	spin_lock_irqsave(&cwq->lock, flags);
-	work->wq_data = cwq;
+	set_wq_data(work, cwq);
 	list_add_tail(&work->entry, &cwq->worklist);
 	cwq->insert_sequence++;
 	wake_up(&cwq->more_work);
@@ -108,7 +131,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
 	int ret = 0, cpu = get_cpu();
 
-	if (!test_and_set_bit(0, &work->pending)) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
 		if (unlikely(is_single_threaded(wq)))
 			cpu = singlethread_cpu;
 		BUG_ON(!list_empty(&work->entry));
@@ -123,7 +146,7 @@ EXPORT_SYMBOL_GPL(queue_work);
 static void delayed_work_timer_fn(unsigned long __data)
 {
 	struct delayed_work *dwork = (struct delayed_work *)__data;
-	struct workqueue_struct *wq = dwork->work.wq_data;
+	struct workqueue_struct *wq = get_wq_data(&dwork->work);
 	int cpu = smp_processor_id();
 
 	if (unlikely(is_single_threaded(wq)))
@@ -150,12 +173,12 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
 	if (delay == 0)
 		return queue_work(wq, work);
 
-	if (!test_and_set_bit(0, &work->pending)) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
 		BUG_ON(timer_pending(timer));
 		BUG_ON(!list_empty(&work->entry));
 
 		/* This stores wq for the moment, for the timer_fn */
-		work->wq_data = wq;
+		set_wq_data(work, wq);
 		timer->expires = jiffies + delay;
 		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
@@ -182,12 +205,12 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	struct timer_list *timer = &dwork->timer;
 	struct work_struct *work = &dwork->work;
 
-	if (!test_and_set_bit(0, &work->pending)) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
 		BUG_ON(timer_pending(timer));
 		BUG_ON(!list_empty(&work->entry));
 
 		/* This stores wq for the moment, for the timer_fn */
-		work->wq_data = wq;
+		set_wq_data(work, wq);
 		timer->expires = jiffies + delay;
 		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
@@ -223,8 +246,8 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		list_del_init(cwq->worklist.next);
 		spin_unlock_irqrestore(&cwq->lock, flags);
 
-		BUG_ON(work->wq_data != cwq);
-		clear_bit(0, &work->pending);
+		BUG_ON(get_wq_data(work) != cwq);
+		clear_bit(WORK_STRUCT_PENDING, &work->management);
 		f(data);
 
 		spin_lock_irqsave(&cwq->lock, flags);
-- 
cgit v1.2.3


From 65f27f38446e1976cc98fd3004b110fedcddd189 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 22 Nov 2006 14:55:48 +0000
Subject: WorkStruct: Pass the work_struct pointer instead of context data

Pass the work_struct pointer to the work function rather than context data.
The work function can use container_of() to work out the data.

For the cases where the container of the work_struct may go away the moment the
pending bit is cleared, it is made possible to defer the release of the
structure by deferring the clearing of the pending bit.

To make this work, an extra flag is introduced into the management side of the
work_struct.  This governs auto-release of the structure upon execution.

Ordinarily, the work queue executor would release the work_struct for further
scheduling or deallocation by clearing the pending bit prior to jumping to the
work function.  This means that, unless the driver makes some guarantee itself
that the work_struct won't go away, the work function may not access anything
else in the work_struct or its container lest they be deallocated..  This is a
problem if the auxiliary data is taken away (as done by the last patch).

However, if the pending bit is *not* cleared before jumping to the work
function, then the work function *may* access the work_struct and its container
with no problems.  But then the work function must itself release the
work_struct by calling work_release().

In most cases, automatic release is fine, so this is the default.  Special
initiators exist for the non-auto-release case (ending in _NAR).


Signed-Off-By: David Howells <dhowells@redhat.com>
---
 kernel/kmod.c           | 16 ++++++++++------
 kernel/kthread.c        | 13 ++++++++-----
 kernel/power/poweroff.c |  4 ++--
 kernel/sys.c            |  4 ++--
 kernel/workqueue.c      | 19 ++++++++-----------
 5 files changed, 30 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb4e29d924e4..7dc7a9dad6ac 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module);
 #endif /* CONFIG_KMOD */
 
 struct subprocess_info {
+	struct work_struct work;
 	struct completion *complete;
 	char *path;
 	char **argv;
@@ -221,9 +222,10 @@ static int wait_for_helper(void *data)
 }
 
 /* This is run by khelper thread  */
-static void __call_usermodehelper(void *data)
+static void __call_usermodehelper(struct work_struct *work)
 {
-	struct subprocess_info *sub_info = data;
+	struct subprocess_info *sub_info =
+		container_of(work, struct subprocess_info, work);
 	pid_t pid;
 	int wait = sub_info->wait;
 
@@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct subprocess_info sub_info = {
+		.work		= __WORK_INITIALIZER(sub_info.work,
+						     __call_usermodehelper),
 		.complete	= &done,
 		.path		= path,
 		.argv		= argv,
@@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 		.wait		= wait,
 		.retval		= 0,
 	};
-	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
 	if (!khelper_wq)
 		return -EBUSY;
@@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 	if (path[0] == '\0')
 		return 0;
 
-	queue_work(khelper_wq, &work);
+	queue_work(khelper_wq, &sub_info.work);
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
@@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 {
 	DECLARE_COMPLETION(done);
 	struct subprocess_info sub_info = {
+		.work		= __WORK_INITIALIZER(sub_info.work,
+						     __call_usermodehelper),
 		.complete	= &done,
 		.path		= path,
 		.argv		= argv,
@@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 		.retval		= 0,
 	};
 	struct file *f;
-	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
 	if (!khelper_wq)
 		return -EBUSY;
@@ -318,7 +322,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 	}
 	sub_info.stdin = f;
 
-	queue_work(khelper_wq, &work);
+	queue_work(khelper_wq, &sub_info.work);
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4f9c60ef95e8..1db8c72d0d38 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -31,6 +31,8 @@ struct kthread_create_info
 	/* Result passed back to kthread_create() from keventd. */
 	struct task_struct *result;
 	struct completion done;
+
+	struct work_struct work;
 };
 
 struct kthread_stop_info
@@ -111,9 +113,10 @@ static int kthread(void *_create)
 }
 
 /* We are keventd: create a thread. */
-static void keventd_create_kthread(void *_create)
+static void keventd_create_kthread(struct work_struct *work)
 {
-	struct kthread_create_info *create = _create;
+	struct kthread_create_info *create =
+		container_of(work, struct kthread_create_info, work);
 	int pid;
 
 	/* We want our own signal handler (we take no signals by default). */
@@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 				   ...)
 {
 	struct kthread_create_info create;
-	DECLARE_WORK(work, keventd_create_kthread, &create);
 
 	create.threadfn = threadfn;
 	create.data = data;
 	init_completion(&create.started);
 	init_completion(&create.done);
+	INIT_WORK(&create.work, keventd_create_kthread);
 
 	/*
 	 * The workqueue needs to start up first:
 	 */
 	if (!helper_wq)
-		work.func(work.data);
+		create.work.func(&create.work);
 	else {
-		queue_work(helper_wq, &work);
+		queue_work(helper_wq, &create.work);
 		wait_for_completion(&create.done);
 	}
 	if (!IS_ERR(create.result)) {
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index f1f900ac3164..678ec736076b 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -16,12 +16,12 @@
  * callback we use.
  */
 
-static void do_poweroff(void *dummy)
+static void do_poweroff(struct work_struct *dummy)
 {
 	kernel_power_off();
 }
 
-static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
+static DECLARE_WORK(poweroff_work, do_poweroff);
 
 static void handle_poweroff(int key, struct tty_struct *tty)
 {
diff --git a/kernel/sys.c b/kernel/sys.c
index 98489d82801b..c87b461de38d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	return 0;
 }
 
-static void deferred_cad(void *dummy)
+static void deferred_cad(struct work_struct *dummy)
 {
 	kernel_restart(NULL);
 }
@@ -892,7 +892,7 @@ static void deferred_cad(void *dummy)
  */
 void ctrl_alt_del(void)
 {
-	static DECLARE_WORK(cad_work, deferred_cad, NULL);
+	static DECLARE_WORK(cad_work, deferred_cad);
 
 	if (C_A_D)
 		schedule_work(&cad_work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 967479756511..8d1e7cb8a51a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -241,14 +241,14 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		struct work_struct *work = list_entry(cwq->worklist.next,
 						struct work_struct, entry);
 		work_func_t f = work->func;
-		void *data = work->data;
 
 		list_del_init(cwq->worklist.next);
 		spin_unlock_irqrestore(&cwq->lock, flags);
 
 		BUG_ON(get_wq_data(work) != cwq);
-		clear_bit(WORK_STRUCT_PENDING, &work->management);
-		f(data);
+		if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
+			work_release(work);
+		f(work);
 
 		spin_lock_irqsave(&cwq->lock, flags);
 		cwq->remove_sequence++;
@@ -527,7 +527,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 /**
  * schedule_on_each_cpu - call a function on each online CPU from keventd
  * @func: the function to call
- * @info: a pointer to pass to func()
  *
  * Returns zero on success.
  * Returns -ve errno on failure.
@@ -536,7 +535,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
  *
  * schedule_on_each_cpu() is very slow.
  */
-int schedule_on_each_cpu(work_func_t func, void *info)
+int schedule_on_each_cpu(work_func_t func)
 {
 	int cpu;
 	struct work_struct *works;
@@ -547,7 +546,7 @@ int schedule_on_each_cpu(work_func_t func, void *info)
 
 	mutex_lock(&workqueue_mutex);
 	for_each_online_cpu(cpu) {
-		INIT_WORK(per_cpu_ptr(works, cpu), func, info);
+		INIT_WORK(per_cpu_ptr(works, cpu), func);
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
 				per_cpu_ptr(works, cpu));
 	}
@@ -591,7 +590,6 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
 /**
  * execute_in_process_context - reliably execute the routine with user context
  * @fn:		the function to execute
- * @data:	data to pass to the function
  * @ew:		guaranteed storage for the execute work structure (must
  *		be available when the work executes)
  *
@@ -601,15 +599,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
  * Returns:	0 - function was executed
  *		1 - function was scheduled for execution
  */
-int execute_in_process_context(work_func_t fn, void *data,
-			       struct execute_work *ew)
+int execute_in_process_context(work_func_t fn, struct execute_work *ew)
 {
 	if (!in_interrupt()) {
-		fn(data);
+		fn(&ew->work);
 		return 0;
 	}
 
-	INIT_WORK(&ew->work, fn, data);
+	INIT_WORK(&ew->work, fn);
 	schedule_work(&ew->work);
 
 	return 1;
-- 
cgit v1.2.3


From c4028958b6ecad064b1a6303a6a5906d4fe48d73 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 22 Nov 2006 14:57:56 +0000
Subject: WorkStruct: make allyesconfig

Fix up for make allyesconfig.

Signed-Off-By: David Howells <dhowells@redhat.com>
---
 kernel/relay.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index f04bbdb56ac2..2b92e8ece85b 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -308,9 +308,10 @@ static struct rchan_callbacks default_channel_callbacks = {
  *	reason waking is deferred is that calling directly from write
  *	causes problems if you're writing from say the scheduler.
  */
-static void wakeup_readers(void *private)
+static void wakeup_readers(struct work_struct *work)
 {
-	struct rchan_buf *buf = private;
+	struct rchan_buf *buf =
+		container_of(work, struct rchan_buf, wake_readers.work);
 	wake_up_interruptible(&buf->read_wait);
 }
 
@@ -328,7 +329,7 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
 	if (init) {
 		init_waitqueue_head(&buf->read_wait);
 		kref_init(&buf->kref);
-		INIT_WORK(&buf->wake_readers, NULL, NULL);
+		INIT_DELAYED_WORK(&buf->wake_readers, NULL);
 	} else {
 		cancel_delayed_work(&buf->wake_readers);
 		flush_scheduled_work();
@@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 			buf->padding[old_subbuf];
 		smp_mb();
 		if (waitqueue_active(&buf->read_wait)) {
-			PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
+			PREPARE_DELAYED_WORK(&buf->wake_readers,
+					     wakeup_readers);
 			schedule_delayed_work(&buf->wake_readers, 1);
 		}
 	}
-- 
cgit v1.2.3


From b42172fc7b569a0ef2b0fa38d71382969074c0e2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.osdl.org>
Date: Wed, 22 Nov 2006 09:32:06 -0800
Subject: Don't call "note_interrupt()" with irq descriptor lock held

This reverts commit f72fa707604c015a6625e80f269506032d5430dc, and solves
the problem that it tried to fix by simply making "__do_IRQ()" call the
note_interrupt() function without the lock held, the way everybody else
does.

It should be noted that all interrupt handling code must never allow the
descriptor actors to be entered "recursively" (that's why we do all the
magic IRQ_PENDING stuff in the first place), so there actually is
exclusion at that much higher level, even in the absense of locking.

Acked-by: Vivek Goyal <vgoyal@in.ibm.com>
Acked-by:Pavel Emelianov <xemul@openvz.org>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/handle.c   | 4 ++--
 kernel/irq/spurious.c | 6 +-----
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 42aa6f1a3f0f..a681912bc89a 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -231,10 +231,10 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
 		spin_unlock(&desc->lock);
 
 		action_ret = handle_IRQ_event(irq, action);
-
-		spin_lock(&desc->lock);
 		if (!noirqdebug)
 			note_interrupt(irq, desc, action_ret);
+
+		spin_lock(&desc->lock);
 		if (likely(!(desc->status & IRQ_PENDING)))
 			break;
 		desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 9c7e2e4c1fe7..543ea2e5ad93 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -147,11 +147,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 	if (unlikely(irqfixup)) {
 		/* Don't punish working computers */
 		if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
-			int ok;
-
-			spin_unlock(&desc->lock);
-			ok = misrouted_irq(irq);
-			spin_lock(&desc->lock);
+			int ok = misrouted_irq(irq);
 			if (action_ret == IRQ_NONE)
 				desc->irqs_unhandled -= ok;
 		}
-- 
cgit v1.2.3


From 753ca4f312a4b26940e4731b4fa5dbbbbcc77e97 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Sat, 25 Nov 2006 11:09:34 -0800
Subject: [PATCH] fix copy_process() error check

The return value of copy_process() should be checked by IS_ERR().

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 3da978eec791..8cdd3e72ba55 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1315,9 +1315,8 @@ struct task_struct * __devinit fork_idle(int cpu)
 	struct pt_regs regs;
 
 	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
-	if (!task)
-		return ERR_PTR(-ENOMEM);
-	init_idle(task, cpu);
+	if (!IS_ERR(task))
+		init_idle(task, cpu);
 
 	return task;
 }
-- 
cgit v1.2.3


From cfd3ef2346f924d6c0e82236c20fdb3a8840136a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 25 Nov 2006 11:09:37 -0800
Subject: [PATCH] lockdep: spin_lock_irqsave_nested()

Introduce spin_lock_irqsave_nested(); implementation from:
 http://lkml.org/lkml/2006/6/1/122
Patch from:
 http://lkml.org/lkml/2006/9/13/258

[akpm@osdl.org: two compile fixes]
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Jiri Kosina <jikos@jikos.cz>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/spinlock.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 476c3741511b..2c6c2bf85514 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -293,6 +293,27 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
 }
 
 EXPORT_SYMBOL(_spin_lock_nested);
+unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preempt_disable();
+	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	/*
+	 * On lockdep we dont want the hand-coded irq-enable of
+	 * _raw_spin_lock_flags() code, because lockdep assumes
+	 * that interrupts are not re-enabled during lock-acquire:
+	 */
+#ifdef CONFIG_PROVE_SPIN_LOCKING
+	_raw_spin_lock(lock);
+#else
+	_raw_spin_lock_flags(lock, &flags);
+#endif
+	return flags;
+}
+
+EXPORT_SYMBOL(_spin_lock_irqsave_nested);
 
 #endif
 
-- 
cgit v1.2.3


From ff0a538d8b08700df2b46f9aafc9fb2765071f0a Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 28 Nov 2006 20:12:59 +0100
Subject: [PATCH] x86-64: work around gcc4 issue with -Os in Dwarf2 stack
 unwind

This fixes a problem with gcc4 mis-compiling the stack unwind code under
-Os, which resulted in 'stuck' messages whenever an assembly routine was
encountered.

(The second hunk is trivial cleanup.)

Signed-off-by: Jan Beulich <jbeulich@novell.com>
---
 kernel/unwind.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/unwind.c b/kernel/unwind.c
index f7e50d16dbf6..ed0a21d4a902 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -938,8 +938,11 @@ int unwind(struct unwind_frame_info *frame)
 		else {
 			retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
 			/* skip augmentation */
-			if (((const char *)(cie + 2))[1] == 'z')
-				ptr += get_uleb128(&ptr, end);
+			if (((const char *)(cie + 2))[1] == 'z') {
+				uleb128_t augSize = get_uleb128(&ptr, end);
+
+				ptr += augSize;
+			}
 			if (ptr > end
 			   || retAddrReg >= ARRAY_SIZE(reg_info)
 			   || REG_INVALID(retAddrReg)
@@ -963,9 +966,7 @@ int unwind(struct unwind_frame_info *frame)
 	if (cie == NULL || fde == NULL) {
 #ifdef CONFIG_FRAME_POINTER
 		unsigned long top, bottom;
-#endif
 
-#ifdef CONFIG_FRAME_POINTER
 		top = STACK_TOP(frame->task);
 		bottom = STACK_BOTTOM(frame->task);
 # if FRAME_RETADDR_OFFSET < 0
-- 
cgit v1.2.3


From 3cce4856ff3dfa663b1a168dab48120d70820da6 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 28 Nov 2006 12:29:43 -0800
Subject: [PATCH] fix create_write_pipe() error check

The return value of create_write_pipe()/create_read_pipe() should be
checked by IS_ERR().

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kmod.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb4e29d924e4..2b76dee28496 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -307,14 +307,14 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 		return 0;
 
 	f = create_write_pipe();
-	if (!f)
-		return -ENOMEM;
+	if (IS_ERR(f))
+		return PTR_ERR(f);
 	*filp = f;
 
 	f = create_read_pipe(f);
-	if (!f) {
+	if (IS_ERR(f)) {
 		free_write_pipe(*filp);
-		return -ENOMEM;
+		return PTR_ERR(f);
 	}
 	sub_info.stdin = f;
 
-- 
cgit v1.2.3


From e17e0f51aeea4e59c7e450a1c0f26605b91c1260 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Fri, 24 Nov 2006 12:15:25 +0100
Subject: Driver core: show drivers in /sys/module/

Show the drivers, which belong to the module:
  $ ls -l /sys/module/usbcore/drivers/
  hub -> ../../../bus/usb/drivers/hub
  usb -> ../../../bus/usb/drivers/usb
  usbfs -> ../../../bus/usb/drivers/usbfs

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index f0166563c602..45e01cb60101 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1086,22 +1086,35 @@ static int mod_sysfs_setup(struct module *mod,
 		goto out;
 	kobj_set_kset_s(&mod->mkobj, module_subsys);
 	mod->mkobj.mod = mod;
-	err = kobject_register(&mod->mkobj.kobj);
+
+	/* delay uevent until full sysfs population */
+	kobject_init(&mod->mkobj.kobj);
+	err = kobject_add(&mod->mkobj.kobj);
 	if (err)
 		goto out;
 
+	mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers");
+	if (!mod->drivers_dir)
+		goto out_unreg;
+
 	err = module_param_sysfs_setup(mod, kparam, num_params);
 	if (err)
-		goto out_unreg;
+		goto out_unreg_drivers;
 
 	err = module_add_modinfo_attrs(mod);
 	if (err)
-		goto out_unreg;
+		goto out_unreg_param;
 
+	kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
 	return 0;
 
+out_unreg_drivers:
+	kobject_unregister(mod->drivers_dir);
+out_unreg_param:
+	module_param_sysfs_remove(mod);
 out_unreg:
-	kobject_unregister(&mod->mkobj.kobj);
+	kobject_del(&mod->mkobj.kobj);
+	kobject_put(&mod->mkobj.kobj);
 out:
 	return err;
 }
@@ -1110,6 +1123,7 @@ static void mod_kobject_remove(struct module *mod)
 {
 	module_remove_modinfo_attrs(mod);
 	module_param_sysfs_remove(mod);
+	kobject_unregister(mod->drivers_dir);
 
 	kobject_unregister(&mod->mkobj.kobj);
 }
@@ -2275,11 +2289,14 @@ void print_modules(void)
 
 void module_add_driver(struct module *mod, struct device_driver *drv)
 {
+	int no_warn;
+
 	if (!mod || !drv)
 		return;
 
-	/* Don't check return code; this call is idempotent */
-	sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
+	/* Don't check return codes; these calls are idempotent */
+	no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
+	no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name);
 }
 EXPORT_SYMBOL(module_add_driver);
 
@@ -2288,6 +2305,8 @@ void module_remove_driver(struct device_driver *drv)
 	if (!drv)
 		return;
 	sysfs_remove_link(&drv->kobj, "module");
+	if (drv->owner && drv->owner->drivers_dir)
+		sysfs_remove_link(drv->owner->drivers_dir, drv->name);
 }
 EXPORT_SYMBOL(module_remove_driver);
 
-- 
cgit v1.2.3


From 339bf98ffc6a8d8eb16fc532ac57ffbced2f8a68 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 10 Nov 2006 14:10:15 -0800
Subject: [NETLINK]: Do precise netlink message allocations where possible

Account for the netlink message header size directly in nlmsg_new()
instead of relying on the caller calculate it correctly.

Replaces error handling of message construction functions when
constructing notifications with bug traps since a failure implies
a bug in calculating the size of the skb.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/taskstats.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f45c5e70773c..4f3f0e48c845 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -77,8 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	/*
 	 * If new attributes are added, please revisit this allocation
 	 */
-	size = nlmsg_total_size(genlmsg_total_size(size));
-	skb = nlmsg_new(size, GFP_KERNEL);
+	skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL);
 	if (!skb)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 3dabc7157859e706770c825aa229f8943db4e0e1 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 14 Nov 2006 19:44:52 -0800
Subject: [GENL]: Add genlmsg_new() to allocate generic netlink messages

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/taskstats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4f3f0e48c845..faa5239813ce 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -77,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	/*
 	 * If new attributes are added, please revisit this allocation
 	 */
-	skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL);
+	skb = genlmsg_new(size, GFP_KERNEL);
 	if (!skb)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 17c157c889f4b07258af6bfec9e4e9dcf3c00178 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 14 Nov 2006 19:46:02 -0800
Subject: [GENL]: Add genlmsg_put_reply() to simplify building reply headers

By modyfing genlmsg_put() to take a genl_family and by adding
genlmsg_put_reply() the process of constructing the netlink
and generic netlink headers is simplified.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/taskstats.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index faa5239813ce..d3d28919d4b4 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -85,13 +85,9 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 		int seq = get_cpu_var(taskstats_seqnum)++;
 		put_cpu_var(taskstats_seqnum);
 
-		reply = genlmsg_put(skb, 0, seq,
-				family.id, 0, 0,
-				cmd, family.version);
+		reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
 	} else
-		reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
-				family.id, 0, 0,
-				cmd, family.version);
+		reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
 	if (reply == NULL) {
 		nlmsg_free(skb);
 		return -EINVAL;
-- 
cgit v1.2.3


From f6a570333e554b48ad589e7137c77c57809eee81 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 18 Oct 2006 01:47:25 -0400
Subject: [PATCH] severing module.h->sched.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/latency.c |  1 +
 kernel/module.c  | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/latency.c b/kernel/latency.c
index 258f2555abbc..e63fcacb61a7 100644
--- a/kernel/latency.c
+++ b/kernel/latency.c
@@ -36,6 +36,7 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/notifier.h>
+#include <linux/jiffies.h>
 #include <asm/atomic.h>
 
 struct latency_info {
diff --git a/kernel/module.c b/kernel/module.c
index 45e01cb60101..e2d09d604ca0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -34,10 +34,10 @@
 #include <linux/err.h>
 #include <linux/vermagic.h>
 #include <linux/notifier.h>
+#include <linux/sched.h>
 #include <linux/stop_machine.h>
 #include <linux/device.h>
 #include <linux/string.h>
-#include <linux/sched.h>
 #include <linux/mutex.h>
 #include <linux/unwind.h>
 #include <asm/uaccess.h>
@@ -790,6 +790,19 @@ static struct module_attribute refcnt = {
 	.show = show_refcnt,
 };
 
+void module_put(struct module *module)
+{
+	if (module) {
+		unsigned int cpu = get_cpu();
+		local_dec(&module->ref[cpu].count);
+		/* Maybe they're waiting for us to drop reference? */
+		if (unlikely(!module_is_live(module)))
+			wake_up_process(module->waiter);
+		put_cpu();
+	}
+}
+EXPORT_SYMBOL(module_put);
+
 #else /* !CONFIG_MODULE_UNLOAD */
 static void print_unload_info(struct seq_file *m, struct module *mod)
 {
-- 
cgit v1.2.3


From a1f8e7f7fb9d7e2cbcb53170edca7c0ac4680697 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 19 Oct 2006 16:08:53 -0400
Subject: [PATCH] severing skbuff.h -> highmem.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 42f2f1179711..ab97e5101232 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
 #include <linux/tty.h>
 #include <linux/selinux.h>
 #include <linux/binfmts.h>
+#include <linux/highmem.h>
 #include <linux/syscalls.h>
 
 #include "audit.h"
-- 
cgit v1.2.3


From f95d47caae5302a63d92be9a0292abc90e2a14e1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] i386: Use %gs as the PDA base-segment in the kernel

This patch is the meat of the PDA change.  This patch makes several related
changes:

1: Most significantly, %gs is now used in the kernel.  This means that on
   entry, the old value of %gs is saved away, and it is reloaded with
   __KERNEL_PDA.

2: entry.S constructs the stack in the shape of struct pt_regs, and this
   is passed around the kernel so that the process's saved register
   state can be accessed.

   Unfortunately struct pt_regs doesn't currently have space for %gs
   (or %fs). This patch extends pt_regs to add space for gs (no space
   is allocated for %fs, since it won't be used, and it would just
   complicate the code in entry.S to work around the space).

3: Because %gs is now saved on the stack like %ds, %es and the integer
   registers, there are a number of places where it no longer needs to
   be handled specially; namely context switch, and saving/restoring the
   register state in a signal context.

4: And since kernel threads run in kernel space and call normal kernel
   code, they need to be created with their %gs == __KERNEL_PDA.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8cdd3e72ba55..fd22245e3881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1303,7 +1303,7 @@ fork_out:
 	return ERR_PTR(retval);
 }
 
-struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
+noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
 {
 	memset(regs, 0, sizeof(struct pt_regs));
 	return regs;
-- 
cgit v1.2.3


From 0741f4d207a644482d7a040f05cd264c98cf7ee8 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] x86: add sysctl for kstack_depth_to_print

Add sysctl for kstack_depth_to_print. This lets users change
the amount of raw stack data printed in dump_stack() without
having to reboot.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/sysctl.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 09e569f4792b..6fc5e17086f4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
 
 #ifdef CONFIG_X86
 #include <asm/nmi.h>
+#include <asm/stacktrace.h>
 #endif
 
 #if defined(CONFIG_SYSCTL)
@@ -707,6 +708,14 @@ static ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "kstack_depth_to_print",
+		.data		= &kstack_depth_to_print,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 #if defined(CONFIG_MMU)
 	{
-- 
cgit v1.2.3


From e2124bb8d369a4bc1afde1959040e33d71c41d5e Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] unwinder: Use probe_kernel_address instead of __get_user in
 kernel/unwind.c

This avoids trouble with the page fault handler if the fault
happens inside an interrupt context.

Suggested by Linus

Cc: jbeulich@novell.com
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/unwind.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/unwind.c b/kernel/unwind.c
index ed0a21d4a902..af48168a3afb 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -14,6 +14,7 @@
 #include <linux/bootmem.h>
 #include <linux/sort.h>
 #include <linux/stop_machine.h>
+#include <linux/uaccess.h>
 #include <asm/sections.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
@@ -550,7 +551,7 @@ static unsigned long read_pointer(const u8 **pLoc,
 		return 0;
 	}
 	if ((ptrType & DW_EH_PE_indirect)
-	    && __get_user(value, (unsigned long *)value))
+	    && probe_kernel_address((unsigned long *)value, value))
 		return 0;
 	*pLoc = ptr.p8;
 
@@ -982,18 +983,19 @@ int unwind(struct unwind_frame_info *frame)
 		        & (sizeof(unsigned long) - 1))) {
 			unsigned long link;
 
-			if (!__get_user(link,
+			if (!probe_kernel_address(
 			                (unsigned long *)(UNW_FP(frame)
-			                                  + FRAME_LINK_OFFSET))
+			                                  + FRAME_LINK_OFFSET),
+						  link)
 # if FRAME_RETADDR_OFFSET < 0
 			   && link > bottom && link < UNW_FP(frame)
 # else
 			   && link > UNW_FP(frame) && link < bottom
 # endif
 			   && !(link & (sizeof(link) - 1))
-			   && !__get_user(UNW_PC(frame),
+			   && !probe_kernel_address(
 			                  (unsigned long *)(UNW_FP(frame)
-			                                    + FRAME_RETADDR_OFFSET))) {
+			                                    + FRAME_RETADDR_OFFSET), UNW_PC(frame))) {
 				UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
 # if FRAME_RETADDR_OFFSET < 0
 					-
@@ -1104,7 +1106,7 @@ int unwind(struct unwind_frame_info *frame)
 					return -EIO;
 				switch(reg_info[i].width) {
 #define CASE(n)     case sizeof(u##n): \
-					__get_user(FRAME_REG(i, u##n), (u##n *)addr); \
+					probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \
 					break
 				CASES;
 #undef CASE
-- 
cgit v1.2.3


From eef5e0d185fc049bda11fa14ba286fbd357da896 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] unwinder: Remove lockdep disabling of nested locks for
 unwinder

Shouldn't be needed anymore since __kernel_text_address
is used unconditionally on x86-64

Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/lockdep.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c9fefdb1a7db..9bb8d784eb02 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -228,11 +228,7 @@ static int save_trace(struct stack_trace *trace)
 	trace->skip = 3;
 	trace->all_contexts = 0;
 
-	/* Make sure to not recurse in case the the unwinder needs to tak
-e	   locks. */
-	lockdep_off();
 	save_stack_trace(trace, NULL);
-	lockdep_on();
 
 	trace->max_entries = trace->nr_entries;
 
-- 
cgit v1.2.3


From 359ad0d4015a9ab39243f2ebc4eb07915bd618b2 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] unwinder: more sanity checks in Dwarf2 unwinder

Tighten the requirements on both input to and output from the Dwarf2
unwinder.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/unwind.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/unwind.c b/kernel/unwind.c
index af48168a3afb..7e721f104105 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -95,6 +95,7 @@ static const struct {
 
 typedef unsigned long uleb128_t;
 typedef   signed long sleb128_t;
+#define sleb128abs __builtin_labs
 
 static struct unwind_table {
 	struct {
@@ -787,7 +788,7 @@ int unwind(struct unwind_frame_info *frame)
 #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
 	const u32 *fde = NULL, *cie = NULL;
 	const u8 *ptr = NULL, *end = NULL;
-	unsigned long pc = UNW_PC(frame) - frame->call_frame;
+	unsigned long pc = UNW_PC(frame) - frame->call_frame, sp;
 	unsigned long startLoc = 0, endLoc = 0, cfa;
 	unsigned i;
 	signed ptrType = -1;
@@ -936,6 +937,9 @@ int unwind(struct unwind_frame_info *frame)
 		state.dataAlign = get_sleb128(&ptr, end);
 		if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
 			cie = NULL;
+		else if (UNW_PC(frame) % state.codeAlign
+		         || UNW_SP(frame) % sleb128abs(state.dataAlign))
+			return -EPERM;
 		else {
 			retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
 			/* skip augmentation */
@@ -968,6 +972,8 @@ int unwind(struct unwind_frame_info *frame)
 #ifdef CONFIG_FRAME_POINTER
 		unsigned long top, bottom;
 
+		if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long))
+			return -EPERM;
 		top = STACK_TOP(frame->task);
 		bottom = STACK_BOTTOM(frame->task);
 # if FRAME_RETADDR_OFFSET < 0
@@ -1018,6 +1024,7 @@ int unwind(struct unwind_frame_info *frame)
 	   || state.regs[retAddrReg].where == Nowhere
 	   || state.cfa.reg >= ARRAY_SIZE(reg_info)
 	   || reg_info[state.cfa.reg].width != sizeof(unsigned long)
+	   || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
 	   || state.cfa.offs % sizeof(unsigned long))
 		return -EIO;
 	/* update frame */
@@ -1038,6 +1045,8 @@ int unwind(struct unwind_frame_info *frame)
 #else
 # define CASES CASE(8); CASE(16); CASE(32); CASE(64)
 #endif
+	pc = UNW_PC(frame);
+	sp = UNW_SP(frame);
 	for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
 		if (REG_INVALID(i)) {
 			if (state.regs[i].where == Nowhere)
@@ -1118,6 +1127,11 @@ int unwind(struct unwind_frame_info *frame)
 		}
 	}
 
+	if (UNW_PC(frame) % state.codeAlign
+	    || UNW_SP(frame) % sleb128abs(state.dataAlign)
+	    || (pc == UNW_PC(frame) && sp == UNW_SP(frame)))
+		return -EIO;
+
 	return 0;
 #undef CASES
 #undef FRAME_REG
-- 
cgit v1.2.3


From 6d0185ea611276fdf81991d7774d396bdc1ae392 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] unwinder: Add debugging output to the Dwarf2 unwinder

Add debugging printks to the unwinder to allow easier debugging
when something goes wrong with it.

This can be controlled with the new unwinder_debug=N option
Most output is given by N=1

AK: Added documentation of unwinder_debug=

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/unwind.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 96 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/unwind.c b/kernel/unwind.c
index 7e721f104105..209e248517db 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -137,6 +137,17 @@ struct unwind_state {
 
 static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
 
+static unsigned unwind_debug;
+static int __init unwind_debug_setup(char *s)
+{
+	unwind_debug = simple_strtoul(s, NULL, 0);
+	return 1;
+}
+__setup("unwind_debug=", unwind_debug_setup);
+#define dprintk(lvl, fmt, args...) \
+	((void)(lvl > unwind_debug \
+	 || printk(KERN_DEBUG "unwind: " fmt "\n", ##args)))
+
 static struct unwind_table *find_table(unsigned long pc)
 {
 	struct unwind_table *table;
@@ -281,6 +292,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
 
 	hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
 	        + 2 * n * sizeof(unsigned long);
+	dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize);
 	header = alloc(hdrSize);
 	if (!header)
 		return;
@@ -500,13 +512,17 @@ static unsigned long read_pointer(const u8 **pLoc,
 		const unsigned long *pul;
 	} ptr;
 
-	if (ptrType < 0 || ptrType == DW_EH_PE_omit)
+	if (ptrType < 0 || ptrType == DW_EH_PE_omit) {
+		dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end);
 		return 0;
+	}
 	ptr.p8 = *pLoc;
 	switch(ptrType & DW_EH_PE_FORM) {
 	case DW_EH_PE_data2:
-		if (end < (const void *)(ptr.p16u + 1))
+		if (end < (const void *)(ptr.p16u + 1)) {
+			dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end);
 			return 0;
+		}
 		if(ptrType & DW_EH_PE_signed)
 			value = get_unaligned(ptr.p16s++);
 		else
@@ -514,8 +530,10 @@ static unsigned long read_pointer(const u8 **pLoc,
 		break;
 	case DW_EH_PE_data4:
 #ifdef CONFIG_64BIT
-		if (end < (const void *)(ptr.p32u + 1))
+		if (end < (const void *)(ptr.p32u + 1)) {
+			dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end);
 			return 0;
+		}
 		if(ptrType & DW_EH_PE_signed)
 			value = get_unaligned(ptr.p32s++);
 		else
@@ -527,8 +545,10 @@ static unsigned long read_pointer(const u8 **pLoc,
 		BUILD_BUG_ON(sizeof(u32) != sizeof(value));
 #endif
 	case DW_EH_PE_native:
-		if (end < (const void *)(ptr.pul + 1))
+		if (end < (const void *)(ptr.pul + 1)) {
+			dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end);
 			return 0;
+		}
 		value = get_unaligned(ptr.pul++);
 		break;
 	case DW_EH_PE_leb128:
@@ -536,10 +556,14 @@ static unsigned long read_pointer(const u8 **pLoc,
 		value = ptrType & DW_EH_PE_signed
 		        ? get_sleb128(&ptr.p8, end)
 		        : get_uleb128(&ptr.p8, end);
-		if ((const void *)ptr.p8 > end)
+		if ((const void *)ptr.p8 > end) {
+			dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end);
 			return 0;
+		}
 		break;
 	default:
+		dprintk(2, "Cannot decode pointer type %02X (%p,%p).",
+		        ptrType, ptr.p8, end);
 		return 0;
 	}
 	switch(ptrType & DW_EH_PE_ADJUST) {
@@ -549,11 +573,16 @@ static unsigned long read_pointer(const u8 **pLoc,
 		value += (unsigned long)*pLoc;
 		break;
 	default:
+		dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
+		        ptrType, *pLoc, end);
 		return 0;
 	}
 	if ((ptrType & DW_EH_PE_indirect)
-	    && probe_kernel_address((unsigned long *)value, value))
+	    && probe_kernel_address((unsigned long *)value, value)) {
+		dprintk(1, "Cannot read indirect value %lx (%p,%p).",
+		        value, *pLoc, end);
 		return 0;
+	}
 	*pLoc = ptr.p8;
 
 	return value;
@@ -702,8 +731,10 @@ static int processCFI(const u8 *start,
 					state->label = NULL;
 					return 1;
 				}
-				if (state->stackDepth >= MAX_STACK_DEPTH)
+				if (state->stackDepth >= MAX_STACK_DEPTH) {
+					dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end);
 					return 0;
+				}
 				state->stack[state->stackDepth++] = ptr.p8;
 				break;
 			case DW_CFA_restore_state:
@@ -718,8 +749,10 @@ static int processCFI(const u8 *start,
 					result = processCFI(start, end, 0, ptrType, state);
 					state->loc = loc;
 					state->label = label;
-				} else
+				} else {
+					dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end);
 					return 0;
+				}
 				break;
 			case DW_CFA_def_cfa:
 				state->cfa.reg = get_uleb128(&ptr.p8, end);
@@ -751,6 +784,7 @@ static int processCFI(const u8 *start,
 				break;
 			case DW_CFA_GNU_window_save:
 			default:
+				dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end);
 				result = 0;
 				break;
 			}
@@ -766,12 +800,17 @@ static int processCFI(const u8 *start,
 			set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
 			break;
 		}
-		if (ptr.p8 > end)
+		if (ptr.p8 > end) {
+			dprintk(1, "Data overrun (%p,%p).", ptr.p8, end);
 			result = 0;
+		}
 		if (result && targetLoc != 0 && targetLoc < state->loc)
 			return 1;
 	}
 
+	if (result && ptr.p8 < end)
+		dprintk(1, "Data underrun (%p,%p).", ptr.p8, end);
+
 	return result
 	   && ptr.p8 == end
 	   && (targetLoc == 0
@@ -843,6 +882,8 @@ int unwind(struct unwind_frame_info *frame)
 					                           hdr[3]);
 			}
 		}
+		if(hdr && !fde)
+			dprintk(3, "Binary lookup for %lx failed.", pc);
 
 		if (fde != NULL) {
 			cie = cie_for_fde(fde, table);
@@ -864,6 +905,8 @@ int unwind(struct unwind_frame_info *frame)
 					fde = NULL;
 			} else
 				fde = NULL;
+			if(!fde)
+				dprintk(1, "Binary lookup result for %lx discarded.", pc);
 		}
 		if (fde == NULL) {
 			for (fde = table->address, tableSize = table->size;
@@ -895,6 +938,8 @@ int unwind(struct unwind_frame_info *frame)
 				if (pc >= startLoc && pc < endLoc)
 					break;
 			}
+			if(!fde)
+				dprintk(3, "Linear lookup for %lx failed.", pc);
 		}
 	}
 	if (cie != NULL) {
@@ -928,6 +973,8 @@ int unwind(struct unwind_frame_info *frame)
 			if (ptr >= end || *ptr)
 				cie = NULL;
 		}
+		if(!cie)
+			dprintk(1, "CIE unusable (%p,%p).", ptr, end);
 		++ptr;
 	}
 	if (cie != NULL) {
@@ -938,9 +985,11 @@ int unwind(struct unwind_frame_info *frame)
 		if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
 			cie = NULL;
 		else if (UNW_PC(frame) % state.codeAlign
-		         || UNW_SP(frame) % sleb128abs(state.dataAlign))
+		         || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
+			dprintk(1, "Input pointer(s) misaligned (%lx,%lx).",
+			        UNW_PC(frame), UNW_SP(frame));
 			return -EPERM;
-		else {
+		} else {
 			retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
 			/* skip augmentation */
 			if (((const char *)(cie + 2))[1] == 'z') {
@@ -954,6 +1003,8 @@ int unwind(struct unwind_frame_info *frame)
 			   || reg_info[retAddrReg].width != sizeof(unsigned long))
 				cie = NULL;
 		}
+		if(!cie)
+			dprintk(1, "CIE validation failed (%p,%p).", ptr, end);
 	}
 	if (cie != NULL) {
 		state.cieStart = ptr;
@@ -967,6 +1018,8 @@ int unwind(struct unwind_frame_info *frame)
 			if ((ptr += augSize) > end)
 				fde = NULL;
 		}
+		if(!fde)
+			dprintk(1, "FDE validation failed (%p,%p).", ptr, end);
 	}
 	if (cie == NULL || fde == NULL) {
 #ifdef CONFIG_FRAME_POINTER
@@ -1025,8 +1078,10 @@ int unwind(struct unwind_frame_info *frame)
 	   || state.cfa.reg >= ARRAY_SIZE(reg_info)
 	   || reg_info[state.cfa.reg].width != sizeof(unsigned long)
 	   || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
-	   || state.cfa.offs % sizeof(unsigned long))
+	   || state.cfa.offs % sizeof(unsigned long)) {
+		dprintk(1, "Unusable unwind info (%p,%p).", ptr, end);
 		return -EIO;
+	}
 	/* update frame */
 #ifndef CONFIG_AS_CFI_SIGNAL_FRAME
 	if(frame->call_frame
@@ -1051,6 +1106,8 @@ int unwind(struct unwind_frame_info *frame)
 		if (REG_INVALID(i)) {
 			if (state.regs[i].where == Nowhere)
 				continue;
+			dprintk(1, "Cannot restore register %u (%d).",
+			        i, state.regs[i].where);
 			return -EIO;
 		}
 		switch(state.regs[i].where) {
@@ -1059,8 +1116,11 @@ int unwind(struct unwind_frame_info *frame)
 		case Register:
 			if (state.regs[i].value >= ARRAY_SIZE(reg_info)
 			   || REG_INVALID(state.regs[i].value)
-			   || reg_info[i].width > reg_info[state.regs[i].value].width)
+			   || reg_info[i].width > reg_info[state.regs[i].value].width) {
+				dprintk(1, "Cannot restore register %u from register %lu.",
+				        i, state.regs[i].value);
 				return -EIO;
+			}
 			switch(reg_info[state.regs[i].value].width) {
 #define CASE(n) \
 			case sizeof(u##n): \
@@ -1070,6 +1130,9 @@ int unwind(struct unwind_frame_info *frame)
 			CASES;
 #undef CASE
 			default:
+				dprintk(1, "Unsupported register size %u (%lu).",
+				        reg_info[state.regs[i].value].width,
+				        state.regs[i].value);
 				return -EIO;
 			}
 			break;
@@ -1094,12 +1157,17 @@ int unwind(struct unwind_frame_info *frame)
 			CASES;
 #undef CASE
 			default:
+				dprintk(1, "Unsupported register size %u (%u).",
+				        reg_info[i].width, i);
 				return -EIO;
 			}
 			break;
 		case Value:
-			if (reg_info[i].width != sizeof(unsigned long))
+			if (reg_info[i].width != sizeof(unsigned long)) {
+				dprintk(1, "Unsupported value size %u (%u).",
+				        reg_info[i].width, i);
 				return -EIO;
+			}
 			FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
 			                                    * state.dataAlign;
 			break;
@@ -1111,8 +1179,11 @@ int unwind(struct unwind_frame_info *frame)
 				    % sizeof(unsigned long)
 				    || addr < startLoc
 				    || addr + sizeof(unsigned long) < addr
-				    || addr + sizeof(unsigned long) > endLoc)
+				    || addr + sizeof(unsigned long) > endLoc) {
+					dprintk(1, "Bad memory location %lx (%lx).",
+					        addr, state.regs[i].value);
 					return -EIO;
+				}
 				switch(reg_info[i].width) {
 #define CASE(n)     case sizeof(u##n): \
 					probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \
@@ -1120,6 +1191,8 @@ int unwind(struct unwind_frame_info *frame)
 				CASES;
 #undef CASE
 				default:
+					dprintk(1, "Unsupported memory size %u (%u).",
+					        reg_info[i].width, i);
 					return -EIO;
 				}
 			}
@@ -1128,9 +1201,15 @@ int unwind(struct unwind_frame_info *frame)
 	}
 
 	if (UNW_PC(frame) % state.codeAlign
-	    || UNW_SP(frame) % sleb128abs(state.dataAlign)
-	    || (pc == UNW_PC(frame) && sp == UNW_SP(frame)))
+	    || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
+		dprintk(1, "Output pointer(s) misaligned (%lx,%lx).",
+		        UNW_PC(frame), UNW_SP(frame));
 		return -EIO;
+	}
+	if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) {
+		dprintk(1, "No progress (%lx,%lx).", pc, sp);
+		return -EIO;
+	}
 
 	return 0;
 #undef CASES
-- 
cgit v1.2.3


From c65f38d911aa301cea109d38d40925750dd6c2da Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] unwinder: fully support linker generated .eh_frame_hdr
 section

Now that binutils' ld is able to properly populate .eh_frame_hdr in the
Linux kernel case, here's a patch to add some functionality to the Dwarf2
unwinder to actually be able to make use of this (applies on firstfloor
tree with the previously sent patch to add debug output, but not on plain
2.6.19).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/unwind.c | 66 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/unwind.c b/kernel/unwind.c
index 209e248517db..08645aa7c2d6 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -164,7 +164,9 @@ static struct unwind_table *find_table(unsigned long pc)
 
 static unsigned long read_pointer(const u8 **pLoc,
                                   const void *end,
-                                  signed ptrType);
+                                  signed ptrType,
+                                  unsigned long text_base,
+                                  unsigned long data_base);
 
 static void init_unwind_table(struct unwind_table *table,
                               const char *name,
@@ -189,10 +191,13 @@ static void init_unwind_table(struct unwind_table *table,
 	/* See if the linker provided table looks valid. */
 	if (header_size <= 4
 	    || header_start[0] != 1
-	    || (void *)read_pointer(&ptr, end, header_start[1]) != table_start
-	    || header_start[2] == DW_EH_PE_omit
-	    || read_pointer(&ptr, end, header_start[2]) <= 0
-	    || header_start[3] == DW_EH_PE_omit)
+	    || (void *)read_pointer(&ptr, end, header_start[1], 0, 0)
+	       != table_start
+	    || !read_pointer(&ptr, end, header_start[2], 0, 0)
+	    || !read_pointer(&ptr, end, header_start[3], 0,
+	                     (unsigned long)header_start)
+	    || !read_pointer(&ptr, end, header_start[3], 0,
+	                     (unsigned long)header_start))
 		header_start = NULL;
 	table->hdrsz = header_size;
 	smp_wmb();
@@ -282,7 +287,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
 		ptr = (const u8 *)(fde + 2);
 		if (!read_pointer(&ptr,
 		                  (const u8 *)(fde + 1) + *fde,
-		                  ptrType))
+		                  ptrType, 0, 0))
 			return;
 		++n;
 	}
@@ -317,7 +322,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
 		ptr = (const u8 *)(fde + 2);
 		header->table[n].start = read_pointer(&ptr,
 		                                      (const u8 *)(fde + 1) + *fde,
-		                                      fde_pointer_type(cie));
+		                                      fde_pointer_type(cie), 0, 0);
 		header->table[n].fde = (unsigned long)fde;
 		++n;
 	}
@@ -500,7 +505,9 @@ static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
 
 static unsigned long read_pointer(const u8 **pLoc,
                                   const void *end,
-                                  signed ptrType)
+                                  signed ptrType,
+                                  unsigned long text_base,
+                                  unsigned long data_base)
 {
 	unsigned long value = 0;
 	union {
@@ -572,6 +579,22 @@ static unsigned long read_pointer(const u8 **pLoc,
 	case DW_EH_PE_pcrel:
 		value += (unsigned long)*pLoc;
 		break;
+	case DW_EH_PE_textrel:
+		if (likely(text_base)) {
+			value += text_base;
+			break;
+		}
+		dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.",
+		        ptrType, *pLoc, end);
+		return 0;
+	case DW_EH_PE_datarel:
+		if (likely(data_base)) {
+			value += data_base;
+			break;
+		}
+		dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.",
+		        ptrType, *pLoc, end);
+		return 0;
 	default:
 		dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
 		        ptrType, *pLoc, end);
@@ -625,7 +648,8 @@ static signed fde_pointer_type(const u32 *cie)
 			case 'P': {
 					signed ptrType = *ptr++;
 
-					if (!read_pointer(&ptr, end, ptrType) || ptr > end)
+					if (!read_pointer(&ptr, end, ptrType, 0, 0)
+					    || ptr > end)
 						return -1;
 				}
 				break;
@@ -685,7 +709,8 @@ static int processCFI(const u8 *start,
 			case DW_CFA_nop:
 				break;
 			case DW_CFA_set_loc:
-				if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
+				state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0);
+				if (state->loc == 0)
 					result = 0;
 				break;
 			case DW_CFA_advance_loc1:
@@ -854,9 +879,9 @@ int unwind(struct unwind_frame_info *frame)
 			ptr = hdr + 4;
 			end = hdr + table->hdrsz;
 			if (tableSize
-			    && read_pointer(&ptr, end, hdr[1])
+			    && read_pointer(&ptr, end, hdr[1], 0, 0)
 			       == (unsigned long)table->address
-			    && (i = read_pointer(&ptr, end, hdr[2])) > 0
+			    && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0
 			    && i == (end - ptr) / (2 * tableSize)
 			    && !((end - ptr) % (2 * tableSize))) {
 				do {
@@ -864,7 +889,8 @@ int unwind(struct unwind_frame_info *frame)
 
 					startLoc = read_pointer(&cur,
 					                        cur + tableSize,
-					                        hdr[3]);
+					                        hdr[3], 0,
+					                        (unsigned long)hdr);
 					if (pc < startLoc)
 						i /= 2;
 					else {
@@ -875,11 +901,13 @@ int unwind(struct unwind_frame_info *frame)
 				if (i == 1
 				    && (startLoc = read_pointer(&ptr,
 				                                ptr + tableSize,
-				                                hdr[3])) != 0
+				                                hdr[3], 0,
+				                                (unsigned long)hdr)) != 0
 				    && pc >= startLoc)
 					fde = (void *)read_pointer(&ptr,
 					                           ptr + tableSize,
-					                           hdr[3]);
+					                           hdr[3], 0,
+					                           (unsigned long)hdr);
 			}
 		}
 		if(hdr && !fde)
@@ -894,13 +922,13 @@ int unwind(struct unwind_frame_info *frame)
 			   && (ptrType = fde_pointer_type(cie)) >= 0
 			   && read_pointer(&ptr,
 			                   (const u8 *)(fde + 1) + *fde,
-			                   ptrType) == startLoc) {
+			                   ptrType, 0, 0) == startLoc) {
 				if (!(ptrType & DW_EH_PE_indirect))
 					ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
 				endLoc = startLoc
 				         + read_pointer(&ptr,
 				                        (const u8 *)(fde + 1) + *fde,
-				                        ptrType);
+				                        ptrType, 0, 0);
 				if(pc >= endLoc)
 					fde = NULL;
 			} else
@@ -926,7 +954,7 @@ int unwind(struct unwind_frame_info *frame)
 				ptr = (const u8 *)(fde + 2);
 				startLoc = read_pointer(&ptr,
 				                        (const u8 *)(fde + 1) + *fde,
-				                        ptrType);
+				                        ptrType, 0, 0);
 				if (!startLoc)
 					continue;
 				if (!(ptrType & DW_EH_PE_indirect))
@@ -934,7 +962,7 @@ int unwind(struct unwind_frame_info *frame)
 				endLoc = startLoc
 				         + read_pointer(&ptr,
 				                        (const u8 *)(fde + 1) + *fde,
-				                        ptrType);
+				                        ptrType, 0, 0);
 				if (pc >= startLoc && pc < endLoc)
 					break;
 			}
-- 
cgit v1.2.3


From b65780e123ba9b762276482bbfb52836e4d41fd9 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] unwinder: move .eh_frame to RODATA

The .eh_frame section contents is never written to, so it can as well
benefit from CONFIG_DEBUG_RODATA.

Diff-ed against firstfloor tree.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/unwind.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/unwind.c b/kernel/unwind.c
index 08645aa7c2d6..09c261329249 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -19,7 +19,7 @@
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 
-extern char __start_unwind[], __end_unwind[];
+extern const char __start_unwind[], __end_unwind[];
 extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
 
 #define MAX_STACK_DEPTH 8
-- 
cgit v1.2.3


From 161a09e737f0761ca064ee6a907313402f7a54b6 Mon Sep 17 00:00:00 2001
From: Joy Latten <latten@austin.ibm.com>
Date: Mon, 27 Nov 2006 13:11:54 -0600
Subject: audit: Add auditing to ipsec

An audit message occurs when an ipsec SA
or ipsec policy is created/deleted.

Signed-off-by: Joy Latten <latten@austin.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/auditsc.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ab97e5101232..40722e26de98 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -731,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context)
 		printk(KERN_ERR "audit: freed %d contexts\n", count);
 }
 
-static void audit_log_task_context(struct audit_buffer *ab)
+void audit_log_task_context(struct audit_buffer *ab)
 {
 	char *ctx = NULL;
 	ssize_t len = 0;
@@ -760,6 +760,8 @@ error_path:
 	return;
 }
 
+EXPORT_SYMBOL(audit_log_task_context);
+
 static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 {
 	char name[sizeof(tsk->comm)];
@@ -1488,6 +1490,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
 	return ctx ? ctx->loginuid : -1;
 }
 
+EXPORT_SYMBOL(audit_get_loginuid);
+
 /**
  * __audit_mq_open - record audit data for a POSIX MQ open
  * @oflag: open flag
-- 
cgit v1.2.3


From 7602bdf2fd14a40dd9b104e516fdc05e1bd17952 Mon Sep 17 00:00:00 2001
From: Ashwin Chaugule <ashwin.chaugule@celunite.com>
Date: Wed, 6 Dec 2006 20:31:57 -0800
Subject: [PATCH] new scheme to preempt swap token

The new swap token patches replace the current token traversal algo.  The old
algo had a crude timeout parameter that was used to handover the token from
one task to another.  This algo, transfers the token to the tasks that are in
need of the token.  The urgency for the token is based on the number of times
a task is required to swap-in pages.  Accordingly, the priority of a task is
incremented if it has been badly affected due to swap-outs.  To ensure that
the token doesnt bounce around rapidly, the token holders are given a priority
boost.  The priority of tasks is also decremented, if their rate of swap-in's
keeps reducing.  This way, the condition to check whether to pre-empt the swap
token, is a matter of comparing two task's priority fields.

[akpm@osdl.org: cleanups]
Signed-off-by: Ashwin Chaugule <ashwin.chaugule@celunite.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c   |  8 ++++++++
 kernel/sysctl.c | 11 -----------
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8cdd3e72ba55..5678e6c61ef2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -479,6 +479,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
 
 	memcpy(mm, oldmm, sizeof(*mm));
 
+	/* Initializing for Swap token stuff */
+	mm->token_priority = 0;
+	mm->last_interval = 0;
+
 	if (!mm_init(mm))
 		goto fail_nomem;
 
@@ -542,6 +546,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 		goto fail_nomem;
 
 good_mm:
+	/* Initializing for Swap token stuff */
+	mm->token_priority = 0;
+	mm->last_interval = 0;
+
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 09e569f4792b..7abe9704e75a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -977,17 +977,6 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 #endif
-#ifdef CONFIG_SWAP
-	{
-		.ctl_name	= VM_SWAP_TOKEN_TIMEOUT,
-		.procname	= "swap_token_timeout",
-		.data		= &swap_token_default_timeout,
-		.maxlen		= sizeof(swap_token_default_timeout),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
-	},
-#endif
 #ifdef CONFIG_NUMA
 	{
 		.ctl_name	= VM_ZONE_RECLAIM_MODE,
-- 
cgit v1.2.3


From a866374aecc90c7d90619727ccd851ac096b2fc7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:32:20 -0800
Subject: [PATCH] mm: pagefault_{disable,enable}()

Introduce pagefault_{disable,enable}() and use these where previously we did
manual preempt increments/decrements to make the pagefault handler do the
atomic thing.

Currently they still rely on the increased preempt count, but do not rely on
the disabled preemption, this might go away in the future.

(NOTE: the extra barrier() in pagefault_disable might fix some holes on
       machines which have too many registers for their own good)

[heiko.carstens@de.ibm.com: s390 fix]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <npiggin@suse.de>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 93ef30ba209f..af7b81cbde30 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
 	int ret;
 
-	inc_preempt_count();
+	pagefault_disable();
 	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
-	dec_preempt_count();
+	pagefault_enable();
 
 	return ret ? -EFAULT : 0;
 }
@@ -585,9 +585,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	if (!(uval & FUTEX_OWNER_DIED)) {
 		newval = FUTEX_WAITERS | new_owner->pid;
 
-		inc_preempt_count();
+		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-		dec_preempt_count();
+		pagefault_enable();
 		if (curval == -EFAULT)
 			return -EFAULT;
 		if (curval != uval)
@@ -618,9 +618,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 	 * There is no waiter, so we unlock the futex. The owner died
 	 * bit has not to be preserved here. We are the owner:
 	 */
-	inc_preempt_count();
+	pagefault_disable();
 	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (oldval == -EFAULT)
 		return oldval;
@@ -1158,9 +1158,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	 */
 	newval = current->pid;
 
-	inc_preempt_count();
+	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
@@ -1183,9 +1183,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	uval = curval;
 	newval = uval | FUTEX_WAITERS;
 
-	inc_preempt_count();
+	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
@@ -1215,10 +1215,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 			newval = current->pid |
 				FUTEX_OWNER_DIED | FUTEX_WAITERS;
 
-			inc_preempt_count();
+			pagefault_disable();
 			curval = futex_atomic_cmpxchg_inatomic(uaddr,
 							       uval, newval);
-			dec_preempt_count();
+			pagefault_enable();
 
 			if (unlikely(curval == -EFAULT))
 				goto uaddr_faulted;
@@ -1390,9 +1390,9 @@ retry_locked:
 	 * anyone else up:
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
-		inc_preempt_count();
+		pagefault_disable();
 		uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
-		dec_preempt_count();
+		pagefault_enable();
 	}
 
 	if (unlikely(uval == -EFAULT))
-- 
cgit v1.2.3


From e94b1766097d53e6f3ccfb36c8baa562ffeda3fc Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:17 -0800
Subject: [PATCH] slab: remove SLAB_KERNEL

SLAB_KERNEL is an alias of GFP_KERNEL.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/delayacct.c | 2 +-
 kernel/fork.c      | 6 +++---
 kernel/taskstats.c | 2 +-
 kernel/user.c      | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 66a0ea48751d..70e9ec603082 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -41,7 +41,7 @@ void delayacct_init(void)
 
 void __delayacct_tsk_init(struct task_struct *tsk)
 {
-	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+	tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
 	if (tsk->delays)
 		spin_lock_init(&tsk->delays->lock);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 5678e6c61ef2..711aa5f10da7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -237,7 +237,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -319,7 +319,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
 
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
+#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
 #include <linux/init_task.h>
@@ -621,7 +621,7 @@ static struct files_struct *alloc_files(void)
 	struct files_struct *newf;
 	struct fdtable *fdt;
 
-	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
 	if (!newf)
 		goto out;
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d3d28919d4b4..1b2b326cf703 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -425,7 +425,7 @@ void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
 	*mycpu = raw_smp_processor_id();
 
 	*ptidstats = NULL;
-	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	tmp = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
 	if (!tmp)
 		return;
 
diff --git a/kernel/user.c b/kernel/user.c
index 220e586127a0..c1f93c164c93 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid)
 	if (!up) {
 		struct user_struct *new;
 
-		new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
+		new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
 		if (!new)
 			return NULL;
 		new->uid = uid;
-- 
cgit v1.2.3


From e18b890bb0881bbab6f4f1a6cd20d9c60d66b003 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 6 Dec 2006 20:33:20 -0800
Subject: [PATCH] slab: remove kmem_cache_t

Replace all uses of kmem_cache_t with struct kmem_cache.

The patch was generated using the following script:

	#!/bin/sh
	#
	# Replace one string by another in all the kernel sources.
	#

	set -e

	for file in `find * -name "*.c" -o -name "*.h"|xargs grep -l $1`; do
		quilt add $file
		sed -e "1,\$s/$1/$2/g" $file >/tmp/$$
		mv /tmp/$$ $file
		quilt refresh
	done

The script was run like this

	sh replace kmem_cache_t "struct kmem_cache"

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/delayacct.c    |  2 +-
 kernel/fork.c         | 16 ++++++++--------
 kernel/pid.c          |  2 +-
 kernel/posix-timers.c |  2 +-
 kernel/signal.c       |  2 +-
 kernel/taskstats.c    |  2 +-
 kernel/user.c         |  2 +-
 7 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 70e9ec603082..766d5912b26a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -20,7 +20,7 @@
 #include <linux/delayacct.h>
 
 int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
-kmem_cache_t *delayacct_cache;
+struct kmem_cache *delayacct_cache;
 
 static int __init delayacct_setup_disable(char *str)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 711aa5f10da7..2cf74edd3295 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -82,26 +82,26 @@ int nr_processes(void)
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 # define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
 # define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
-static kmem_cache_t *task_struct_cachep;
+static struct kmem_cache *task_struct_cachep;
 #endif
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
-static kmem_cache_t *signal_cachep;
+static struct kmem_cache *signal_cachep;
 
 /* SLAB cache for sighand_struct structures (tsk->sighand) */
-kmem_cache_t *sighand_cachep;
+struct kmem_cache *sighand_cachep;
 
 /* SLAB cache for files_struct structures (tsk->files) */
-kmem_cache_t *files_cachep;
+struct kmem_cache *files_cachep;
 
 /* SLAB cache for fs_struct structures (tsk->fs) */
-kmem_cache_t *fs_cachep;
+struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-kmem_cache_t *vm_area_cachep;
+struct kmem_cache *vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
-static kmem_cache_t *mm_cachep;
+static struct kmem_cache *mm_cachep;
 
 void free_task(struct task_struct *tsk)
 {
@@ -1421,7 +1421,7 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
 
-static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct sighand_struct *sighand = data;
 
diff --git a/kernel/pid.c b/kernel/pid.c
index b914392085f9..a48879b0b921 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -31,7 +31,7 @@
 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
 static struct hlist_head *pid_hash;
 static int pidhash_shift;
-static kmem_cache_t *pid_cachep;
+static struct kmem_cache *pid_cachep;
 
 int pid_max = PID_MAX_DEFAULT;
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9cbb5d1be06f..5fe87de10ff0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -70,7 +70,7 @@
 /*
  * Lets keep our timers in a slab cache :-)
  */
-static kmem_cache_t *posix_timers_cache;
+static struct kmem_cache *posix_timers_cache;
 static struct idr posix_timers_id;
 static DEFINE_SPINLOCK(idr_lock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index df18c167a2a7..8e19d2785486 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -33,7 +33,7 @@
  * SLAB caches for signal bits.
  */
 
-static kmem_cache_t *sigqueue_cachep;
+static struct kmem_cache *sigqueue_cachep;
 
 /*
  * In POSIX a signal is sent either to a specific thread (Linux task)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 1b2b326cf703..f5f92014ae98 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -34,7 +34,7 @@
 
 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered;
-kmem_cache_t *taskstats_cache;
+struct kmem_cache *taskstats_cache;
 
 static struct genl_family family = {
 	.id		= GENL_ID_GENERATE,
diff --git a/kernel/user.c b/kernel/user.c
index c1f93c164c93..4869563080e9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
 #define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
 #define uidhashentry(uid)	(uidhash_table + __uidhashfn((uid)))
 
-static kmem_cache_t *uid_cachep;
+static struct kmem_cache *uid_cachep;
 static struct list_head uidhash_table[UIDHASH_SZ];
 
 /*
-- 
cgit v1.2.3


From 3592695c363c3f3119621bdcf5ed852d6b9d1a5c Mon Sep 17 00:00:00 2001
From: Stefan Seyfried <seife@suse.de>
Date: Wed, 6 Dec 2006 20:34:06 -0800
Subject: [PATCH] uswsusp: add pmops->{prepare,enter,finish} support (aka
 "platform mode")

Add an ioctl to the userspace swsusp code that enables the usage of the
pmops->prepare, pmops->enter and pmops->finish methods (the in-kernel
suspend knows these as "platform method").  These are needed on many
machines to (among others) speed up resuming by letting the BIOS skip some
steps or let my hp nx5000 recognise the correct ac_adapter state after
resume again.

It also ensures on many machines, that changed hardware (unplugged AC
adapters) gets correctly detected and that kacpid does not run wild after
resume.

Signed-off-by: Stefan Seyfried <seife@suse.de>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h |  7 ++++++-
 kernel/power/user.c  | 28 ++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index bfe999f7b272..87ecb1856ee8 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -117,7 +117,12 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
 #define SNAPSHOT_FREE_SWAP_PAGES	_IO(SNAPSHOT_IOC_MAGIC, 9)
 #define SNAPSHOT_SET_SWAP_FILE		_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
 #define SNAPSHOT_S2RAM			_IO(SNAPSHOT_IOC_MAGIC, 11)
-#define SNAPSHOT_IOC_MAXNR	11
+#define SNAPSHOT_PMOPS			_IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
+#define SNAPSHOT_IOC_MAXNR	12
+
+#define PMOPS_PREPARE	1
+#define PMOPS_ENTER	2
+#define PMOPS_FINISH	3
 
 /**
  *	The bitmap is used for tracing allocated swap pages
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d991d3b0e5a4..4c24ca5d62e3 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -11,6 +11,7 @@
 
 #include <linux/suspend.h>
 #include <linux/syscalls.h>
+#include <linux/reboot.h>
 #include <linux/string.h>
 #include <linux/device.h>
 #include <linux/miscdevice.h>
@@ -313,6 +314,33 @@ OutS3:
 		up(&pm_sem);
 		break;
 
+	case SNAPSHOT_PMOPS:
+		switch (arg) {
+
+		case PMOPS_PREPARE:
+			if (pm_ops->prepare) {
+				error = pm_ops->prepare(PM_SUSPEND_DISK);
+			}
+			break;
+
+		case PMOPS_ENTER:
+			kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
+			error = pm_ops->enter(PM_SUSPEND_DISK);
+			break;
+
+		case PMOPS_FINISH:
+			if (pm_ops && pm_ops->finish) {
+				pm_ops->finish(PM_SUSPEND_DISK);
+			}
+			break;
+
+		default:
+			printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
+			error = -EINVAL;
+
+		}
+		break;
+
 	default:
 		error = -ENOTTY;
 
-- 
cgit v1.2.3


From 915bae9ebe41e52d71ad8b06d50e4ab26189f964 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:07 -0800
Subject: [PATCH] swsusp: use partition device and offset to identify swap
 areas

The Linux kernel handles swap files almost in the same way as it handles swap
partitions and there are only two differences between these two types of swap
areas:

(1) swap files need not be contiguous,

(2) the header of a swap file is not in the first block of the partition
    that holds it.  From the swsusp's point of view (1) is not a problem,
    because it is already taken care of by the swap-handling code, but (2) has
    to be taken into consideration.

In principle the location of a swap file's header may be determined with the
help of appropriate filesystem driver.  Unfortunately, however, it requires
the filesystem holding the swap file to be mounted, and if this filesystem is
journaled, it cannot be mounted during a resume from disk.  For this reason we
need some other means by which swap areas can be identified.

For example, to identify a swap area we can use the partition that holds the
area and the offset from the beginning of this partition at which the swap
header is located.

The following patch allows swsusp to identify swap areas this way.  It changes
swap_type_of() so that it takes an additional argument representing an offset
of the swap header within the partition represented by its first argument.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 2 +-
 kernel/power/user.c | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1a3b0dd2c3fc..7b10da16389e 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -74,7 +74,7 @@ static int mark_swapfiles(swp_entry_t start)
 
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
-	int res = swap_type_of(swsusp_resume_device);
+	int res = swap_type_of(swsusp_resume_device, 0);
 
 	if (res >= 0) {
 		root_swap = res;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4c24ca5d62e3..a327b18a5ffd 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -55,7 +55,8 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 	filp->private_data = data;
 	memset(&data->handle, 0, sizeof(struct snapshot_handle));
 	if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
-		data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
+		data->swap = swsusp_resume_device ?
+				swap_type_of(swsusp_resume_device, 0) : -1;
 		data->mode = O_RDONLY;
 	} else {
 		data->swap = -1;
@@ -265,7 +266,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			 * so we need to recode them
 			 */
 			if (old_decode_dev(arg)) {
-				data->swap = swap_type_of(old_decode_dev(arg));
+				data->swap = swap_type_of(old_decode_dev(arg), 0);
 				if (data->swap < 0)
 					error = -ENODEV;
 			} else {
-- 
cgit v1.2.3


From 3fc6b34f4803b959c1e30c15247e2180cd529115 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:09 -0800
Subject: [PATCH] swsusp: rearrange swap-handling code

Rearrange the code in kernel/power/swap.c so that the next patch is more
readable.

[This patch only moves the existing code.]

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 221 ++++++++++++++++++++++++++--------------------------
 1 file changed, 112 insertions(+), 109 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7b10da16389e..7768c2ba7550 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -41,10 +41,121 @@ static struct swsusp_header {
 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
 
 /*
- * Saving part...
+ * General things
  */
 
 static unsigned short root_swap = 0xffff;
+static struct block_device *resume_bdev;
+
+/**
+ *	submit - submit BIO request.
+ *	@rw:	READ or WRITE.
+ *	@off	physical offset of page.
+ *	@page:	page we're reading or writing.
+ *	@bio_chain: list of pending biod (for async reading)
+ *
+ *	Straight from the textbook - allocate and initialize the bio.
+ *	If we're reading, make sure the page is marked as dirty.
+ *	Then submit it and, if @bio_chain == NULL, wait.
+ */
+static int submit(int rw, pgoff_t page_off, struct page *page,
+			struct bio **bio_chain)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_ATOMIC, 1);
+	if (!bio)
+		return -ENOMEM;
+	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
+	bio->bi_bdev = resume_bdev;
+	bio->bi_end_io = end_swap_bio_read;
+
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
+		bio_put(bio);
+		return -EFAULT;
+	}
+
+	lock_page(page);
+	bio_get(bio);
+
+	if (bio_chain == NULL) {
+		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+		wait_on_page_locked(page);
+		if (rw == READ)
+			bio_set_pages_dirty(bio);
+		bio_put(bio);
+	} else {
+		if (rw == READ)
+			get_page(page);	/* These pages are freed later */
+		bio->bi_private = *bio_chain;
+		*bio_chain = bio;
+		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+	}
+	return 0;
+}
+
+static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
+{
+	return submit(READ, page_off, virt_to_page(addr), bio_chain);
+}
+
+static int bio_write_page(pgoff_t page_off, void *addr)
+{
+	return submit(WRITE, page_off, virt_to_page(addr), NULL);
+}
+
+static int wait_on_bio_chain(struct bio **bio_chain)
+{
+	struct bio *bio;
+	struct bio *next_bio;
+	int ret = 0;
+
+	if (bio_chain == NULL)
+		return 0;
+
+	bio = *bio_chain;
+	if (bio == NULL)
+		return 0;
+	while (bio) {
+		struct page *page;
+
+		next_bio = bio->bi_private;
+		page = bio->bi_io_vec[0].bv_page;
+		wait_on_page_locked(page);
+		if (!PageUptodate(page) || PageError(page))
+			ret = -EIO;
+		put_page(page);
+		bio_put(bio);
+		bio = next_bio;
+	}
+	*bio_chain = NULL;
+	return ret;
+}
+
+static void show_speed(struct timeval *start, struct timeval *stop,
+			unsigned nr_pages, char *msg)
+{
+	s64 elapsed_centisecs64;
+	int centisecs;
+	int k;
+	int kps;
+
+	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+	centisecs = elapsed_centisecs64;
+	if (centisecs == 0)
+		centisecs = 1;	/* avoid div-by-zero */
+	k = nr_pages * (PAGE_SIZE / 1024);
+	kps = (k * 100) / centisecs;
+	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
+			centisecs / 100, centisecs % 100,
+			kps / 1000, (kps % 1000) / 10);
+}
+
+/*
+ * Saving part
+ */
 
 static int mark_swapfiles(swp_entry_t start)
 {
@@ -166,26 +277,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
 	handle->bitmap = NULL;
 }
 
-static void show_speed(struct timeval *start, struct timeval *stop,
-			unsigned nr_pages, char *msg)
-{
-	s64 elapsed_centisecs64;
-	int centisecs;
-	int k;
-	int kps;
-
-	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
-	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
-	centisecs = elapsed_centisecs64;
-	if (centisecs == 0)
-		centisecs = 1;	/* avoid div-by-zero */
-	k = nr_pages * (PAGE_SIZE / 1024);
-	kps = (k * 100) / centisecs;
-	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
-			centisecs / 100, centisecs % 100,
-			kps / 1000, (kps % 1000) / 10);
-}
-
 static int get_swap_writer(struct swap_map_handle *handle)
 {
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -205,34 +296,6 @@ static int get_swap_writer(struct swap_map_handle *handle)
 	return 0;
 }
 
-static int wait_on_bio_chain(struct bio **bio_chain)
-{
-	struct bio *bio;
-	struct bio *next_bio;
-	int ret = 0;
-
-	if (bio_chain == NULL)
-		return 0;
-
-	bio = *bio_chain;
-	if (bio == NULL)
-		return 0;
-	while (bio) {
-		struct page *page;
-
-		next_bio = bio->bi_private;
-		page = bio->bi_io_vec[0].bv_page;
-		wait_on_page_locked(page);
-		if (!PageUptodate(page) || PageError(page))
-			ret = -EIO;
-		put_page(page);
-		bio_put(bio);
-		bio = next_bio;
-	}
-	*bio_chain = NULL;
-	return ret;
-}
-
 static int swap_write_page(struct swap_map_handle *handle, void *buf,
 				struct bio **bio_chain)
 {
@@ -384,66 +447,6 @@ int swsusp_write(void)
 	return error;
 }
 
-static struct block_device *resume_bdev;
-
-/**
- *	submit - submit BIO request.
- *	@rw:	READ or WRITE.
- *	@off	physical offset of page.
- *	@page:	page we're reading or writing.
- *	@bio_chain: list of pending biod (for async reading)
- *
- *	Straight from the textbook - allocate and initialize the bio.
- *	If we're reading, make sure the page is marked as dirty.
- *	Then submit it and, if @bio_chain == NULL, wait.
- */
-static int submit(int rw, pgoff_t page_off, struct page *page,
-			struct bio **bio_chain)
-{
-	struct bio *bio;
-
-	bio = bio_alloc(GFP_ATOMIC, 1);
-	if (!bio)
-		return -ENOMEM;
-	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
-	bio->bi_bdev = resume_bdev;
-	bio->bi_end_io = end_swap_bio_read;
-
-	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
-		bio_put(bio);
-		return -EFAULT;
-	}
-
-	lock_page(page);
-	bio_get(bio);
-
-	if (bio_chain == NULL) {
-		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-		wait_on_page_locked(page);
-		if (rw == READ)
-			bio_set_pages_dirty(bio);
-		bio_put(bio);
-	} else {
-		if (rw == READ)
-			get_page(page);	/* These pages are freed later */
-		bio->bi_private = *bio_chain;
-		*bio_chain = bio;
-		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-	}
-	return 0;
-}
-
-static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
-	return submit(READ, page_off, virt_to_page(addr), bio_chain);
-}
-
-static int bio_write_page(pgoff_t page_off, void *addr)
-{
-	return submit(WRITE, page_off, virt_to_page(addr), NULL);
-}
-
 /**
  *	The following functions allow us to read data using a swap map
  *	in a file-alike way
-- 
cgit v1.2.3


From 3aef83e0ef1ffb8ea3bea97be46821a45c952173 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:10 -0800
Subject: [PATCH] swsusp: use block device offsets to identify swap locations

Make swsusp use block device offsets instead of swap offsets to identify swap
locations and make it use the same code paths for writing as well as for
reading data.

This allows us to use the same code for handling swap files and swap
partitions and to simplify the code, eg.  by dropping rw_swap_page_sync().

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h  |   2 +-
 kernel/power/swap.c   | 133 ++++++++++++++++++++++++++------------------------
 kernel/power/swsusp.c |  10 ++--
 kernel/power/user.c   |   7 +--
 4 files changed, 80 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 87ecb1856ee8..210ebba26020 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -146,7 +146,7 @@ struct bitmap_page {
 
 extern void free_bitmap(struct bitmap_page *bitmap);
 extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
-extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
+extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
 extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
 
 extern int swsusp_check(void);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7768c2ba7550..1b08f46bbb7e 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -34,8 +34,8 @@ extern char resume_file[];
 #define SWSUSP_SIG	"S1SUSPEND"
 
 static struct swsusp_header {
-	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
-	swp_entry_t image;
+	char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
+	sector_t image;
 	char	orig_sig[10];
 	char	sig[10];
 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
@@ -100,9 +100,9 @@ static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
 	return submit(READ, page_off, virt_to_page(addr), bio_chain);
 }
 
-static int bio_write_page(pgoff_t page_off, void *addr)
+static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
 {
-	return submit(WRITE, page_off, virt_to_page(addr), NULL);
+	return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
 }
 
 static int wait_on_bio_chain(struct bio **bio_chain)
@@ -157,22 +157,19 @@ static void show_speed(struct timeval *start, struct timeval *stop,
  * Saving part
  */
 
-static int mark_swapfiles(swp_entry_t start)
+static int mark_swapfiles(sector_t start)
 {
 	int error;
 
-	rw_swap_page_sync(READ, swp_entry(root_swap, 0),
-			  virt_to_page((unsigned long)&swsusp_header), NULL);
+	bio_read_page(0, &swsusp_header, NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
 		swsusp_header.image = start;
-		error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
-				virt_to_page((unsigned long)&swsusp_header),
-				NULL);
+		error = bio_write_page(0, &swsusp_header, NULL);
 	} else {
-		pr_debug("swsusp: Partition is not swap space.\n");
+		printk(KERN_ERR "swsusp: Swap header not found!\n");
 		error = -ENODEV;
 	}
 	return error;
@@ -185,12 +182,21 @@ static int mark_swapfiles(swp_entry_t start)
 
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
-	int res = swap_type_of(swsusp_resume_device, 0);
+	int res;
+
+	res = swap_type_of(swsusp_resume_device, 0);
+	if (res < 0)
+		return res;
+
+	root_swap = res;
+	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE);
+	if (IS_ERR(resume_bdev))
+		return PTR_ERR(resume_bdev);
+
+	res = set_blocksize(resume_bdev, PAGE_SIZE);
+	if (res < 0)
+		blkdev_put(resume_bdev);
 
-	if (res >= 0) {
-		root_swap = res;
-		return 0;
-	}
 	return res;
 }
 
@@ -201,36 +207,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */
  *	@bio_chain:	Link the next write BIO here
  */
 
-static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
+static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 {
-	swp_entry_t entry;
-	int error = -ENOSPC;
-
-	if (offset) {
-		struct page *page = virt_to_page(buf);
-
-		if (bio_chain) {
-			/*
-			 * Whether or not we successfully allocated a copy page,
-			 * we take a ref on the page here.  It gets undone in
-			 * wait_on_bio_chain().
-			 */
-			struct page *page_copy;
-			page_copy = alloc_page(GFP_ATOMIC);
-			if (page_copy == NULL) {
-				WARN_ON_ONCE(1);
-				bio_chain = NULL;	/* Go synchronous */
-				get_page(page);
-			} else {
-				memcpy(page_address(page_copy),
-					page_address(page), PAGE_SIZE);
-				page = page_copy;
-			}
+	void *src;
+
+	if (!offset)
+		return -ENOSPC;
+
+	if (bio_chain) {
+		src = (void *)__get_free_page(GFP_ATOMIC);
+		if (src) {
+			memcpy(src, buf, PAGE_SIZE);
+		} else {
+			WARN_ON_ONCE(1);
+			bio_chain = NULL;	/* Go synchronous */
+			src = buf;
 		}
-		entry = swp_entry(root_swap, offset);
-		error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
+	} else {
+		src = buf;
 	}
-	return error;
+	return bio_write_page(offset, src, bio_chain);
 }
 
 /*
@@ -248,11 +244,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
  *	at a time.
  */
 
-#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)
+#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(sector_t) - 1)
 
 struct swap_map_page {
-	unsigned long		entries[MAP_PAGE_ENTRIES];
-	unsigned long		next_swap;
+	sector_t entries[MAP_PAGE_ENTRIES];
+	sector_t next_swap;
 };
 
 /**
@@ -262,7 +258,7 @@ struct swap_map_page {
 
 struct swap_map_handle {
 	struct swap_map_page *cur;
-	unsigned long cur_swap;
+	sector_t cur_swap;
 	struct bitmap_page *bitmap;
 	unsigned int k;
 };
@@ -287,7 +283,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
 		release_swap_writer(handle);
 		return -ENOMEM;
 	}
-	handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
+	handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
 	if (!handle->cur_swap) {
 		release_swap_writer(handle);
 		return -ENOSPC;
@@ -300,11 +296,11 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 				struct bio **bio_chain)
 {
 	int error = 0;
-	unsigned long offset;
+	sector_t offset;
 
 	if (!handle->cur)
 		return -EINVAL;
-	offset = alloc_swap_page(root_swap, handle->bitmap);
+	offset = alloc_swapdev_block(root_swap, handle->bitmap);
 	error = write_page(buf, offset, bio_chain);
 	if (error)
 		return error;
@@ -313,7 +309,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 		error = wait_on_bio_chain(bio_chain);
 		if (error)
 			goto out;
-		offset = alloc_swap_page(root_swap, handle->bitmap);
+		offset = alloc_swapdev_block(root_swap, handle->bitmap);
 		if (!offset)
 			return -ENOSPC;
 		handle->cur->next_swap = offset;
@@ -413,37 +409,47 @@ int swsusp_write(void)
 	struct swsusp_info *header;
 	int error;
 
-	if ((error = swsusp_swap_check())) {
+	error = swsusp_swap_check();
+	if (error) {
 		printk(KERN_ERR "swsusp: Cannot find swap device, try "
 				"swapon -a.\n");
 		return error;
 	}
 	memset(&snapshot, 0, sizeof(struct snapshot_handle));
 	error = snapshot_read_next(&snapshot, PAGE_SIZE);
-	if (error < PAGE_SIZE)
-		return error < 0 ? error : -EFAULT;
+	if (error < PAGE_SIZE) {
+		if (error >= 0)
+			error = -EFAULT;
+
+		goto out;
+	}
 	header = (struct swsusp_info *)data_of(snapshot);
 	if (!enough_swap(header->pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
-		return -ENOSPC;
+		error = -ENOSPC;
+		goto out;
 	}
 	error = get_swap_writer(&handle);
 	if (!error) {
-		unsigned long start = handle.cur_swap;
+		sector_t start = handle.cur_swap;
+
 		error = swap_write_page(&handle, header, NULL);
 		if (!error)
 			error = save_image(&handle, &snapshot,
 					header->pages - 1);
+
 		if (!error) {
 			flush_swap_writer(&handle);
 			printk("S");
-			error = mark_swapfiles(swp_entry(root_swap, start));
+			error = mark_swapfiles(start);
 			printk("|\n");
 		}
 	}
 	if (error)
 		free_all_swap_pages(root_swap, handle.bitmap);
 	release_swap_writer(&handle);
+out:
+	swsusp_close();
 	return error;
 }
 
@@ -459,17 +465,18 @@ static void release_swap_reader(struct swap_map_handle *handle)
 	handle->cur = NULL;
 }
 
-static int get_swap_reader(struct swap_map_handle *handle,
-                                      swp_entry_t start)
+static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
 {
 	int error;
 
-	if (!swp_offset(start))
+	if (!start)
 		return -EINVAL;
+
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
 	if (!handle->cur)
 		return -ENOMEM;
-	error = bio_read_page(swp_offset(start), handle->cur, NULL);
+
+	error = bio_read_page(start, handle->cur, NULL);
 	if (error) {
 		release_swap_reader(handle);
 		return error;
@@ -481,7 +488,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 static int swap_read_page(struct swap_map_handle *handle, void *buf,
 				struct bio **bio_chain)
 {
-	unsigned long offset;
+	sector_t offset;
 	int error;
 
 	if (!handle->cur)
@@ -608,7 +615,7 @@ int swsusp_check(void)
 		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
 			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
 			/* Reset swap signature now */
-			error = bio_write_page(0, &swsusp_header);
+			error = bio_write_page(0, &swsusp_header, NULL);
 		} else {
 			return -EINVAL;
 		}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0b66659dc516..4147a756a8c7 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -134,18 +134,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
 	return 0;
 }
 
-unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
+sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
 {
 	unsigned long offset;
 
 	offset = swp_offset(get_swap_page_of_type(swap));
 	if (offset) {
-		if (bitmap_set(bitmap, offset)) {
+		if (bitmap_set(bitmap, offset))
 			swap_free(swp_entry(swap, offset));
-			offset = 0;
-		}
+		else
+			return swapdev_block(swap, offset);
 	}
-	return offset;
+	return 0;
 }
 
 void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a327b18a5ffd..f0b7ef8bdd74 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -126,7 +126,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 {
 	int error = 0;
 	struct snapshot_data *data;
-	loff_t offset, avail;
+	loff_t avail;
+	sector_t offset;
 
 	if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
 		return -ENOTTY;
@@ -240,10 +241,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 				break;
 			}
 		}
-		offset = alloc_swap_page(data->swap, data->bitmap);
+		offset = alloc_swapdev_block(data->swap, data->bitmap);
 		if (offset) {
 			offset <<= PAGE_SHIFT;
-			error = put_user(offset, (loff_t __user *)arg);
+			error = put_user(offset, (sector_t __user *)arg);
 		} else {
 			error = -ENOSPC;
 		}
-- 
cgit v1.2.3


From 9a154d9d95b7b9845938242f5c62505b3cab5bcd Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:12 -0800
Subject: [PATCH] swsusp: add resume_offset command line parameter

Add the kernel command line parameter "resume_offset=" allowing us to specify
the offset, in <PAGE_SIZE> units, from the beginning of the partition pointed
to by the "resume=" parameter at which the swap header is located.

This offset can be determined, for example, by an application using the FIBMAP
ioctl to obtain the swap header's block number for given file.

[akpm@osdl.org: we don't know what type sector_t is]
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c  | 15 +++++++++++++++
 kernel/power/power.h |  1 +
 kernel/power/swap.c  | 15 ++++++++++-----
 3 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index b1fb7866b0b3..d79feeb45459 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -27,6 +27,7 @@
 static int noresume = 0;
 char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
+sector_t swsusp_resume_block;
 
 /**
  *	power_down - Shut machine down for hibernate.
@@ -423,6 +424,19 @@ static int __init resume_setup(char *str)
 	return 1;
 }
 
+static int __init resume_offset_setup(char *str)
+{
+	unsigned long long offset;
+
+	if (noresume)
+		return 1;
+
+	if (sscanf(str, "%llu", &offset) == 1)
+		swsusp_resume_block = offset;
+
+	return 1;
+}
+
 static int __init noresume_setup(char *str)
 {
 	noresume = 1;
@@ -430,4 +444,5 @@ static int __init noresume_setup(char *str)
 }
 
 __setup("noresume", noresume_setup);
+__setup("resume_offset=", resume_offset_setup);
 __setup("resume=", resume_setup);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 210ebba26020..adaf7d4fbdaf 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -42,6 +42,7 @@ extern const void __nosave_begin, __nosave_end;
 extern unsigned long image_size;
 extern int in_suspend;
 extern dev_t swsusp_resume_device;
+extern sector_t swsusp_resume_block;
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1b08f46bbb7e..aa5a9bff01f1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -161,13 +161,14 @@ static int mark_swapfiles(sector_t start)
 {
 	int error;
 
-	bio_read_page(0, &swsusp_header, NULL);
+	bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
 		swsusp_header.image = start;
-		error = bio_write_page(0, &swsusp_header, NULL);
+		error = bio_write_page(swsusp_resume_block,
+					&swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "swsusp: Swap header not found!\n");
 		error = -ENODEV;
@@ -184,7 +185,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 {
 	int res;
 
-	res = swap_type_of(swsusp_resume_device, 0);
+	res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
 	if (res < 0)
 		return res;
 
@@ -610,12 +611,16 @@ int swsusp_check(void)
 	if (!IS_ERR(resume_bdev)) {
 		set_blocksize(resume_bdev, PAGE_SIZE);
 		memset(&swsusp_header, 0, sizeof(swsusp_header));
-		if ((error = bio_read_page(0, &swsusp_header, NULL)))
+		error = bio_read_page(swsusp_resume_block,
+					&swsusp_header, NULL);
+		if (error)
 			return error;
+
 		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
 			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
 			/* Reset swap signature now */
-			error = bio_write_page(0, &swsusp_header, NULL);
+			error = bio_write_page(swsusp_resume_block,
+						&swsusp_header, NULL);
 		} else {
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From 37b2ba12df88f0e29f2d52aaf1ab22789377d5b5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:15 -0800
Subject: [PATCH] swsusp: add ioctl for swap files support

To be able to use swap files as suspend storage from the userland suspend
tools we need an additional ioctl() that will allow us to provide the kernel
with both the swap header's offset and the identification of the resume
partition.

The new ioctl() should be regarded as a replacement for the
SNAPSHOT_SET_SWAP_FILE ioctl() that from now on will be considered as
obsolete, but has to stay for backwards compatibility of the interface.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h | 14 +++++++++++++-
 kernel/power/user.c  | 31 +++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index adaf7d4fbdaf..7dbfd9f67e1c 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -106,6 +106,16 @@ extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_image_loaded(struct snapshot_handle *handle);
 extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
 
+/*
+ * This structure is used to pass the values needed for the identification
+ * of the resume swap area from a user space to the kernel via the
+ * SNAPSHOT_SET_SWAP_AREA ioctl
+ */
+struct resume_swap_area {
+	loff_t offset;
+	u_int32_t dev;
+} __attribute__((packed));
+
 #define SNAPSHOT_IOC_MAGIC	'3'
 #define SNAPSHOT_FREEZE			_IO(SNAPSHOT_IOC_MAGIC, 1)
 #define SNAPSHOT_UNFREEZE		_IO(SNAPSHOT_IOC_MAGIC, 2)
@@ -119,7 +129,9 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
 #define SNAPSHOT_SET_SWAP_FILE		_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
 #define SNAPSHOT_S2RAM			_IO(SNAPSHOT_IOC_MAGIC, 11)
 #define SNAPSHOT_PMOPS			_IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
-#define SNAPSHOT_IOC_MAXNR	12
+#define SNAPSHOT_SET_SWAP_AREA		_IOW(SNAPSHOT_IOC_MAGIC, 13, \
+							struct resume_swap_area)
+#define SNAPSHOT_IOC_MAXNR	13
 
 #define PMOPS_PREPARE	1
 #define PMOPS_ENTER	2
diff --git a/kernel/power/user.c b/kernel/power/user.c
index f0b7ef8bdd74..05c58a2c0dd4 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -343,6 +343,37 @@ OutS3:
 		}
 		break;
 
+	case SNAPSHOT_SET_SWAP_AREA:
+		if (data->bitmap) {
+			error = -EPERM;
+		} else {
+			struct resume_swap_area swap_area;
+			dev_t swdev;
+
+			error = copy_from_user(&swap_area, (void __user *)arg,
+					sizeof(struct resume_swap_area));
+			if (error) {
+				error = -EFAULT;
+				break;
+			}
+
+			/*
+			 * User space encodes device types as two-byte values,
+			 * so we need to recode them
+			 */
+			swdev = old_decode_dev(swap_area.dev);
+			if (swdev) {
+				offset = swap_area.offset;
+				data->swap = swap_type_of(swdev, offset);
+				if (data->swap < 0)
+					error = -ENODEV;
+			} else {
+				data->swap = -1;
+				error = -EINVAL;
+			}
+		}
+		break;
+
 	default:
 		error = -ENOTTY;
 
-- 
cgit v1.2.3


From 8357376d3df21b7d6f857931a57ac50da9c66e26 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:18 -0800
Subject: [PATCH] swsusp: Improve handling of highmem

Currently swsusp saves the contents of highmem pages by copying them to the
normal zone which is quite inefficient (eg.  it requires two normal pages
to be used for saving one highmem page).  This may be improved by using
highmem for saving the contents of saveable highmem pages.

Namely, during the suspend phase of the suspend-resume cycle we try to
allocate as many free highmem pages as there are saveable highmem pages.
If there are not enough highmem image pages to store the contents of all of
the saveable highmem pages, some of them will be stored in the "normal"
memory.  Next, we allocate as many free "normal" pages as needed to store
the (remaining) image data.  We use a memory bitmap to mark the allocated
free pages (ie.  highmem as well as "normal" image pages).

Now, we use another memory bitmap to mark all of the saveable pages
(highmem as well as "normal") and the contents of the saveable pages are
copied into the image pages.  Then, the second bitmap is used to save the
pfns corresponding to the saveable pages and the first one is used to save
their data.

During the resume phase the pfns of the pages that were saveable during the
suspend are loaded from the image and used to mark the "unsafe" page
frames.  Next, we try to allocate as many free highmem page frames as to
load all of the image data that had been in the highmem before the suspend
and we allocate so many free "normal" page frames that the total number of
allocated free pages (highmem and "normal") is equal to the size of the
image.  While doing this we have to make sure that there will be some extra
free "normal" and "safe" page frames for two lists of PBEs constructed
later.

Now, the image data are loaded, if possible, into their "original" page
frames.  The image data that cannot be written into their "original" page
frames are loaded into "safe" page frames and their "original" kernel
virtual addresses, as well as the addresses of the "safe" pages containing
their copies, are stored in one of two lists of PBEs.

One list of PBEs is for the copies of "normal" suspend pages (ie.  "normal"
pages that were saveable during the suspend) and it is used in the same way
as previously (ie.  by the architecture-dependent parts of swsusp).  The
other list of PBEs is for the copies of highmem suspend pages.  The pages
in this list are restored (in a reversible way) right before the
arch-dependent code is called.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h    |   2 +-
 kernel/power/snapshot.c | 851 ++++++++++++++++++++++++++++++++++++------------
 kernel/power/swap.c     |   2 +-
 kernel/power/swsusp.c   |  53 +--
 kernel/power/user.c     |   2 +-
 5 files changed, 671 insertions(+), 239 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7dbfd9f67e1c..3763343bde2f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -103,8 +103,8 @@ struct snapshot_handle {
 extern unsigned int snapshot_additional_pages(struct zone *zone);
 extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
+extern void snapshot_write_finalize(struct snapshot_handle *handle);
 extern int snapshot_image_loaded(struct snapshot_handle *handle);
-extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
 
 /*
  * This structure is used to pass the values needed for the identification
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 99f9b7d177d6..fd8251d40eb8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,15 +1,15 @@
 /*
  * linux/kernel/power/snapshot.c
  *
- * This file provide system snapshot/restore functionality.
+ * This file provides system snapshot/restore functionality for swsusp.
  *
  * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
  *
- * This file is released under the GPLv2, and is based on swsusp.c.
+ * This file is released under the GPLv2.
  *
  */
 
-
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -34,137 +34,24 @@
 
 #include "power.h"
 
-/* List of PBEs used for creating and restoring the suspend image */
+/* List of PBEs needed for restoring the pages that were allocated before
+ * the suspend and included in the suspend image, but have also been
+ * allocated by the "resume" kernel, so their contents cannot be written
+ * directly to their "original" page frames.
+ */
 struct pbe *restore_pblist;
 
-static unsigned int nr_copy_pages;
-static unsigned int nr_meta_pages;
+/* Pointer to an auxiliary buffer (1 page) */
 static void *buffer;
 
-#ifdef CONFIG_HIGHMEM
-unsigned int count_highmem_pages(void)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-	unsigned int n = 0;
-
-	for_each_zone (zone)
-		if (is_highmem(zone)) {
-			mark_free_pages(zone);
-			for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
-				struct page *page;
-				unsigned long pfn = zone_pfn + zone->zone_start_pfn;
-				if (!pfn_valid(pfn))
-					continue;
-				page = pfn_to_page(pfn);
-				if (PageReserved(page))
-					continue;
-				if (PageNosaveFree(page))
-					continue;
-				n++;
-			}
-		}
-	return n;
-}
-
-struct highmem_page {
-	char *data;
-	struct page *page;
-	struct highmem_page *next;
-};
-
-static struct highmem_page *highmem_copy;
-
-static int save_highmem_zone(struct zone *zone)
-{
-	unsigned long zone_pfn;
-	mark_free_pages(zone);
-	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
-		struct page *page;
-		struct highmem_page *save;
-		void *kaddr;
-		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
-
-		if (!(pfn%10000))
-			printk(".");
-		if (!pfn_valid(pfn))
-			continue;
-		page = pfn_to_page(pfn);
-		/*
-		 * This condition results from rvmalloc() sans vmalloc_32()
-		 * and architectural memory reservations. This should be
-		 * corrected eventually when the cases giving rise to this
-		 * are better understood.
-		 */
-		if (PageReserved(page))
-			continue;
-		BUG_ON(PageNosave(page));
-		if (PageNosaveFree(page))
-			continue;
-		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
-		if (!save)
-			return -ENOMEM;
-		save->next = highmem_copy;
-		save->page = page;
-		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
-		if (!save->data) {
-			kfree(save);
-			return -ENOMEM;
-		}
-		kaddr = kmap_atomic(page, KM_USER0);
-		memcpy(save->data, kaddr, PAGE_SIZE);
-		kunmap_atomic(kaddr, KM_USER0);
-		highmem_copy = save;
-	}
-	return 0;
-}
-
-int save_highmem(void)
-{
-	struct zone *zone;
-	int res = 0;
-
-	pr_debug("swsusp: Saving Highmem");
-	drain_local_pages();
-	for_each_zone (zone) {
-		if (is_highmem(zone))
-			res = save_highmem_zone(zone);
-		if (res)
-			return res;
-	}
-	printk("\n");
-	return 0;
-}
-
-int restore_highmem(void)
-{
-	printk("swsusp: Restoring Highmem\n");
-	while (highmem_copy) {
-		struct highmem_page *save = highmem_copy;
-		void *kaddr;
-		highmem_copy = save->next;
-
-		kaddr = kmap_atomic(save->page, KM_USER0);
-		memcpy(kaddr, save->data, PAGE_SIZE);
-		kunmap_atomic(kaddr, KM_USER0);
-		free_page((long) save->data);
-		kfree(save);
-	}
-	return 0;
-}
-#else
-static inline unsigned int count_highmem_pages(void) {return 0;}
-static inline int save_highmem(void) {return 0;}
-static inline int restore_highmem(void) {return 0;}
-#endif
-
 /**
  *	@safe_needed - on resume, for storing the PBE list and the image,
  *	we can only use memory pages that do not conflict with the pages
- *	used before suspend.
+ *	used before suspend.  The unsafe pages have PageNosaveFree set
+ *	and we count them using unsafe_pages.
  *
- *	The unsafe pages are marked with the PG_nosave_free flag
- *	and we count them using unsafe_pages
+ *	Each allocated image page is marked as PageNosave and PageNosaveFree
+ *	so that swsusp_free() can release it.
  */
 
 #define PG_ANY		0
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;}
 
 static unsigned int allocated_unsafe_pages;
 
-static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
+static void *get_image_page(gfp_t gfp_mask, int safe_needed)
 {
 	void *res;
 
@@ -195,20 +82,38 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 
 unsigned long get_safe_page(gfp_t gfp_mask)
 {
-	return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
+	return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+}
+
+static struct page *alloc_image_page(gfp_t gfp_mask) {
+	struct page *page;
+
+	page = alloc_page(gfp_mask);
+	if (page) {
+		SetPageNosave(page);
+		SetPageNosaveFree(page);
+	}
+	return page;
 }
 
 /**
  *	free_image_page - free page represented by @addr, allocated with
- *	alloc_image_page (page flags set by it must be cleared)
+ *	get_image_page (page flags set by it must be cleared)
  */
 
 static inline void free_image_page(void *addr, int clear_nosave_free)
 {
-	ClearPageNosave(virt_to_page(addr));
+	struct page *page;
+
+	BUG_ON(!virt_addr_valid(addr));
+
+	page = virt_to_page(addr);
+
+	ClearPageNosave(page);
 	if (clear_nosave_free)
-		ClearPageNosaveFree(virt_to_page(addr));
-	free_page((unsigned long)addr);
+		ClearPageNosaveFree(page);
+
+	__free_page(page);
 }
 
 /* struct linked_page is used to build chains of pages */
@@ -269,7 +174,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 	if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
 		struct linked_page *lp;
 
-		lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
+		lp = get_image_page(ca->gfp_mask, ca->safe_needed);
 		if (!lp)
 			return NULL;
 
@@ -446,8 +351,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 
 	/* Compute the number of zones */
 	nr = 0;
-	for_each_zone (zone)
-		if (populated_zone(zone) && !is_highmem(zone))
+	for_each_zone(zone)
+		if (populated_zone(zone))
 			nr++;
 
 	/* Allocate the list of zones bitmap objects */
@@ -459,10 +364,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 	}
 
 	/* Initialize the zone bitmap objects */
-	for_each_zone (zone) {
+	for_each_zone(zone) {
 		unsigned long pfn;
 
-		if (!populated_zone(zone) || is_highmem(zone))
+		if (!populated_zone(zone))
 			continue;
 
 		zone_bm->start_pfn = zone->zone_start_pfn;
@@ -481,7 +386,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 		while (bb) {
 			unsigned long *ptr;
 
-			ptr = alloc_image_page(gfp_mask, safe_needed);
+			ptr = get_image_page(gfp_mask, safe_needed);
 			bb->data = ptr;
 			if (!ptr)
 				goto Free;
@@ -669,9 +574,81 @@ unsigned int snapshot_additional_pages(struct zone *zone)
 
 	res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
 	res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
-	return res;
+	return 2 * res;
 }
 
+#ifdef CONFIG_HIGHMEM
+/**
+ *	count_free_highmem_pages - compute the total number of free highmem
+ *	pages, system-wide.
+ */
+
+static unsigned int count_free_highmem_pages(void)
+{
+	struct zone *zone;
+	unsigned int cnt = 0;
+
+	for_each_zone(zone)
+		if (populated_zone(zone) && is_highmem(zone))
+			cnt += zone->free_pages;
+
+	return cnt;
+}
+
+/**
+ *	saveable_highmem_page - Determine whether a highmem page should be
+ *	included in the suspend image.
+ *
+ *	We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ *	and it isn't a part of a free chunk of pages.
+ */
+
+static struct page *saveable_highmem_page(unsigned long pfn)
+{
+	struct page *page;
+
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	page = pfn_to_page(pfn);
+
+	BUG_ON(!PageHighMem(page));
+
+	if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
+		return NULL;
+
+	return page;
+}
+
+/**
+ *	count_highmem_pages - compute the total number of saveable highmem
+ *	pages.
+ */
+
+unsigned int count_highmem_pages(void)
+{
+	struct zone *zone;
+	unsigned int n = 0;
+
+	for_each_zone(zone) {
+		unsigned long pfn, max_zone_pfn;
+
+		if (!is_highmem(zone))
+			continue;
+
+		mark_free_pages(zone);
+		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+			if (saveable_highmem_page(pfn))
+				n++;
+	}
+	return n;
+}
+#else
+static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
+static inline unsigned int count_highmem_pages(void) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
 /**
  *	pfn_is_nosave - check if given pfn is in the 'nosave' section
  */
@@ -684,12 +661,12 @@ static inline int pfn_is_nosave(unsigned long pfn)
 }
 
 /**
- *	saveable - Determine whether a page should be cloned or not.
- *	@pfn:	The page
+ *	saveable - Determine whether a non-highmem page should be included in
+ *	the suspend image.
  *
- *	We save a page if it isn't Nosave, and is not in the range of pages
- *	statically defined as 'unsaveable', and it
- *	isn't a part of a free chunk of pages.
+ *	We should save the page if it isn't Nosave, and is not in the range
+ *	of pages statically defined as 'unsaveable', and it isn't a part of
+ *	a free chunk of pages.
  */
 
 static struct page *saveable_page(unsigned long pfn)
@@ -701,76 +678,130 @@ static struct page *saveable_page(unsigned long pfn)
 
 	page = pfn_to_page(pfn);
 
-	if (PageNosave(page))
+	BUG_ON(PageHighMem(page));
+
+	if (PageNosave(page) || PageNosaveFree(page))
 		return NULL;
+
 	if (PageReserved(page) && pfn_is_nosave(pfn))
 		return NULL;
-	if (PageNosaveFree(page))
-		return NULL;
 
 	return page;
 }
 
+/**
+ *	count_data_pages - compute the total number of saveable non-highmem
+ *	pages.
+ */
+
 unsigned int count_data_pages(void)
 {
 	struct zone *zone;
 	unsigned long pfn, max_zone_pfn;
 	unsigned int n = 0;
 
-	for_each_zone (zone) {
+	for_each_zone(zone) {
 		if (is_highmem(zone))
 			continue;
+
 		mark_free_pages(zone);
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
-			n += !!saveable_page(pfn);
+			if(saveable_page(pfn))
+				n++;
 	}
 	return n;
 }
 
-static inline void copy_data_page(long *dst, long *src)
+/* This is needed, because copy_page and memcpy are not usable for copying
+ * task structs.
+ */
+static inline void do_copy_page(long *dst, long *src)
 {
 	int n;
 
-	/* copy_page and memcpy are not usable for copying task structs. */
 	for (n = PAGE_SIZE / sizeof(long); n; n--)
 		*dst++ = *src++;
 }
 
+#ifdef CONFIG_HIGHMEM
+static inline struct page *
+page_is_saveable(struct zone *zone, unsigned long pfn)
+{
+	return is_highmem(zone) ?
+			saveable_highmem_page(pfn) : saveable_page(pfn);
+}
+
+static inline void
+copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+{
+	struct page *s_page, *d_page;
+	void *src, *dst;
+
+	s_page = pfn_to_page(src_pfn);
+	d_page = pfn_to_page(dst_pfn);
+	if (PageHighMem(s_page)) {
+		src = kmap_atomic(s_page, KM_USER0);
+		dst = kmap_atomic(d_page, KM_USER1);
+		do_copy_page(dst, src);
+		kunmap_atomic(src, KM_USER0);
+		kunmap_atomic(dst, KM_USER1);
+	} else {
+		src = page_address(s_page);
+		if (PageHighMem(d_page)) {
+			/* Page pointed to by src may contain some kernel
+			 * data modified by kmap_atomic()
+			 */
+			do_copy_page(buffer, src);
+			dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
+			memcpy(dst, buffer, PAGE_SIZE);
+			kunmap_atomic(dst, KM_USER0);
+		} else {
+			dst = page_address(d_page);
+			do_copy_page(dst, src);
+		}
+	}
+}
+#else
+#define page_is_saveable(zone, pfn)	saveable_page(pfn)
+
+static inline void
+copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+{
+	do_copy_page(page_address(pfn_to_page(dst_pfn)),
+			page_address(pfn_to_page(src_pfn)));
+}
+#endif /* CONFIG_HIGHMEM */
+
 static void
 copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 {
 	struct zone *zone;
 	unsigned long pfn;
 
-	for_each_zone (zone) {
+	for_each_zone(zone) {
 		unsigned long max_zone_pfn;
 
-		if (is_highmem(zone))
-			continue;
-
 		mark_free_pages(zone);
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
-			if (saveable_page(pfn))
+			if (page_is_saveable(zone, pfn))
 				memory_bm_set_bit(orig_bm, pfn);
 	}
 	memory_bm_position_reset(orig_bm);
 	memory_bm_position_reset(copy_bm);
 	do {
 		pfn = memory_bm_next_pfn(orig_bm);
-		if (likely(pfn != BM_END_OF_MAP)) {
-			struct page *page;
-			void *src;
-
-			page = pfn_to_page(pfn);
-			src = page_address(page);
-			page = pfn_to_page(memory_bm_next_pfn(copy_bm));
-			copy_data_page(page_address(page), src);
-		}
+		if (likely(pfn != BM_END_OF_MAP))
+			copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
 	} while (pfn != BM_END_OF_MAP);
 }
 
+/* Total number of image pages */
+static unsigned int nr_copy_pages;
+/* Number of pages needed for saving the original pfns of the image pages */
+static unsigned int nr_meta_pages;
+
 /**
  *	swsusp_free - free pages allocated for the suspend.
  *
@@ -792,7 +823,7 @@ void swsusp_free(void)
 				if (PageNosave(page) && PageNosaveFree(page)) {
 					ClearPageNosave(page);
 					ClearPageNosaveFree(page);
-					free_page((long) page_address(page));
+					__free_page(page);
 				}
 			}
 	}
@@ -802,34 +833,108 @@ void swsusp_free(void)
 	buffer = NULL;
 }
 
+#ifdef CONFIG_HIGHMEM
+/**
+  *	count_pages_for_highmem - compute the number of non-highmem pages
+  *	that will be necessary for creating copies of highmem pages.
+  */
+
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
+{
+	unsigned int free_highmem = count_free_highmem_pages();
+
+	if (free_highmem >= nr_highmem)
+		nr_highmem = 0;
+	else
+		nr_highmem -= free_highmem;
+
+	return nr_highmem;
+}
+#else
+static unsigned int
+count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+#endif /* CONFIG_HIGHMEM */
 
 /**
- *	enough_free_mem - Make sure we enough free memory to snapshot.
- *
- *	Returns TRUE or FALSE after checking the number of available
- *	free pages.
+ *	enough_free_mem - Make sure we have enough free memory for the
+ *	snapshot image.
  */
 
-static int enough_free_mem(unsigned int nr_pages)
+static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 {
 	struct zone *zone;
 	unsigned int free = 0, meta = 0;
 
-	for_each_zone (zone)
-		if (!is_highmem(zone)) {
+	for_each_zone(zone) {
+		meta += snapshot_additional_pages(zone);
+		if (!is_highmem(zone))
 			free += zone->free_pages;
-			meta += snapshot_additional_pages(zone);
-		}
+	}
 
-	pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
+	nr_pages += count_pages_for_highmem(nr_highmem);
+	pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n",
 		nr_pages, PAGES_FOR_IO, meta, free);
 
 	return free > nr_pages + PAGES_FOR_IO + meta;
 }
 
+#ifdef CONFIG_HIGHMEM
+/**
+ *	get_highmem_buffer - if there are some highmem pages in the suspend
+ *	image, we may need the buffer to copy them and/or load their data.
+ */
+
+static inline int get_highmem_buffer(int safe_needed)
+{
+	buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+	return buffer ? 0 : -ENOMEM;
+}
+
+/**
+ *	alloc_highmem_image_pages - allocate some highmem pages for the image.
+ *	Try to allocate as many pages as needed, but if the number of free
+ *	highmem pages is lesser than that, allocate them all.
+ */
+
+static inline unsigned int
+alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+{
+	unsigned int to_alloc = count_free_highmem_pages();
+
+	if (to_alloc > nr_highmem)
+		to_alloc = nr_highmem;
+
+	nr_highmem -= to_alloc;
+	while (to_alloc-- > 0) {
+		struct page *page;
+
+		page = alloc_image_page(__GFP_HIGHMEM);
+		memory_bm_set_bit(bm, page_to_pfn(page));
+	}
+	return nr_highmem;
+}
+#else
+static inline int get_highmem_buffer(int safe_needed) { return 0; }
+
+static inline unsigned int
+alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ *	swsusp_alloc - allocate memory for the suspend image
+ *
+ *	We first try to allocate as many highmem pages as there are
+ *	saveable highmem pages in the system.  If that fails, we allocate
+ *	non-highmem pages for the copies of the remaining highmem ones.
+ *
+ *	In this approach it is likely that the copies of highmem pages will
+ *	also be located in the high memory, because of the way in which
+ *	copy_data_pages() works.
+ */
+
 static int
 swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
-		unsigned int nr_pages)
+		unsigned int nr_pages, unsigned int nr_highmem)
 {
 	int error;
 
@@ -841,13 +946,19 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 	if (error)
 		goto Free;
 
+	if (nr_highmem > 0) {
+		error = get_highmem_buffer(PG_ANY);
+		if (error)
+			goto Free;
+
+		nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
+	}
 	while (nr_pages-- > 0) {
-		struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+		struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+
 		if (!page)
 			goto Free;
 
-		SetPageNosave(page);
-		SetPageNosaveFree(page);
 		memory_bm_set_bit(copy_bm, page_to_pfn(page));
 	}
 	return 0;
@@ -857,30 +968,39 @@ Free:
 	return -ENOMEM;
 }
 
-/* Memory bitmap used for marking saveable pages */
+/* Memory bitmap used for marking saveable pages (during suspend) or the
+ * suspend image pages (during resume)
+ */
 static struct memory_bitmap orig_bm;
-/* Memory bitmap used for marking allocated pages that will contain the copies
- * of saveable pages
+/* Memory bitmap used on suspend for marking allocated pages that will contain
+ * the copies of saveable pages.  During resume it is initially used for
+ * marking the suspend image pages, but then its set bits are duplicated in
+ * @orig_bm and it is released.  Next, on systems with high memory, it may be
+ * used for marking "safe" highmem pages, but it has to be reinitialized for
+ * this purpose.
  */
 static struct memory_bitmap copy_bm;
 
 asmlinkage int swsusp_save(void)
 {
-	unsigned int nr_pages;
+	unsigned int nr_pages, nr_highmem;
 
-	pr_debug("swsusp: critical section: \n");
+	printk("swsusp: critical section: \n");
 
 	drain_local_pages();
 	nr_pages = count_data_pages();
-	printk("swsusp: Need to copy %u pages\n", nr_pages);
+	nr_highmem = count_highmem_pages();
+	printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem);
 
-	if (!enough_free_mem(nr_pages)) {
+	if (!enough_free_mem(nr_pages, nr_highmem)) {
 		printk(KERN_ERR "swsusp: Not enough free memory\n");
 		return -ENOMEM;
 	}
 
-	if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
+	if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
+		printk(KERN_ERR "swsusp: Memory allocation failed\n");
 		return -ENOMEM;
+	}
 
 	/* During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
@@ -894,10 +1014,12 @@ asmlinkage int swsusp_save(void)
 	 * touch swap space! Except we must write out our image of course.
 	 */
 
+	nr_pages += nr_highmem;
 	nr_copy_pages = nr_pages;
-	nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
 
 	printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
+
 	return 0;
 }
 
@@ -960,7 +1082,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
-		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
+		buffer = get_image_page(GFP_ATOMIC, PG_ANY);
 		if (!buffer)
 			return -ENOMEM;
 	}
@@ -975,9 +1097,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 			memset(buffer, 0, PAGE_SIZE);
 			pack_pfns(buffer, &orig_bm);
 		} else {
-			unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+			struct page *page;
 
-			handle->buffer = page_address(pfn_to_page(pfn));
+			page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
+			if (PageHighMem(page)) {
+				/* Highmem pages are copied to the buffer,
+				 * because we can't return with a kmapped
+				 * highmem page (we may not be called again).
+				 */
+				void *kaddr;
+
+				kaddr = kmap_atomic(page, KM_USER0);
+				memcpy(buffer, kaddr, PAGE_SIZE);
+				kunmap_atomic(kaddr, KM_USER0);
+				handle->buffer = buffer;
+			} else {
+				handle->buffer = page_address(page);
+			}
 		}
 		handle->prev = handle->cur;
 	}
@@ -1005,7 +1141,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
 	unsigned long pfn, max_zone_pfn;
 
 	/* Clear page flags */
-	for_each_zone (zone) {
+	for_each_zone(zone) {
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 			if (pfn_valid(pfn))
@@ -1101,6 +1237,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 	}
 }
 
+/* List of "safe" pages that may be used to store data loaded from the suspend
+ * image
+ */
+static struct linked_page *safe_pages_list;
+
+#ifdef CONFIG_HIGHMEM
+/* struct highmem_pbe is used for creating the list of highmem pages that
+ * should be restored atomically during the resume from disk, because the page
+ * frames they have occupied before the suspend are in use.
+ */
+struct highmem_pbe {
+	struct page *copy_page;	/* data is here now */
+	struct page *orig_page;	/* data was here before the suspend */
+	struct highmem_pbe *next;
+};
+
+/* List of highmem PBEs needed for restoring the highmem pages that were
+ * allocated before the suspend and included in the suspend image, but have
+ * also been allocated by the "resume" kernel, so their contents cannot be
+ * written directly to their "original" page frames.
+ */
+static struct highmem_pbe *highmem_pblist;
+
+/**
+ *	count_highmem_image_pages - compute the number of highmem pages in the
+ *	suspend image.  The bits in the memory bitmap @bm that correspond to the
+ *	image pages are assumed to be set.
+ */
+
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
+{
+	unsigned long pfn;
+	unsigned int cnt = 0;
+
+	memory_bm_position_reset(bm);
+	pfn = memory_bm_next_pfn(bm);
+	while (pfn != BM_END_OF_MAP) {
+		if (PageHighMem(pfn_to_page(pfn)))
+			cnt++;
+
+		pfn = memory_bm_next_pfn(bm);
+	}
+	return cnt;
+}
+
+/**
+ *	prepare_highmem_image - try to allocate as many highmem pages as
+ *	there are highmem image pages (@nr_highmem_p points to the variable
+ *	containing the number of highmem image pages).  The pages that are
+ *	"safe" (ie. will not be overwritten when the suspend image is
+ *	restored) have the corresponding bits set in @bm (it must be
+ *	unitialized).
+ *
+ *	NOTE: This function should not be called if there are no highmem
+ *	image pages.
+ */
+
+static unsigned int safe_highmem_pages;
+
+static struct memory_bitmap *safe_highmem_bm;
+
+static int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+	unsigned int to_alloc;
+
+	if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
+		return -ENOMEM;
+
+	if (get_highmem_buffer(PG_SAFE))
+		return -ENOMEM;
+
+	to_alloc = count_free_highmem_pages();
+	if (to_alloc > *nr_highmem_p)
+		to_alloc = *nr_highmem_p;
+	else
+		*nr_highmem_p = to_alloc;
+
+	safe_highmem_pages = 0;
+	while (to_alloc-- > 0) {
+		struct page *page;
+
+		page = alloc_page(__GFP_HIGHMEM);
+		if (!PageNosaveFree(page)) {
+			/* The page is "safe", set its bit the bitmap */
+			memory_bm_set_bit(bm, page_to_pfn(page));
+			safe_highmem_pages++;
+		}
+		/* Mark the page as allocated */
+		SetPageNosave(page);
+		SetPageNosaveFree(page);
+	}
+	memory_bm_position_reset(bm);
+	safe_highmem_bm = bm;
+	return 0;
+}
+
+/**
+ *	get_highmem_page_buffer - for given highmem image page find the buffer
+ *	that suspend_write_next() should set for its caller to write to.
+ *
+ *	If the page is to be saved to its "original" page frame or a copy of
+ *	the page is to be made in the highmem, @buffer is returned.  Otherwise,
+ *	the copy of the page is to be made in normal memory, so the address of
+ *	the copy is returned.
+ *
+ *	If @buffer is returned, the caller of suspend_write_next() will write
+ *	the page's contents to @buffer, so they will have to be copied to the
+ *	right location on the next call to suspend_write_next() and it is done
+ *	with the help of copy_last_highmem_page().  For this purpose, if
+ *	@buffer is returned, @last_highmem page is set to the page to which
+ *	the data will have to be copied from @buffer.
+ */
+
+static struct page *last_highmem_page;
+
+static void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+	struct highmem_pbe *pbe;
+	void *kaddr;
+
+	if (PageNosave(page) && PageNosaveFree(page)) {
+		/* We have allocated the "original" page frame and we can
+		 * use it directly to store the loaded page.
+		 */
+		last_highmem_page = page;
+		return buffer;
+	}
+	/* The "original" page frame has not been allocated and we have to
+	 * use a "safe" page frame to store the loaded page.
+	 */
+	pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
+	if (!pbe) {
+		swsusp_free();
+		return NULL;
+	}
+	pbe->orig_page = page;
+	if (safe_highmem_pages > 0) {
+		struct page *tmp;
+
+		/* Copy of the page will be stored in high memory */
+		kaddr = buffer;
+		tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
+		safe_highmem_pages--;
+		last_highmem_page = tmp;
+		pbe->copy_page = tmp;
+	} else {
+		/* Copy of the page will be stored in normal memory */
+		kaddr = safe_pages_list;
+		safe_pages_list = safe_pages_list->next;
+		pbe->copy_page = virt_to_page(kaddr);
+	}
+	pbe->next = highmem_pblist;
+	highmem_pblist = pbe;
+	return kaddr;
+}
+
+/**
+ *	copy_last_highmem_page - copy the contents of a highmem image from
+ *	@buffer, where the caller of snapshot_write_next() has place them,
+ *	to the right location represented by @last_highmem_page .
+ */
+
+static void copy_last_highmem_page(void)
+{
+	if (last_highmem_page) {
+		void *dst;
+
+		dst = kmap_atomic(last_highmem_page, KM_USER0);
+		memcpy(dst, buffer, PAGE_SIZE);
+		kunmap_atomic(dst, KM_USER0);
+		last_highmem_page = NULL;
+	}
+}
+
+static inline int last_highmem_page_copied(void)
+{
+	return !last_highmem_page;
+}
+
+static inline void free_highmem_data(void)
+{
+	if (safe_highmem_bm)
+		memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
+
+	if (buffer)
+		free_image_page(buffer, PG_UNSAFE_CLEAR);
+}
+#else
+static inline int get_safe_write_buffer(void) { return 0; }
+
+static unsigned int
+count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+
+static inline int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+	return 0;
+}
+
+static inline void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+	return NULL;
+}
+
+static inline void copy_last_highmem_page(void) {}
+static inline int last_highmem_page_copied(void) { return 1; }
+static inline void free_highmem_data(void) {}
+#endif /* CONFIG_HIGHMEM */
+
 /**
  *	prepare_image - use the memory bitmap @bm to mark the pages that will
  *	be overwritten in the process of restoring the system memory state
@@ -1110,20 +1458,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
  *	The idea is to allocate a new memory bitmap first and then allocate
  *	as many pages as needed for the image data, but not to assign these
  *	pages to specific tasks initially.  Instead, we just mark them as
- *	allocated and create a list of "safe" pages that will be used later.
+ *	allocated and create a lists of "safe" pages that will be used
+ *	later.  On systems with high memory a list of "safe" highmem pages is
+ *	also created.
  */
 
 #define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
 
-static struct linked_page *safe_pages_list;
-
 static int
 prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
-	unsigned int nr_pages;
+	unsigned int nr_pages, nr_highmem;
 	struct linked_page *sp_list, *lp;
 	int error;
 
+	/* If there is no highmem, the buffer will not be necessary */
+	free_image_page(buffer, PG_UNSAFE_CLEAR);
+	buffer = NULL;
+
+	nr_highmem = count_highmem_image_pages(bm);
 	error = mark_unsafe_pages(bm);
 	if (error)
 		goto Free;
@@ -1134,6 +1487,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 
 	duplicate_memory_bitmap(new_bm, bm);
 	memory_bm_free(bm, PG_UNSAFE_KEEP);
+	if (nr_highmem > 0) {
+		error = prepare_highmem_image(bm, &nr_highmem);
+		if (error)
+			goto Free;
+	}
 	/* Reserve some safe pages for potential later use.
 	 *
 	 * NOTE: This way we make sure there will be enough safe pages for the
@@ -1142,10 +1500,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 	 */
 	sp_list = NULL;
 	/* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
-	nr_pages = nr_copy_pages - allocated_unsafe_pages;
+	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
 	nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
 	while (nr_pages > 0) {
-		lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
+		lp = get_image_page(GFP_ATOMIC, PG_SAFE);
 		if (!lp) {
 			error = -ENOMEM;
 			goto Free;
@@ -1156,7 +1514,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 	}
 	/* Preallocate memory for the image */
 	safe_pages_list = NULL;
-	nr_pages = nr_copy_pages - allocated_unsafe_pages;
+	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
 	while (nr_pages > 0) {
 		lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
 		if (!lp) {
@@ -1196,6 +1554,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 	struct pbe *pbe;
 	struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
 
+	if (PageHighMem(page))
+		return get_highmem_page_buffer(page, ca);
+
 	if (PageNosave(page) && PageNosaveFree(page))
 		/* We have allocated the "original" page frame and we can
 		 * use it directly to store the loaded page.
@@ -1210,12 +1571,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 		swsusp_free();
 		return NULL;
 	}
-	pbe->orig_address = (unsigned long)page_address(page);
-	pbe->address = (unsigned long)safe_pages_list;
+	pbe->orig_address = page_address(page);
+	pbe->address = safe_pages_list;
 	safe_pages_list = safe_pages_list->next;
 	pbe->next = restore_pblist;
 	restore_pblist = pbe;
-	return (void *)pbe->address;
+	return pbe->address;
 }
 
 /**
@@ -1249,14 +1610,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
 		return 0;
 
-	if (!buffer) {
-		/* This makes the buffer be freed by swsusp_free() */
-		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
+	if (handle->offset == 0) {
+		if (!buffer)
+			/* This makes the buffer be freed by swsusp_free() */
+			buffer = get_image_page(GFP_ATOMIC, PG_ANY);
+
 		if (!buffer)
 			return -ENOMEM;
-	}
-	if (!handle->offset)
+
 		handle->buffer = buffer;
+	}
 	handle->sync_read = 1;
 	if (handle->prev < handle->cur) {
 		if (handle->prev == 0) {
@@ -1284,8 +1647,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 					return -ENOMEM;
 			}
 		} else {
+			copy_last_highmem_page();
 			handle->buffer = get_buffer(&orig_bm, &ca);
-			handle->sync_read = 0;
+			if (handle->buffer != buffer)
+				handle->sync_read = 0;
 		}
 		handle->prev = handle->cur;
 	}
@@ -1301,15 +1666,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 	return count;
 }
 
+/**
+ *	snapshot_write_finalize - must be called after the last call to
+ *	snapshot_write_next() in case the last page in the image happens
+ *	to be a highmem page and its contents should be stored in the
+ *	highmem.  Additionally, it releases the memory that will not be
+ *	used any more.
+ */
+
+void snapshot_write_finalize(struct snapshot_handle *handle)
+{
+	copy_last_highmem_page();
+	/* Free only if we have loaded the image entirely */
+	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
+		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+		free_highmem_data();
+	}
+}
+
 int snapshot_image_loaded(struct snapshot_handle *handle)
 {
-	return !(!nr_copy_pages ||
+	return !(!nr_copy_pages || !last_highmem_page_copied() ||
 			handle->cur <= nr_meta_pages + nr_copy_pages);
 }
 
-void snapshot_free_unused_memory(struct snapshot_handle *handle)
+#ifdef CONFIG_HIGHMEM
+/* Assumes that @buf is ready and points to a "safe" page */
+static inline void
+swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
 {
-	/* Free only if we have loaded the image entirely */
-	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
-		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+	void *kaddr1, *kaddr2;
+
+	kaddr1 = kmap_atomic(p1, KM_USER0);
+	kaddr2 = kmap_atomic(p2, KM_USER1);
+	memcpy(buf, kaddr1, PAGE_SIZE);
+	memcpy(kaddr1, kaddr2, PAGE_SIZE);
+	memcpy(kaddr2, buf, PAGE_SIZE);
+	kunmap_atomic(kaddr1, KM_USER0);
+	kunmap_atomic(kaddr2, KM_USER1);
+}
+
+/**
+ *	restore_highmem - for each highmem page that was allocated before
+ *	the suspend and included in the suspend image, and also has been
+ *	allocated by the "resume" kernel swap its current (ie. "before
+ *	resume") contents with the previous (ie. "before suspend") one.
+ *
+ *	If the resume eventually fails, we can call this function once
+ *	again and restore the "before resume" highmem state.
+ */
+
+int restore_highmem(void)
+{
+	struct highmem_pbe *pbe = highmem_pblist;
+	void *buf;
+
+	if (!pbe)
+		return 0;
+
+	buf = get_image_page(GFP_ATOMIC, PG_SAFE);
+	if (!buf)
+		return -ENOMEM;
+
+	while (pbe) {
+		swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
+		pbe = pbe->next;
+	}
+	free_image_page(buf, PG_UNSAFE_CLEAR);
+	return 0;
 }
+#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aa5a9bff01f1..cbd187e90410 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -558,7 +558,7 @@ static int load_image(struct swap_map_handle *handle,
 		error = err2;
 	if (!error) {
 		printk("\b\b\b\bdone\n");
-		snapshot_free_unused_memory(snapshot);
+		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
 	}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4147a756a8c7..68de5c1dbd78 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -64,10 +64,8 @@ int in_suspend __nosavedata = 0;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void);
-int save_highmem(void);
 int restore_highmem(void);
 #else
-static inline int save_highmem(void) { return 0; }
 static inline int restore_highmem(void) { return 0; }
 static inline unsigned int count_highmem_pages(void) { return 0; }
 #endif
@@ -184,7 +182,7 @@ static inline unsigned long __shrink_memory(long tmp)
 
 int swsusp_shrink_memory(void)
 {
-	long size, tmp;
+	long tmp;
 	struct zone *zone;
 	unsigned long pages = 0;
 	unsigned int i = 0;
@@ -192,15 +190,27 @@ int swsusp_shrink_memory(void)
 
 	printk("Shrinking memory...  ");
 	do {
-		size = 2 * count_highmem_pages();
-		size += size / 50 + count_data_pages() + PAGES_FOR_IO;
+		long size, highmem_size;
+
+		highmem_size = count_highmem_pages();
+		size = count_data_pages() + PAGES_FOR_IO;
 		tmp = size;
+		size += highmem_size;
 		for_each_zone (zone)
-			if (!is_highmem(zone) && populated_zone(zone)) {
-				tmp -= zone->free_pages;
-				tmp += zone->lowmem_reserve[ZONE_NORMAL];
-				tmp += snapshot_additional_pages(zone);
+			if (populated_zone(zone)) {
+				if (is_highmem(zone)) {
+					highmem_size -= zone->free_pages;
+				} else {
+					tmp -= zone->free_pages;
+					tmp += zone->lowmem_reserve[ZONE_NORMAL];
+					tmp += snapshot_additional_pages(zone);
+				}
 			}
+
+		if (highmem_size < 0)
+			highmem_size = 0;
+
+		tmp += highmem_size;
 		if (tmp > 0) {
 			tmp = __shrink_memory(tmp);
 			if (!tmp)
@@ -223,6 +233,7 @@ int swsusp_suspend(void)
 
 	if ((error = arch_prepare_suspend()))
 		return error;
+
 	local_irq_disable();
 	/* At this point, device_suspend() has been called, but *not*
 	 * device_power_down(). We *must* device_power_down() now.
@@ -235,18 +246,11 @@ int swsusp_suspend(void)
 		goto Enable_irqs;
 	}
 
-	if ((error = save_highmem())) {
-		printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
-		goto Restore_highmem;
-	}
-
 	save_processor_state();
 	if ((error = swsusp_arch_suspend()))
 		printk(KERN_ERR "Error %d suspending\n", error);
 	/* Restore control flow magically appears here */
 	restore_processor_state();
-Restore_highmem:
-	restore_highmem();
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
@@ -268,18 +272,23 @@ int swsusp_resume(void)
 		printk(KERN_ERR "Some devices failed to power down, very bad\n");
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
-	error = swsusp_arch_resume();
-	/* Code below is only ever reached in case of failure. Otherwise
-	 * execution continues at place where swsusp_arch_suspend was called
-         */
-	BUG_ON(!error);
+	error = restore_highmem();
+	if (!error) {
+		error = swsusp_arch_resume();
+		/* The code below is only ever reached in case of a failure.
+		 * Otherwise execution continues at place where
+		 * swsusp_arch_suspend() was called
+        	 */
+		BUG_ON(!error);
+		/* This call to restore_highmem() undos the previous one */
+		restore_highmem();
+	}
 	/* The only reason why swsusp_arch_resume() can fail is memory being
 	 * very tight, so we have to free it as soon as we can to avoid
 	 * subsequent failures
 	 */
 	swsusp_free();
 	restore_processor_state();
-	restore_highmem();
 	touch_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 05c58a2c0dd4..a63b25c63b49 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -194,12 +194,12 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		break;
 
 	case SNAPSHOT_ATOMIC_RESTORE:
+		snapshot_write_finalize(&data->handle);
 		if (data->mode != O_WRONLY || !data->frozen ||
 		    !snapshot_image_loaded(&data->handle)) {
 			error = -EPERM;
 			break;
 		}
-		snapshot_free_unused_memory(&data->handle);
 		down(&pm_sem);
 		pm_prepare_console();
 		suspend_console();
-- 
cgit v1.2.3


From 859491218770315ba95ee3fa09961fc71c506cae Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:19 -0800
Subject: [PATCH] swsusp: use __GFP_WAIT

swsusp uses GFP_ATOMIC, but it can afford to use __GFP_WAIT, which will
permit it to reclaim clean pagecache instead of emitting scary
page-allocation-failure messages.

Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index cbd187e90410..52e70ca832a8 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -63,7 +63,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
 {
 	struct bio *bio;
 
-	bio = bio_alloc(GFP_ATOMIC, 1);
+	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
 	if (!bio)
 		return -ENOMEM;
 	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
@@ -216,7 +216,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 		return -ENOSPC;
 
 	if (bio_chain) {
-		src = (void *)__get_free_page(GFP_ATOMIC);
+		src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
 		if (src) {
 			memcpy(src, buf, PAGE_SIZE);
 		} else {
@@ -473,7 +473,7 @@ static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
 	if (!start)
 		return -EINVAL;
 
-	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+	handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
 	if (!handle->cur)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 8a05aac2631aa0e6494d9dc990f8c68ed8b8fde7 Mon Sep 17 00:00:00 2001
From: Stefan Seyfried <seife@suse.de>
Date: Wed, 6 Dec 2006 20:34:21 -0800
Subject: [PATCH] swsusp: fix platform mode

At some point after 2.6.13, in-kernel software suspend got "incomplete" for
the so-called "platform" mode.  pm_ops->prepare() is never called.  A
visible sign of this is the "moon" light on thinkpads not flashing during
suspend.  Fix by readding the pm_ops->prepare call during suspend.

Signed-off-by: Stefan Seyfried <seife@suse.de>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d79feeb45459..f5079231383e 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -29,6 +29,22 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
 
+/**
+ *	platform_prepare - prepare the machine for hibernation using the
+ *	platform driver if so configured and return an error code if it fails
+ */
+
+static inline int platform_prepare(void)
+{
+	int error = 0;
+
+	if (pm_disk_mode == PM_DISK_PLATFORM) {
+		if (pm_ops && pm_ops->prepare)
+			error = pm_ops->prepare(PM_SUSPEND_DISK);
+	}
+	return error;
+}
+
 /**
  *	power_down - Shut machine down for hibernate.
  *	@mode:		Suspend-to-disk mode
@@ -91,9 +107,15 @@ static int prepare_processes(void)
 		goto thaw;
 	}
 
+	error = platform_prepare();
+	if (error)
+		goto thaw;
+
 	/* Free memory before shutting down devices. */
 	if (!(error = swsusp_shrink_memory()))
 		return 0;
+
+	platform_finish();
 thaw:
 	thaw_processes();
 enable_cpus:
-- 
cgit v1.2.3


From 7dfb71030f7636a0d65200158113c37764552f93 Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <ncunningham@linuxmail.org>
Date: Wed, 6 Dec 2006 20:34:23 -0800
Subject: [PATCH] Add include/linux/freezer.h and move definitions from sched.h

Move process freezing functions from include/linux/sched.h to freezer.h, so
that modifications to the freezer or the kernel configuration don't require
recompiling just about everything.

[akpm@osdl.org: fix ueagle driver]
Signed-off-by: Nigel Cunningham <nigel@suspend2.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/audit.c          | 1 +
 kernel/power/disk.c     | 1 +
 kernel/power/main.c     | 1 +
 kernel/power/process.c  | 1 +
 kernel/power/user.c     | 1 +
 kernel/rtmutex-tester.c | 1 +
 kernel/sched.c          | 2 +-
 kernel/signal.c         | 1 +
 8 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6078b0..d9b690ac684b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -57,6 +57,7 @@
 #include <linux/netlink.h>
 #include <linux/selinux.h>
 #include <linux/inotify.h>
+#include <linux/freezer.h>
 
 #include "audit.h"
 
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f5079231383e..53b3b57c0223 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -20,6 +20,7 @@
 #include <linux/pm.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
+#include <linux/freezer.h>
 
 #include "power.h"
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 873228c71dab..6096c71b182b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -18,6 +18,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/resume-trace.h>
+#include <linux/freezer.h>
 
 #include "power.h"
 
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 72e72d2c61e6..29be608e8349 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,6 +13,7 @@
 #include <linux/suspend.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/freezer.h>
 
 /* 
  * Timeout for stopping processes
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a63b25c63b49..26c66941c001 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
+#include <linux/freezer.h>
 
 #include <asm/uaccess.h>
 
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 6dcea9dd8c94..015fc633c96c 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -13,6 +13,7 @@
 #include <linux/spinlock.h>
 #include <linux/sysdev.h>
 #include <linux/timer.h>
+#include <linux/freezer.h>
 
 #include "rtmutex.h"
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 3399701c680e..12fdbef1d9bf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,7 +34,7 @@
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 8e19d2785486..bc972d7e6319 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/capability.h>
+#include <linux/freezer.h>
 #include <asm/param.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
-- 
cgit v1.2.3


From 32d50f57dab94d8c46566a903bbb633ee72fdcc2 Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <ncunningham@linuxmail.org>
Date: Wed, 6 Dec 2006 20:34:25 -0800
Subject: [PATCH] swsusp: quieten Freezer if !CONFIG_PM_DEBUG

The freezer currently prints an '=' for every process that is frozen.  This
is pretty pointless, as the equals sign says nothing about which process is
frozen, and makes logs look messier (especially if there were a large
number of processes running).  All we really need to know is that we
started trying to freeze processes and what processes (if any) failed to
freeze, or that we succeeded.

Signed-off-by: Nigel Cunningham <nigel@suspend2.net>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/process.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 29be608e8349..b0edfc6f2798 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -40,7 +40,6 @@ void refrigerator(void)
 	long save;
 	save = current->state;
 	pr_debug("%s entered refrigerator\n", current->comm);
-	printk("=");
 
 	frozen_process(current);
 	spin_lock_irq(&current->sighand->siglock);
-- 
cgit v1.2.3


From 14b5b7cfaa110b1d25b8f80b01a8c97cf2db30bc Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <ncunningham@linuxmail.org>
Date: Wed, 6 Dec 2006 20:34:26 -0800
Subject: [PATCH] swsusp: clean up whitespace in freezer output

Minor whitespace and formatting modifications for the freezer.

Signed-off-by: Nigel Cunningham <nigel@suspend2.net>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/process.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index b0edfc6f2798..fedabad5a180 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -86,7 +86,7 @@ int freeze_processes(void)
 	unsigned long start_time;
 	struct task_struct *g, *p;
 
-	printk( "Stopping tasks: " );
+	printk("Stopping tasks... ");
 	start_time = jiffies;
 	user_frozen = 0;
 	do {
@@ -134,21 +134,21 @@ int freeze_processes(void)
 	 * but it cleans up leftover PF_FREEZE requests.
 	 */
 	if (todo) {
-		printk( "\n" );
-		printk(KERN_ERR " stopping tasks timed out "
+		printk("\n");
+		printk(KERN_ERR "Stopping tasks timed out "
 			"after %d seconds (%d tasks remaining):\n",
 			TIMEOUT / HZ, todo);
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
 			if (freezeable(p) && !frozen(p))
-				printk(KERN_ERR "  %s\n", p->comm);
+				printk(KERN_ERR " %s\n", p->comm);
 			cancel_freezing(p);
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 		return todo;
 	}
 
-	printk( "|\n" );
+	printk("done.\n");
 	BUG_ON(in_atomic());
 	return 0;
 }
@@ -157,18 +157,18 @@ void thaw_processes(void)
 {
 	struct task_struct *g, *p;
 
-	printk( "Restarting tasks..." );
+	printk("Restarting tasks... ");
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		if (!freezeable(p))
 			continue;
 		if (!thaw_process(p))
-			printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
+			printk(KERN_INFO "Strange, %s not stopped\n", p->comm);
 	} while_each_thread(g, p);
 
 	read_unlock(&tasklist_lock);
 	schedule();
-	printk( " done\n" );
+	printk("done.\n");
 }
 
 EXPORT_SYMBOL(refrigerator);
-- 
cgit v1.2.3


From ff39593ad0ff7a79a3717edac6634407aa8200c2 Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <ncunningham@linuxmail.org>
Date: Wed, 6 Dec 2006 20:34:28 -0800
Subject: [PATCH] swsusp: thaw userspace and kernel space separately

Modify process thawing so that we can thaw kernel space without thawing
userspace, and thaw kernelspace first.  This will be useful in later
patches, where I intend to get swsusp thawing kernel threads only before
seeking to free memory.

Signed-off-by: Nigel Cunningham <nigel@suspend2.net>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/process.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index fedabad5a180..cba8a5890eda 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -153,18 +153,29 @@ int freeze_processes(void)
 	return 0;
 }
 
-void thaw_processes(void)
+void thaw_some_processes(int all)
 {
 	struct task_struct *g, *p;
+	int pass = 0; /* Pass 0 = Kernel space, 1 = Userspace */
 
 	printk("Restarting tasks... ");
 	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		if (!freezeable(p))
-			continue;
-		if (!thaw_process(p))
-			printk(KERN_INFO "Strange, %s not stopped\n", p->comm);
-	} while_each_thread(g, p);
+	do {
+		do_each_thread(g, p) {
+			/*
+			 * is_user = 0 if kernel thread or borrowed mm,
+			 * 1 otherwise.
+			 */
+			int is_user = !!(p->mm && !(p->flags & PF_BORROWED_MM));
+			if (!freezeable(p) || (is_user != pass))
+				continue;
+			if (!thaw_process(p))
+				printk(KERN_INFO
+					"Strange, %s not stopped\n", p->comm);
+		} while_each_thread(g, p);
+
+		pass++;
+	} while (pass < 2 && all);
 
 	read_unlock(&tasklist_lock);
 	schedule();
-- 
cgit v1.2.3


From 2d4a34c9365c6e3f94a5b26ce296e1fce9b66c8b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:29 -0800
Subject: [PATCH] swsusp: Support i386 systems with PAE or without PSE

Make swsusp support i386 systems with PAE or without PSE.

This is done by creating temporary page tables located in resume-safe page
frames before the suspend image is restored in the same way as x86_64 does
it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Andi Kleen <ak@suse.de>
Cc: Dave Jones <davej@redhat.com>
Cc: Nigel Cunningham <ncunningham@linuxmail.org>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 825068ca3479..710ed084e7c5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -78,7 +78,7 @@ config PM_SYSFS_DEPRECATED
 
 config SOFTWARE_SUSPEND
 	bool "Software Suspend"
-	depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP))
+	depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
 	---help---
 	  Enable the possibility of suspending the machine.
 	  It doesn't need ACPI or APM.
-- 
cgit v1.2.3


From 112cecb2cc0e7341db92281ba04b26c41bb8146d Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Wed, 6 Dec 2006 20:34:31 -0800
Subject: [PATCH] suspend: don't change cpus_allowed for task initiating the
 suspend

Don't modify the cpus_allowed of the task initiating the suspend.
_cpu_down() already makes sure that the task doing the suspend doesn't run
on dying cpu.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpu.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 272254f20d97..9124669f4586 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -270,11 +270,7 @@ int disable_nonboot_cpus(void)
 			goto out;
 		}
 	}
-	error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
-	if (error) {
-		printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
-		goto out;
-	}
+
 	/* We take down all of the non-boot CPUs in one shot to avoid races
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
-- 
cgit v1.2.3


From 0d3a9abe8ae055e1052295698bcd0722c92eff47 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:32 -0800
Subject: [PATCH] swsusp: Measure memory shrinking time

Make swsusp measure and print the time needed to shrink memory during the
suspend.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h  |  4 ++++
 kernel/power/swap.c   | 24 ++----------------------
 kernel/power/swsusp.c | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 3763343bde2f..3afa5dbe6bc5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -171,3 +171,7 @@ extern int swsusp_read(void);
 extern int swsusp_write(void);
 extern void swsusp_close(void);
 extern int suspend_enter(suspend_state_t state);
+
+struct timeval;
+extern void swsusp_show_speed(struct timeval *, struct timeval *,
+				unsigned int, char *);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 52e70ca832a8..dedf8797b723 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -133,26 +133,6 @@ static int wait_on_bio_chain(struct bio **bio_chain)
 	return ret;
 }
 
-static void show_speed(struct timeval *start, struct timeval *stop,
-			unsigned nr_pages, char *msg)
-{
-	s64 elapsed_centisecs64;
-	int centisecs;
-	int k;
-	int kps;
-
-	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
-	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
-	centisecs = elapsed_centisecs64;
-	if (centisecs == 0)
-		centisecs = 1;	/* avoid div-by-zero */
-	k = nr_pages * (PAGE_SIZE / 1024);
-	kps = (k * 100) / centisecs;
-	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
-			centisecs / 100, centisecs % 100,
-			kps / 1000, (kps % 1000) / 10);
-}
-
 /*
  * Saving part
  */
@@ -375,7 +355,7 @@ static int save_image(struct swap_map_handle *handle,
 		error = err2;
 	if (!error)
 		printk("\b\b\b\bdone\n");
-	show_speed(&start, &stop, nr_to_write, "Wrote");
+	swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
 	return error;
 }
 
@@ -562,7 +542,7 @@ static int load_image(struct swap_map_handle *handle,
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
 	}
-	show_speed(&start, &stop, nr_to_read, "Read");
+	swsusp_show_speed(&start, &stop, nr_to_read, "Read");
 	return error;
 }
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 68de5c1dbd78..aa31432bbd97 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -49,6 +49,7 @@
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/highmem.h>
+#include <linux/time.h>
 
 #include "power.h"
 
@@ -163,6 +164,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
 	}
 }
 
+/**
+ *	swsusp_show_speed - print the time elapsed between two events represented by
+ *	@start and @stop
+ *
+ *	@nr_pages -	number of pages processed between @start and @stop
+ *	@msg -		introductory message to print
+ */
+
+void swsusp_show_speed(struct timeval *start, struct timeval *stop,
+			unsigned nr_pages, char *msg)
+{
+	s64 elapsed_centisecs64;
+	int centisecs;
+	int k;
+	int kps;
+
+	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+	centisecs = elapsed_centisecs64;
+	if (centisecs == 0)
+		centisecs = 1;	/* avoid div-by-zero */
+	k = nr_pages * (PAGE_SIZE / 1024);
+	kps = (k * 100) / centisecs;
+	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
+			centisecs / 100, centisecs % 100,
+			kps / 1000, (kps % 1000) / 10);
+}
+
 /**
  *	swsusp_shrink_memory -  Try to free as much memory as needed
  *
@@ -187,8 +216,10 @@ int swsusp_shrink_memory(void)
 	unsigned long pages = 0;
 	unsigned int i = 0;
 	char *p = "-\\|/";
+	struct timeval start, stop;
 
 	printk("Shrinking memory...  ");
+	do_gettimeofday(&start);
 	do {
 		long size, highmem_size;
 
@@ -222,7 +253,9 @@ int swsusp_shrink_memory(void)
 		}
 		printk("\b%c", p[i++%4]);
 	} while (tmp > 0);
+	do_gettimeofday(&stop);
 	printk("\bdone (%lu pages freed)\n", pages);
+	swsusp_show_speed(&start, &stop, pages, "Freed");
 
 	return 0;
 }
-- 
cgit v1.2.3


From 3eb1b3a40722cbb46631db373af66d13d1e7ac81 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:34 -0800
Subject: [PATCH] suspend to disk fails if gdb is suspended with a traced child

Fix http://bugzilla.kernel.org/show_bug.cgi?id=7534

Fix the freezing of processes so that it won't fail if there is a traced
process the parent of which has been stopped.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: maurice barnum <pixi+kbug@burble.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/process.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index cba8a5890eda..1badb9a89ade 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -97,7 +97,9 @@ int freeze_processes(void)
 				continue;
 			if (frozen(p))
 				continue;
-			if (p->state == TASK_TRACED && frozen(p->parent)) {
+			if (p->state == TASK_TRACED &&
+			    (frozen(p->parent) ||
+			     p->parent->state == TASK_STOPPED)) {
 				cancel_freezing(p);
 				continue;
 			}
-- 
cgit v1.2.3


From a6d70980602e6f1869ebcdcbfaf55a0a5941583e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 6 Dec 2006 20:34:35 -0800
Subject: [PATCH] convert pm_sem to a mutex

The power management semaphore is only used as mutex, so convert it.

[akpm@osdl.org: fix rotten bug]
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c  | 16 ++++++++--------
 kernel/power/main.c  | 10 +++++-----
 kernel/power/power.h |  4 +++-
 kernel/power/user.c  | 24 ++++++++++++------------
 4 files changed, 28 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 53b3b57c0223..08d9e7aac0f8 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -213,10 +213,10 @@ static int software_resume(void)
 {
 	int error;
 
-	down(&pm_sem);
+	mutex_lock(&pm_mutex);
 	if (!swsusp_resume_device) {
 		if (!strlen(resume_file)) {
-			up(&pm_sem);
+			mutex_unlock(&pm_mutex);
 			return -ENOENT;
 		}
 		swsusp_resume_device = name_to_dev_t(resume_file);
@@ -231,7 +231,7 @@ static int software_resume(void)
 		 * FIXME: If noresume is specified, we need to find the partition
 		 * and reset it back to normal swap space.
 		 */
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		return 0;
 	}
 
@@ -275,7 +275,7 @@ static int software_resume(void)
 	unprepare_processes();
  Done:
 	/* For success case, the suspend path will release the lock */
-	up(&pm_sem);
+	mutex_unlock(&pm_mutex);
 	pr_debug("PM: Resume from disk failed.\n");
 	return 0;
 }
@@ -336,7 +336,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 	p = memchr(buf, '\n', n);
 	len = p ? p - buf : n;
 
-	down(&pm_sem);
+	mutex_lock(&pm_mutex);
 	for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
 		if (!strncmp(buf, pm_disk_modes[i], len)) {
 			mode = i;
@@ -360,7 +360,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 
 	pr_debug("PM: suspend-to-disk mode set to '%s'\n",
 		 pm_disk_modes[mode]);
-	up(&pm_sem);
+	mutex_unlock(&pm_mutex);
 	return error ? error : n;
 }
 
@@ -385,9 +385,9 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
 	if (maj != MAJOR(res) || min != MINOR(res))
 		goto out;
 
-	down(&pm_sem);
+	mutex_lock(&pm_mutex);
 	swsusp_resume_device = res;
-	up(&pm_sem);
+	mutex_unlock(&pm_mutex);
 	printk("Attempting manual resume\n");
 	noresume = 0;
 	software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6096c71b182b..751157b7897e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -25,7 +25,7 @@
 /*This is just an arbitrary number */
 #define FREE_PAGE_NUMBER (100)
 
-DECLARE_MUTEX(pm_sem);
+DEFINE_MUTEX(pm_mutex);
 
 struct pm_ops *pm_ops;
 suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
@@ -37,9 +37,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
 
 void pm_set_ops(struct pm_ops * ops)
 {
-	down(&pm_sem);
+	mutex_lock(&pm_mutex);
 	pm_ops = ops;
-	up(&pm_sem);
+	mutex_unlock(&pm_mutex);
 }
 
 
@@ -183,7 +183,7 @@ static int enter_state(suspend_state_t state)
 
 	if (!valid_state(state))
 		return -ENODEV;
-	if (down_trylock(&pm_sem))
+	if (!mutex_trylock(&pm_mutex))
 		return -EBUSY;
 
 	if (state == PM_SUSPEND_DISK) {
@@ -201,7 +201,7 @@ static int enter_state(suspend_state_t state)
 	pr_debug("PM: Finishing wakeup.\n");
 	suspend_finish(state);
  Unlock:
-	up(&pm_sem);
+	mutex_unlock(&pm_mutex);
 	return error;
 }
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 3afa5dbe6bc5..eb461b816bf4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void)
 	return -EPERM;
 }
 #endif
-extern struct semaphore pm_sem;
+
+extern struct mutex pm_mutex;
+
 #define power_attr(_name) \
 static struct subsys_attribute _name##_attr = {	\
 	.attr	= {				\
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 26c66941c001..905dc269c3fc 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -79,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
 	free_all_swap_pages(data->swap, data->bitmap);
 	free_bitmap(data->bitmap);
 	if (data->frozen) {
-		down(&pm_sem);
+		mutex_lock(&pm_mutex);
 		thaw_processes();
 		enable_nonboot_cpus();
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 	}
 	atomic_inc(&device_available);
 	return 0;
@@ -144,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 	case SNAPSHOT_FREEZE:
 		if (data->frozen)
 			break;
-		down(&pm_sem);
+		mutex_lock(&pm_mutex);
 		error = disable_nonboot_cpus();
 		if (!error) {
 			error = freeze_processes();
@@ -154,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 				error = -EBUSY;
 			}
 		}
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		if (!error)
 			data->frozen = 1;
 		break;
@@ -162,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 	case SNAPSHOT_UNFREEZE:
 		if (!data->frozen)
 			break;
-		down(&pm_sem);
+		mutex_lock(&pm_mutex);
 		thaw_processes();
 		enable_nonboot_cpus();
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		data->frozen = 0;
 		break;
 
@@ -174,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			error = -EPERM;
 			break;
 		}
-		down(&pm_sem);
+		mutex_lock(&pm_mutex);
 		/* Free memory before shutting down devices. */
 		error = swsusp_shrink_memory();
 		if (!error) {
@@ -187,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			}
 			resume_console();
 		}
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		if (!error)
 			error = put_user(in_suspend, (unsigned int __user *)arg);
 		if (!error)
@@ -201,7 +201,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			error = -EPERM;
 			break;
 		}
-		down(&pm_sem);
+		mutex_lock(&pm_mutex);
 		pm_prepare_console();
 		suspend_console();
 		error = device_suspend(PMSG_PRETHAW);
@@ -211,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		}
 		resume_console();
 		pm_restore_console();
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		break;
 
 	case SNAPSHOT_FREE:
@@ -286,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			break;
 		}
 
-		if (down_trylock(&pm_sem)) {
+		if (!mutex_trylock(&pm_mutex)) {
 			error = -EBUSY;
 			break;
 		}
@@ -314,7 +314,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			pm_ops->finish(PM_SUSPEND_MEM);
 
 OutS3:
-		up(&pm_sem);
+		mutex_unlock(&pm_mutex);
 		break;
 
 	case SNAPSHOT_PMOPS:
-- 
cgit v1.2.3


From a9b6f562f14dc28fb4b2415f0f275cede0abe9b5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:37 -0800
Subject: [PATCH] swsusp: Untangle thaw_processes

Move the loop from thaw_processes() to a separate function and call it
independently for kernel threads and user space processes so that the order
of thawing tasks is clearly visible.

Drop thaw_kernel_threads() which is never used.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/process.c | 49 +++++++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 1badb9a89ade..fd0ebb942f50 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -20,6 +20,8 @@
  */
 #define TIMEOUT	(20 * HZ)
 
+#define FREEZER_KERNEL_THREADS 0
+#define FREEZER_USER_SPACE 1
 
 static inline int freezeable(struct task_struct * p)
 {
@@ -79,6 +81,11 @@ static void cancel_freezing(struct task_struct *p)
 	}
 }
 
+static inline int is_user_space(struct task_struct *p)
+{
+	return p->mm && !(p->flags & PF_BORROWED_MM);
+}
+
 /* 0 = success, else # of processes that we failed to stop */
 int freeze_processes(void)
 {
@@ -103,10 +110,9 @@ int freeze_processes(void)
 				cancel_freezing(p);
 				continue;
 			}
-			if (p->mm && !(p->flags & PF_BORROWED_MM)) {
-				/* The task is a user-space one.
-				 * Freeze it unless there's a vfork completion
-				 * pending
+			if (is_user_space(p)) {
+				/* Freeze the task unless there is a vfork
+				 * completion pending
 				 */
 				if (!p->vfork_done)
 					freeze_process(p);
@@ -155,31 +161,30 @@ int freeze_processes(void)
 	return 0;
 }
 
-void thaw_some_processes(int all)
+static void thaw_tasks(int thaw_user_space)
 {
 	struct task_struct *g, *p;
-	int pass = 0; /* Pass 0 = Kernel space, 1 = Userspace */
 
-	printk("Restarting tasks... ");
 	read_lock(&tasklist_lock);
-	do {
-		do_each_thread(g, p) {
-			/*
-			 * is_user = 0 if kernel thread or borrowed mm,
-			 * 1 otherwise.
-			 */
-			int is_user = !!(p->mm && !(p->flags & PF_BORROWED_MM));
-			if (!freezeable(p) || (is_user != pass))
-				continue;
-			if (!thaw_process(p))
-				printk(KERN_INFO
-					"Strange, %s not stopped\n", p->comm);
-		} while_each_thread(g, p);
+	do_each_thread(g, p) {
+		if (!freezeable(p))
+			continue;
 
-		pass++;
-	} while (pass < 2 && all);
+		if (is_user_space(p) == !thaw_user_space)
+			continue;
 
+		if (!thaw_process(p))
+			printk(KERN_WARNING " Strange, %s not stopped\n",
+				p->comm );
+	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
+}
+
+void thaw_processes(void)
+{
+	printk("Restarting tasks ... ");
+	thaw_tasks(FREEZER_KERNEL_THREADS);
+	thaw_tasks(FREEZER_USER_SPACE);
 	schedule();
 	printk("done.\n");
 }
-- 
cgit v1.2.3


From 11b2ce2ba90f801e2a5ebba4e6b7da72d87f2b13 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:40 -0800
Subject: [PATCH] swsusp: Untangle freeze_processes

Move the loop from freeze_processes() to a separate function and call it
independently for user space processes and kernel threads so that the order
of freezing tasks is clearly visible.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/process.c | 84 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index fd0ebb942f50..99eeb119b06d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -86,24 +86,23 @@ static inline int is_user_space(struct task_struct *p)
 	return p->mm && !(p->flags & PF_BORROWED_MM);
 }
 
-/* 0 = success, else # of processes that we failed to stop */
-int freeze_processes(void)
+static unsigned int try_to_freeze_tasks(int freeze_user_space)
 {
-	int todo, nr_user, user_frozen;
-	unsigned long start_time;
 	struct task_struct *g, *p;
+	unsigned long end_time;
+	unsigned int todo;
 
-	printk("Stopping tasks... ");
-	start_time = jiffies;
-	user_frozen = 0;
+	end_time = jiffies + TIMEOUT;
 	do {
-		nr_user = todo = 0;
+		todo = 0;
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
 			if (!freezeable(p))
 				continue;
+
 			if (frozen(p))
 				continue;
+
 			if (p->state == TASK_TRACED &&
 			    (frozen(p->parent) ||
 			     p->parent->state == TASK_STOPPED)) {
@@ -111,51 +110,76 @@ int freeze_processes(void)
 				continue;
 			}
 			if (is_user_space(p)) {
+				if (!freeze_user_space)
+					continue;
+
 				/* Freeze the task unless there is a vfork
 				 * completion pending
 				 */
 				if (!p->vfork_done)
 					freeze_process(p);
-				nr_user++;
 			} else {
-				/* Freeze only if the user space is frozen */
-				if (user_frozen)
-					freeze_process(p);
-				todo++;
+				if (freeze_user_space)
+					continue;
+
+				freeze_process(p);
 			}
+			todo++;
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
-		todo += nr_user;
-		if (!user_frozen && !nr_user) {
-			sys_sync();
-			start_time = jiffies;
-		}
-		user_frozen = !nr_user;
 		yield();			/* Yield is okay here */
-		if (todo && time_after(jiffies, start_time + TIMEOUT))
+		if (todo && time_after(jiffies, end_time))
 			break;
-	} while(todo);
+	} while (todo);
 
-	/* This does not unfreeze processes that are already frozen
-	 * (we have slightly ugly calling convention in that respect,
-	 * and caller must call thaw_processes() if something fails),
-	 * but it cleans up leftover PF_FREEZE requests.
-	 */
 	if (todo) {
+		/* This does not unfreeze processes that are already frozen
+		 * (we have slightly ugly calling convention in that respect,
+		 * and caller must call thaw_processes() if something fails),
+		 * but it cleans up leftover PF_FREEZE requests.
+		 */
 		printk("\n");
-		printk(KERN_ERR "Stopping tasks timed out "
-			"after %d seconds (%d tasks remaining):\n",
-			TIMEOUT / HZ, todo);
+		printk(KERN_ERR "Stopping %s timed out after %d seconds "
+				"(%d tasks refusing to freeze):\n",
+				freeze_user_space ? "user space processes" :
+					"kernel threads",
+				TIMEOUT / HZ, todo);
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
+			if (is_user_space(p) == !freeze_user_space)
+				continue;
+
 			if (freezeable(p) && !frozen(p))
 				printk(KERN_ERR " %s\n", p->comm);
+
 			cancel_freezing(p);
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
-		return todo;
 	}
 
+	return todo;
+}
+
+/**
+ *	freeze_processes - tell processes to enter the refrigerator
+ *
+ *	Returns 0 on success, or the number of processes that didn't freeze,
+ *	although they were told to.
+ */
+int freeze_processes(void)
+{
+	unsigned int nr_unfrozen;
+
+	printk("Stopping tasks ... ");
+	nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
+	if (nr_unfrozen)
+		return nr_unfrozen;
+
+	sys_sync();
+	nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
+	if (nr_unfrozen)
+		return nr_unfrozen;
+
 	printk("done.\n");
 	BUG_ON(in_atomic());
 	return 0;
-- 
cgit v1.2.3


From 5b6d15de2d4c8149902a680a6cd1d3b26cd2e828 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:43 -0800
Subject: [PATCH] swsusp: Fix coding style in suspend.c

Fix coding style in suspend.c.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index fd8251d40eb8..712f1524d846 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -85,7 +85,8 @@ unsigned long get_safe_page(gfp_t gfp_mask)
 	return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
 }
 
-static struct page *alloc_image_page(gfp_t gfp_mask) {
+static struct page *alloc_image_page(gfp_t gfp_mask)
+{
 	struct page *page;
 
 	page = alloc_page(gfp_mask);
-- 
cgit v1.2.3


From 59a493350e7aefff7e262efa39e017517b31b8e8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:44 -0800
Subject: [PATCH] swsusp: Fix labels

Move all labels in the swsusp code to the second column, so that they won't
fool diff -p.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c     | 6 +++---
 kernel/power/snapshot.c | 8 ++++----
 kernel/power/swap.c     | 4 ++--
 kernel/power/swsusp.c   | 2 +-
 kernel/power/user.c     | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 08d9e7aac0f8..126dda61f45d 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -117,9 +117,9 @@ static int prepare_processes(void)
 		return 0;
 
 	platform_finish();
-thaw:
+ thaw:
 	thaw_processes();
-enable_cpus:
+ enable_cpus:
 	enable_nonboot_cpus();
 	pm_restore_console();
 	return error;
@@ -392,7 +392,7 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
 	noresume = 0;
 	software_resume();
 	ret = n;
-out:
+ out:
 	return ret;
 }
 
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 712f1524d846..c024606221c4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -411,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 	memory_bm_position_reset(bm);
 	return 0;
 
-Free:
+ Free:
 	bm->p_list = ca.chain;
 	memory_bm_free(bm, PG_UNSAFE_CLEAR);
 	return -ENOMEM;
@@ -557,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 	memory_bm_position_reset(bm);
 	return BM_END_OF_MAP;
 
-Return_pfn:
+ Return_pfn:
 	bm->cur.chunk = chunk;
 	bm->cur.bit = bit;
 	return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
@@ -964,7 +964,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 	}
 	return 0;
 
-Free:
+ Free:
 	swsusp_free();
 	return -ENOMEM;
 }
@@ -1540,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 	}
 	return 0;
 
-Free:
+ Free:
 	swsusp_free();
 	return error;
 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index dedf8797b723..f133d4a6d817 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -301,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 		handle->cur_swap = offset;
 		handle->k = 0;
 	}
-out:
+ out:
 	return error;
 }
 
@@ -429,7 +429,7 @@ int swsusp_write(void)
 	if (error)
 		free_all_swap_pages(root_swap, handle.bitmap);
 	release_swap_writer(&handle);
-out:
+ out:
 	swsusp_close();
 	return error;
 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index aa31432bbd97..31aa0390c777 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -288,7 +288,7 @@ int swsusp_suspend(void)
 	 * that suspended with irqs off ... no overall powerup.
 	 */
 	device_power_up();
-Enable_irqs:
+ Enable_irqs:
 	local_irq_enable();
 	return error;
 }
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 905dc269c3fc..069732e5695c 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -313,7 +313,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		if (pm_ops->finish)
 			pm_ops->finish(PM_SUSPEND_MEM);
 
-OutS3:
+ OutS3:
 		mutex_unlock(&pm_mutex);
 		break;
 
-- 
cgit v1.2.3


From 2d87595ea628ea58415ba4638c553a8c2fbd90e2 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:47 -0800
Subject: [PATCH] PM: Fix swsusp debug mode testproc

The 'testproc' swsusp debug mode thaws tasks twice in a row, which is _very_
confusing.  Fix that.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 126dda61f45d..6180379d5b84 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -151,7 +151,7 @@ int pm_suspend_disk(void)
 		return error;
 
 	if (pm_disk_mode == PM_DISK_TESTPROC)
-		goto Thaw;
+		return 0;
 
 	suspend_console();
 	error = device_suspend(PMSG_FREEZE);
-- 
cgit v1.2.3


From 5045cfc103566878228ca36d05a0ae0076673e5a Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Wed, 6 Dec 2006 20:34:48 -0800
Subject: [PATCH] swsusp: kill write-only variable

Cleanup write-only variable, suggested by D Binderman.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 6180379d5b84..0b00f56c2ad0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -58,12 +58,10 @@ static inline int platform_prepare(void)
 
 static void power_down(suspend_disk_method_t mode)
 {
-	int error = 0;
-
 	switch(mode) {
 	case PM_DISK_PLATFORM:
 		kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
-		error = pm_ops->enter(PM_SUSPEND_DISK);
+		pm_ops->enter(PM_SUSPEND_DISK);
 		break;
 	case PM_DISK_SHUTDOWN:
 		kernel_power_off();
-- 
cgit v1.2.3


From 341a595850dac1b0503df34260257d71b4fdf72c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Dec 2006 20:34:49 -0800
Subject: [PATCH] Support for freezeable workqueues

Make it possible to create a workqueue the worker thread of which will be
frozen during suspend, along with other kernel threads.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@suspend2.net>
Cc: David Chinner <dgc@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8d1e7cb8a51a..2945b094d871 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,7 @@
 #include <linux/kthread.h>
 #include <linux/hardirq.h>
 #include <linux/mempolicy.h>
+#include <linux/freezer.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -55,6 +56,8 @@ struct cpu_workqueue_struct {
 	struct task_struct *thread;
 
 	int run_depth;		/* Detect run_workqueue() recursion depth */
+
+	int freezeable;		/* Freeze the thread during suspend */
 } ____cacheline_aligned;
 
 /*
@@ -265,7 +268,8 @@ static int worker_thread(void *__cwq)
 	struct k_sigaction sa;
 	sigset_t blocked;
 
-	current->flags |= PF_NOFREEZE;
+	if (!cwq->freezeable)
+		current->flags |= PF_NOFREEZE;
 
 	set_user_nice(current, -5);
 
@@ -288,6 +292,9 @@ static int worker_thread(void *__cwq)
 
 	set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
+		if (cwq->freezeable)
+			try_to_freeze();
+
 		add_wait_queue(&cwq->more_work, &wait);
 		if (list_empty(&cwq->worklist))
 			schedule();
@@ -364,7 +371,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
 static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
-						   int cpu)
+						   int cpu, int freezeable)
 {
 	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 	struct task_struct *p;
@@ -374,6 +381,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
 	cwq->thread = NULL;
 	cwq->insert_sequence = 0;
 	cwq->remove_sequence = 0;
+	cwq->freezeable = freezeable;
 	INIT_LIST_HEAD(&cwq->worklist);
 	init_waitqueue_head(&cwq->more_work);
 	init_waitqueue_head(&cwq->work_done);
@@ -389,7 +397,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
 }
 
 struct workqueue_struct *__create_workqueue(const char *name,
-					    int singlethread)
+					    int singlethread, int freezeable)
 {
 	int cpu, destroy = 0;
 	struct workqueue_struct *wq;
@@ -409,7 +417,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	mutex_lock(&workqueue_mutex);
 	if (singlethread) {
 		INIT_LIST_HEAD(&wq->list);
-		p = create_workqueue_thread(wq, singlethread_cpu);
+		p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
 		if (!p)
 			destroy = 1;
 		else
@@ -417,7 +425,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	} else {
 		list_add(&wq->list, &workqueues);
 		for_each_online_cpu(cpu) {
-			p = create_workqueue_thread(wq, cpu);
+			p = create_workqueue_thread(wq, cpu, freezeable);
 			if (p) {
 				kthread_bind(p, cpu);
 				wake_up_process(p);
@@ -667,7 +675,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		mutex_lock(&workqueue_mutex);
 		/* Create a new workqueue thread for it. */
 		list_for_each_entry(wq, &workqueues, list) {
-			if (!create_workqueue_thread(wq, hotcpu)) {
+			if (!create_workqueue_thread(wq, hotcpu, 0)) {
 				printk("workqueue for %i failed\n", hotcpu);
 				return NOTIFY_BAD;
 			}
-- 
cgit v1.2.3


From ed07536ed6731775219c1df7fa26a7588753e693 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:35:24 -0800
Subject: [PATCH] lockdep: annotate nfs/nfsd in-kernel sockets

Stick NFS sockets in their own class to avoid some lockdep warnings.  NFS
sockets are never exposed to user-space, and will hence not trigger certain
code paths that would otherwise pose deadlock scenarios.

[akpm@osdl.org: cleanups]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Dickson <SteveD@redhat.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Acked-by: Neil Brown <neilb@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
[ Fixed patch corruption by quilt, pointed out by Peter Zijlstra ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c9fefdb1a7db..e33f6207f5b3 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2645,6 +2645,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
 	}
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
 
 static void print_held_locks_bug(struct task_struct *curr)
 {
-- 
cgit v1.2.3


From 07354a00901d103085e4376b7df0aad264c1836a Mon Sep 17 00:00:00 2001
From: "Adam B. Jerome" <abj@novell.com>
Date: Wed, 6 Dec 2006 20:35:30 -0800
Subject: [PATCH] /proc/kallsyms reports lower-case types for some non-exported
 symbols

This patch addresses incorrect symbol type information reported through
/proc/kallsyms.  A lowercase character should designate the symbol as local
(or non-exported).  An uppercase character should designate the symbol as
global (or external).

Without this patch, some non-exported symbols are incorrectly assigned an
upper-case designation in /proc/kallsyms.  This patch corrects this
condition by converting non-exported symbols types to lower case when
appropriate and eliminates the superfluous upcase_if_global function

Signed-off-by: Adam B. Jerome <abj@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kallsyms.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index eeac3e313b2b..54befe36ee0b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,7 @@
 #include <linux/proc_fs.h>
 #include <linux/sched.h>	/* for cond_resched */
 #include <linux/mm.h>
+#include <linux/ctype.h>
 
 #include <asm/sections.h>
 
@@ -301,13 +302,6 @@ struct kallsym_iter
 	char name[KSYM_NAME_LEN+1];
 };
 
-/* Only label it "global" if it is exported. */
-static void upcase_if_global(struct kallsym_iter *iter)
-{
-	if (is_exported(iter->name, iter->owner))
-		iter->type += 'A' - 'a';
-}
-
 static int get_ksymbol_mod(struct kallsym_iter *iter)
 {
 	iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
 	if (iter->owner == NULL)
 		return 0;
 
-	upcase_if_global(iter);
+	/* Label it "global" if it is exported, "local" if not exported. */
+	iter->type = is_exported(iter->name, iter->owner)
+		? toupper(iter->type) : tolower(iter->type);
+
 	return 1;
 }
 
-- 
cgit v1.2.3


From e59e2ae2c29700117a54e85c106017c24837119f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:35:59 -0800
Subject: [PATCH] SysRq-X: show blocked tasks

Add SysRq-X support: show blocked (TASK_UNINTERRUPTIBLE) tasks only.

Useful for debugging IO stalls.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 12fdbef1d9bf..1848e280504d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4804,7 +4804,7 @@ static void show_task(struct task_struct *p)
 		show_stack(p, NULL);
 }
 
-void show_state(void)
+void show_state_filter(unsigned long state_filter)
 {
 	struct task_struct *g, *p;
 
@@ -4824,11 +4824,16 @@ void show_state(void)
 		 * console might take alot of time:
 		 */
 		touch_nmi_watchdog();
-		show_task(p);
+		if (p->state & state_filter)
+			show_task(p);
 	} while_each_thread(g, p);
 
 	read_unlock(&tasklist_lock);
-	debug_show_all_locks();
+	/*
+	 * Only show locks if all tasks are dumped:
+	 */
+	if (state_filter == -1)
+		debug_show_all_locks();
 }
 
 /**
-- 
cgit v1.2.3


From 4cf303487d5dddaace2daca8437c555f3f0bc1aa Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 6 Dec 2006 20:36:06 -0800
Subject: [PATCH] Export pm_suspend for the shared APM emulation

The new shared APM emulation just like its ARM and MIPS predecessors uses
pm_suspend() which was only exported on SH.  Move export to close to it's
definition where it really should be anyway.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 751157b7897e..500eb87f643d 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
  *
  */
 
+#include <linux/module.h>
 #include <linux/suspend.h>
 #include <linux/kobject.h>
 #include <linux/string.h>
@@ -230,7 +231,7 @@ int pm_suspend(suspend_state_t state)
 	return -EINVAL;
 }
 
-
+EXPORT_SYMBOL(pm_suspend);
 
 decl_subsys(power,NULL,NULL);
 
-- 
cgit v1.2.3


From 696040670a12f66b17a839011f96d9ca376f688b Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 6 Dec 2006 20:36:15 -0800
Subject: [PATCH] cpuset: minor code refinements

A couple of minor code simplifications to the kernel/cpuset.c code.  No
functional change.  Just a little less code and a little more readable.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38c930e..bd1e89c4c96a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	}
 
 	/* Remaining checks don't apply to root cpuset */
-	if ((par = cur->parent) == NULL)
+	if (cur == &top_cpuset)
 		return 0;
 
+	par = cur->parent;
+
 	/* We must be a subset of our parent cpuset */
 	if (!is_cpuset_subset(trial, par))
 		return -EACCES;
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 	cpu_exclusive_changed =
 		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
 	mutex_lock(&callback_mutex);
-	if (turning_on)
-		set_bit(bit, &cs->flags);
-	else
-		clear_bit(bit, &cs->flags);
+	cs->flags = trialcs.flags;
 	mutex_unlock(&callback_mutex);
 
 	if (cpu_exclusive_changed)
-- 
cgit v1.2.3


From 910b1b2e6d7d10e1c3bffdd12a90ec82f535f9a5 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@o2.pl>
Date: Thu, 7 Dec 2006 10:45:25 +0100
Subject: [PATCH] lockdep: internal locking fixes

Here are mainly some lockdep returns with 0 with unlocking fixes.

Signed-off-by: Jarek Poplawski <jarkao2@o2.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e33f6207f5b3..f76a24f2ac20 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -237,8 +237,10 @@ e	   locks. */
 	trace->max_entries = trace->nr_entries;
 
 	nr_stack_trace_entries += trace->nr_entries;
-	if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
+	if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) {
+		__raw_spin_unlock(&hash_lock);
 		return 0;
+	}
 
 	if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
 		__raw_spin_unlock(&hash_lock);
@@ -474,7 +476,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 		return 0;
 
 	entry->class = this;
-	save_trace(&entry->trace);
+	if (!save_trace(&entry->trace))
+		return 0;
 
 	/*
 	 * Since we never remove from the dependency list, the list can
@@ -562,8 +565,12 @@ static noinline int print_circular_bug_tail(void)
 	if (debug_locks_silent)
 		return 0;
 
+	/* hash_lock unlocked by the header */
+	__raw_spin_lock(&hash_lock);
 	this.class = check_source->class;
-	save_trace(&this.trace);
+	if (!save_trace(&this.trace))
+		return 0;
+	__raw_spin_unlock(&hash_lock);
 	print_circular_bug_entry(&this, 0);
 
 	printk("\nother info that might help us debug this:\n\n");
@@ -966,14 +973,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 			       &prev->class->locks_after, next->acquire_ip);
 	if (!ret)
 		return 0;
-	/*
-	 * Return value of 2 signals 'dependency already added',
-	 * in that case we dont have to add the backlink either.
-	 */
-	if (ret == 2)
-		return 2;
+
 	ret = add_lock_to_list(next->class, prev->class,
 			       &next->class->locks_before, next->acquire_ip);
+	if (!ret)
+		return 0;
 
 	/*
 	 * Debugging printouts:
@@ -1025,7 +1029,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 * added:
 		 */
 		if (hlock->read != 2) {
-			check_prev_add(curr, hlock, next);
+			if (!check_prev_add(curr, hlock, next))
+				return 0;
 			/*
 			 * Stop after the first non-trylock entry,
 			 * as non-trylock entries have added their
-- 
cgit v1.2.3


From b23984d0a12a4821b2e9712c71550f321eb88bb5 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@o2.pl>
Date: Wed, 6 Dec 2006 20:36:23 -0800
Subject: [PATCH] lockdep: misc fixes in lockdep.c

 - numeric string size replaced with constant in print_lock_name and
   print_lockdep_cache,

 - return on null pointer in print_lock_dependencies,

 - one more lockdep return with 0 with unlocking fix in mark_lock.

Signed-off-by: Jarek Poplawski <jarkao2@o2.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f76a24f2ac20..300d61bb314b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -359,7 +359,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
 
 static void print_lock_name(struct lock_class *class)
 {
-	char str[128], c1, c2, c3, c4;
+	char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
 	const char *name;
 
 	get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -381,7 +381,7 @@ static void print_lock_name(struct lock_class *class)
 static void print_lockdep_cache(struct lockdep_map *lock)
 {
 	const char *name;
-	char str[128];
+	char str[KSYM_NAME_LEN + 1];
 
 	name = lock->name;
 	if (!name)
@@ -451,7 +451,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
 	print_lock_class_header(class, depth);
 
 	list_for_each_entry(entry, &class->locks_after, entry) {
-		DEBUG_LOCKS_WARN_ON(!entry->class);
+		if (DEBUG_LOCKS_WARN_ON(!entry->class))
+			return;
+
 		print_lock_dependencies(entry->class, depth + 1);
 
 		printk("%*s ... acquired at:\n",depth,"");
@@ -1733,6 +1735,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		debug_atomic_dec(&nr_unused_locks);
 		break;
 	default:
+		__raw_spin_unlock(&hash_lock);
 		debug_locks_off();
 		WARN_ON(1);
 		return 0;
-- 
cgit v1.2.3


From fec1d0115240593b39898289e6e1413ea6e44a84 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 6 Dec 2006 20:36:34 -0800
Subject: [PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit

The CLONE_CHILD_CLEARTID flag is used by NPTL to have its threads
communicate via memory/futex when they exit, so pthread_join can
synchronize using a simple futex wait.  The word of user memory where NPTL
stores a thread's own TID is what it passes; this gets reset to zero at
thread exit.

It is not desireable to touch this user memory when threads are dying due
to a fatal signal.  A core dump is more usefully representative of the
dying program state if the threads live at the time of the crash have their
NPTL data structures unperturbed.  The userland expectation of
CLONE_CHILD_CLEARTID has only ever been that it works for a thread making
an _exit system call.

This problem was identified by Ernie Petrides <petrides@redhat.com>.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Ernie Petrides <petrides@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 2cf74edd3295..f37980df1d58 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -448,7 +448,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->vfork_done = NULL;
 		complete(vfork_done);
 	}
-	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
+
+	/*
+	 * If we're exiting normally, clear a user-space tid field if
+	 * requested.  We leave this alone when dying by signal, to leave
+	 * the value intact in a core dump, and to save the unnecessary
+	 * trouble otherwise.  Userland only wants this done for a sys_exit.
+	 */
+	if (tsk->clear_child_tid
+	    && !(tsk->flags & PF_SIGNALED)
+	    && atomic_read(&mm->mm_users) > 1) {
 		u32 __user * tidptr = tsk->clear_child_tid;
 		tsk->clear_child_tid = NULL;
 
-- 
cgit v1.2.3


From 064b022c7adb2d853378078a9dc141f8288d1c73 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 6 Dec 2006 20:36:37 -0800
Subject: [PATCH] profile: fix uaccess handling

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/profile.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index f940b462eec9..15b012df4ff1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,7 +442,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	read = 0;
 
 	while (p < sizeof(unsigned int) && count > 0) {
-		put_user(*((char *)(&sample_step)+p),buf);
+		if (put_user(*((char *)(&sample_step)+p),buf))
+			return -EFAULT;
 		buf++; p++; count--; read++;
 	}
 	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
-- 
cgit v1.2.3


From 128fb95650b3273a8dc9ba5514b6fe7db8ea30bf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:50 -0800
Subject: [PATCH] taskstats_exit_alloc: optimize/simplify

If there are no listeners, every task does unneeded kmem_cache alloc/free on
exit. We don't need listeners->sem for 'if (!list_empty())' check. Yes, we may
have a false positive, but this doesn't differ from the case when the listener
is unregistered after we drop the semaphore. So we don't need to do allocation
beforehand.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Balbir Singh <balbir@in.ibm.com>
Acked-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f5f92014ae98..d9d7c3576238 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -416,7 +416,6 @@ err:
 void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
 {
 	struct listener_list *listeners;
-	struct taskstats *tmp;
 	/*
 	 * This is the cpu on which the task is exiting currently and will
 	 * be the one for which the exit event is sent, even if the cpu
@@ -424,19 +423,11 @@ void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
 	 */
 	*mycpu = raw_smp_processor_id();
 
-	*ptidstats = NULL;
-	tmp = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
-	if (!tmp)
-		return;
-
 	listeners = &per_cpu(listener_array, *mycpu);
-	down_read(&listeners->sem);
-	if (!list_empty(&listeners->list)) {
-		*ptidstats = tmp;
-		tmp = NULL;
-	}
-	up_read(&listeners->sem);
-	kfree(tmp);
+
+	*ptidstats = NULL;
+	if (!list_empty(&listeners->list))
+		*ptidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
 }
 
 /* Send pid data out on exit */
-- 
cgit v1.2.3


From 115085ea0794c0f339be8f9d25505c7f9861d824 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:51 -0800
Subject: [PATCH] taskstats: cleanup do_exit() path

do_exit:
	taskstats_exit_alloc()
	...
	taskstats_exit_send()
	taskstats_exit_free()

I think this is not good, let it be a single function exported to the core
kernel, taskstats_exit(), which does alloc + send + free itself.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c      |  8 ++------
 kernel/taskstats.c | 41 +++++++++++++++--------------------------
 2 files changed, 17 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 06de6c4e8ca3..4e3f919edc48 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -850,9 +850,7 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
-	struct taskstats *tidstats;
 	int group_dead;
-	unsigned int mycpu;
 
 	profile_task_exit(tsk);
 
@@ -890,8 +888,6 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	taskstats_exit_alloc(&tidstats, &mycpu);
-
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
 		update_hiwater_rss(tsk->mm);
@@ -911,8 +907,8 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
-	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
-	taskstats_exit_free(tidstats);
+
+	taskstats_exit(tsk, group_dead);
 
 	exit_mm(tsk);
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d9d7c3576238..2654886fe058 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -119,10 +119,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
 /*
  * Send taskstats data in @skb to listeners registered for @cpu's exit data
  */
-static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+static void send_cpu_listeners(struct sk_buff *skb,
+					struct listener_list *listeners)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
-	struct listener_list *listeners;
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
@@ -135,7 +135,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 	}
 
 	rc = 0;
-	listeners = &per_cpu(listener_array, cpu);
 	down_read(&listeners->sem);
 	list_for_each_entry(s, &listeners->list, list) {
 		skb_next = NULL;
@@ -413,28 +412,12 @@ err:
 	return rc;
 }
 
-void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
-{
-	struct listener_list *listeners;
-	/*
-	 * This is the cpu on which the task is exiting currently and will
-	 * be the one for which the exit event is sent, even if the cpu
-	 * on which this function is running changes later.
-	 */
-	*mycpu = raw_smp_processor_id();
-
-	listeners = &per_cpu(listener_array, *mycpu);
-
-	*ptidstats = NULL;
-	if (!list_empty(&listeners->list))
-		*ptidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
-}
-
 /* Send pid data out on exit */
-void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
-			int group_dead, unsigned int mycpu)
+void taskstats_exit(struct task_struct *tsk, int group_dead)
 {
 	int rc;
+	struct listener_list *listeners;
+	struct taskstats *tidstats;
 	struct sk_buff *rep_skb;
 	void *reply;
 	size_t size;
@@ -458,12 +441,17 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 		fill_tgid_exit(tsk);
 	}
 
+	listeners = &__raw_get_cpu_var(listener_array);
+	if (list_empty(&listeners->list))
+		return;
+
+	tidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
 	if (!tidstats)
 		return;
 
 	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
 	if (rc < 0)
-		goto ret;
+		goto free_stats;
 
 	rc = fill_pid(tsk->pid, tsk, tidstats);
 	if (rc < 0)
@@ -492,15 +480,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	nla_nest_end(rep_skb, na);
 
 send:
-	send_cpu_listeners(rep_skb, mycpu);
+	send_cpu_listeners(rep_skb, listeners);
+free_stats:
+	kmem_cache_free(taskstats_cache, tidstats);
 	return;
 
 nla_put_failure:
 	genlmsg_cancel(rep_skb, reply);
 err_skb:
 	nlmsg_free(rep_skb);
-ret:
-	return;
+	goto free_stats;
 }
 
 static struct genl_ops taskstats_ops = {
-- 
cgit v1.2.3


From 34ec12349c8a9505adc59d72f92b4595bc2483ff Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:52 -0800
Subject: [PATCH] taskstats: cleanup ->signal->stats allocation

Allocate ->signal->stats on demand in taskstats_exit(), this allows us to
remove taskstats_tgid_alloc() (the last non-trivial inline) from taskstat's
public interface.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c      |  1 -
 kernel/taskstats.c | 26 +++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index f37980df1d58..658838148647 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -847,7 +847,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
-		taskstats_tgid_alloc(current);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2654886fe058..7d793d6b1e90 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -412,6 +412,30 @@ err:
 	return rc;
 }
 
+static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	struct taskstats *stats;
+
+	if (sig->stats || thread_group_empty(tsk))
+		goto ret;
+
+	/* No problem if kmem_cache_zalloc() fails */
+	stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!sig->stats) {
+		sig->stats = stats;
+		stats = NULL;
+	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+
+	if (stats)
+		kmem_cache_free(taskstats_cache, stats);
+ret:
+	return sig->stats;
+}
+
 /* Send pid data out on exit */
 void taskstats_exit(struct task_struct *tsk, int group_dead)
 {
@@ -433,7 +457,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
-	is_thread_group = (tsk->signal->stats != NULL);
+	is_thread_group = !!taskstats_tgid_alloc(tsk);
 	if (is_thread_group) {
 		/* PID + STATS + TGID + STATS */
 		size = 2 * size;
-- 
cgit v1.2.3


From 68062b86fc0f480b806d270a8278709a5a41bb67 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:53 -0800
Subject: [PATCH] taskstats: factor out reply assembling

Introduce mk_reply() helper which does all nla_put()s on reply.

Saves 453 bytes and a preparation for the next patch.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 55 +++++++++++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 7d793d6b1e90..d2a41339f37b 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -343,6 +343,25 @@ static int parse(struct nlattr *na, cpumask_t *mask)
 	return ret;
 }
 
+static int mk_reply(struct sk_buff *skb, int type, u32 pid, struct taskstats *stats)
+{
+	struct nlattr *na;
+	int aggr;
+
+	aggr = TASKSTATS_TYPE_AGGR_TGID;
+	if (type == TASKSTATS_TYPE_PID)
+		aggr = TASKSTATS_TYPE_AGGR_PID;
+
+	na = nla_nest_start(skb, aggr);
+	NLA_PUT_U32(skb, type, pid);
+	NLA_PUT_TYPE(skb, struct taskstats, TASKSTATS_TYPE_STATS, *stats);
+	nla_nest_end(skb, na);
+
+	return 0;
+nla_put_failure:
+	return -1;
+}
+
 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
@@ -350,7 +369,6 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	struct taskstats stats;
 	void *reply;
 	size_t size;
-	struct nlattr *na;
 	cpumask_t mask;
 
 	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
@@ -382,27 +400,21 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 		if (rc < 0)
 			goto err;
 
-		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
-		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
-		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-				stats);
+		if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid, &stats))
+			goto nla_put_failure;
 	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
 		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
 		rc = fill_tgid(tgid, NULL, &stats);
 		if (rc < 0)
 			goto err;
 
-		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
-		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
-		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-				stats);
+		if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid, &stats))
+			goto nla_put_failure;
 	} else {
 		rc = -EINVAL;
 		goto err;
 	}
 
-	nla_nest_end(rep_skb, na);
-
 	return send_reply(rep_skb, info->snd_pid);
 
 nla_put_failure:
@@ -446,7 +458,6 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	void *reply;
 	size_t size;
 	int is_thread_group;
-	struct nlattr *na;
 
 	if (!family_registered)
 		return;
@@ -481,27 +492,17 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	if (rc < 0)
 		goto err_skb;
 
-	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
-	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
-	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-			*tidstats);
-	nla_nest_end(rep_skb, na);
-
-	if (!is_thread_group)
-		goto send;
+	if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid, tidstats))
+		goto nla_put_failure;
 
 	/*
 	 * Doesn't matter if tsk is the leader or the last group member leaving
 	 */
-	if (!group_dead)
+	if (!is_thread_group || !group_dead)
 		goto send;
 
-	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
-	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
-	/* No locking needed for tsk->signal->stats since group is dead */
-	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-			*tsk->signal->stats);
-	nla_nest_end(rep_skb, na);
+	if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid, tsk->signal->stats))
+		goto nla_put_failure;
 
 send:
 	send_cpu_listeners(rep_skb, listeners);
-- 
cgit v1.2.3


From 51de4d90852ba4cfa5743594ec4a7f158b52dc43 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:54 -0800
Subject: [PATCH] taskstats: use nla_reserve() for reply assembling

Currently taskstats_user_cmd()/taskstats_exit() do:

	1) allocate stats
	2) fill stats
	3) make a temporary copy on stack (236 bytes)
	4) copy that copy to skb
	5) free stats

With the help of nla_reserve() we can operate on skb->data directly,
thus avoiding all these steps except 2).

So, before this patch:

	// copy *stats to skb->data
	int mk_reply(skb, ..., struct taskstats *stats);

	fill_pid(stats);
	mk_reply(skb, ..., stats);

After:
	// return a pointer to skb->data
	struct taskstats *mk_reply(skb, ...);

	stat = mk_reply(skb, ...);
	fill_pid(stats);

Shrinks taskatsks.o by 162 bytes.

A stupid benchmark (send one million TASKSTATS_CMD_ATTR_PID) shows the

		real user sys
	before:
		4.02 0.06 3.96
		4.02 0.04 3.98
		4.02 0.04 3.97
	after:
		3.86 0.08 3.78
		3.88 0.10 3.77
		3.89 0.09 3.80

but this looks suspiciously good.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 86 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 44 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d2a41339f37b..b0aad99c4f8d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -185,6 +185,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
 	} else
 		get_task_struct(tsk);
 
+	memset(stats, 0, sizeof(*stats));
 	/*
 	 * Each accounting subsystem adds calls to its functions to
 	 * fill in relevant parts of struct taskstsats as follows
@@ -227,6 +228,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
 
 	if (first->signal->stats)
 		memcpy(stats, first->signal->stats, sizeof(*stats));
+	else
+		memset(stats, 0, sizeof(*stats));
 
 	tsk = first;
 	do {
@@ -343,9 +346,9 @@ static int parse(struct nlattr *na, cpumask_t *mask)
 	return ret;
 }
 
-static int mk_reply(struct sk_buff *skb, int type, u32 pid, struct taskstats *stats)
+static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 {
-	struct nlattr *na;
+	struct nlattr *na, *ret;
 	int aggr;
 
 	aggr = TASKSTATS_TYPE_AGGR_TGID;
@@ -353,20 +356,23 @@ static int mk_reply(struct sk_buff *skb, int type, u32 pid, struct taskstats *st
 		aggr = TASKSTATS_TYPE_AGGR_PID;
 
 	na = nla_nest_start(skb, aggr);
-	NLA_PUT_U32(skb, type, pid);
-	NLA_PUT_TYPE(skb, struct taskstats, TASKSTATS_TYPE_STATS, *stats);
+	if (nla_put(skb, type, sizeof(pid), &pid) < 0)
+		goto err;
+	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
+	if (!ret)
+		goto err;
 	nla_nest_end(skb, na);
 
-	return 0;
-nla_put_failure:
-	return -1;
+	return nla_data(ret);
+err:
+	return NULL;
 }
 
 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
 	struct sk_buff *rep_skb;
-	struct taskstats stats;
+	struct taskstats *stats;
 	void *reply;
 	size_t size;
 	cpumask_t mask;
@@ -389,36 +395,36 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
-	memset(&stats, 0, sizeof(stats));
 	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
 	if (rc < 0)
 		return rc;
 
+	rc = -EINVAL;
 	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
 		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
-		rc = fill_pid(pid, NULL, &stats);
-		if (rc < 0)
-			goto err;
+		stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
+		if (!stats)
+			goto nla_err;
 
-		if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid, &stats))
-			goto nla_put_failure;
+		rc = fill_pid(pid, NULL, stats);
+		if (rc < 0)
+			goto nla_err;
 	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
 		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
-		rc = fill_tgid(tgid, NULL, &stats);
-		if (rc < 0)
-			goto err;
+		stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+		if (!stats)
+			goto nla_err;
 
-		if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid, &stats))
-			goto nla_put_failure;
-	} else {
-		rc = -EINVAL;
+		rc = fill_tgid(tgid, NULL, stats);
+		if (rc < 0)
+			goto nla_err;
+	} else
 		goto err;
-	}
 
 	return send_reply(rep_skb, info->snd_pid);
 
-nla_put_failure:
-	rc = genlmsg_cancel(rep_skb, reply);
+nla_err:
+	genlmsg_cancel(rep_skb, reply);
 err:
 	nlmsg_free(rep_skb);
 	return rc;
@@ -453,7 +459,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 {
 	int rc;
 	struct listener_list *listeners;
-	struct taskstats *tidstats;
+	struct taskstats *stats;
 	struct sk_buff *rep_skb;
 	void *reply;
 	size_t size;
@@ -480,20 +486,17 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	if (list_empty(&listeners->list))
 		return;
 
-	tidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
-	if (!tidstats)
-		return;
-
 	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
 	if (rc < 0)
-		goto free_stats;
+		return;
 
-	rc = fill_pid(tsk->pid, tsk, tidstats);
-	if (rc < 0)
-		goto err_skb;
+	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
+	if (!stats)
+		goto nla_err;
 
-	if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid, tidstats))
-		goto nla_put_failure;
+	rc = fill_pid(tsk->pid, tsk, stats);
+	if (rc < 0)
+		goto nla_err;
 
 	/*
 	 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -501,20 +504,19 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	if (!is_thread_group || !group_dead)
 		goto send;
 
-	if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid, tsk->signal->stats))
-		goto nla_put_failure;
+	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
+	if (!stats)
+		goto nla_err;
+
+	memcpy(stats, tsk->signal->stats, sizeof(*stats));
 
 send:
 	send_cpu_listeners(rep_skb, listeners);
-free_stats:
-	kmem_cache_free(taskstats_cache, tidstats);
 	return;
 
-nla_put_failure:
+nla_err:
 	genlmsg_cancel(rep_skb, reply);
-err_skb:
 	nlmsg_free(rep_skb);
-	goto free_stats;
 }
 
 static struct genl_ops taskstats_ops = {
-- 
cgit v1.2.3


From 37167485302c8876cb0303af113696e88c2945aa Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 6 Dec 2006 20:36:55 -0800
Subject: [PATCH] taskstats: cleanup reply assembling

Thomas Graf wrote:
>
> nla_nest_start() may return NULL, either rely on prepare_reply() to be
> correct and BUG() on failure or do proper error handling for all
> functions.

nla_put() in taskstat.c can fail only if the 'size' argument of alloc_skb()
was not right. This is a kernel bug, we should not hide it. So add 'BUG()'
on error path and check for 'na == NULL'.

> genlmsg_cancel() is only required in error paths for dumping
> procedures.

So we can remove 'genlmsg_cancel()' calls and 'void *reply' (saves 227 bytes).

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 38 ++++++++++++++++----------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b0aad99c4f8d..4c3476fa058d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -69,7 +69,7 @@ enum actions {
 };
 
 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
-			void **replyp, size_t size)
+				size_t size)
 {
 	struct sk_buff *skb;
 	void *reply;
@@ -94,7 +94,6 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	}
 
 	*skbp = skb;
-	*replyp = reply;
 	return 0;
 }
 
@@ -351,11 +350,13 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 	struct nlattr *na, *ret;
 	int aggr;
 
-	aggr = TASKSTATS_TYPE_AGGR_TGID;
-	if (type == TASKSTATS_TYPE_PID)
-		aggr = TASKSTATS_TYPE_AGGR_PID;
+	aggr = (type == TASKSTATS_TYPE_PID)
+			? TASKSTATS_TYPE_AGGR_PID
+			: TASKSTATS_TYPE_AGGR_TGID;
 
 	na = nla_nest_start(skb, aggr);
+	if (!na)
+		goto err;
 	if (nla_put(skb, type, sizeof(pid), &pid) < 0)
 		goto err;
 	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
@@ -373,7 +374,6 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	int rc = 0;
 	struct sk_buff *rep_skb;
 	struct taskstats *stats;
-	void *reply;
 	size_t size;
 	cpumask_t mask;
 
@@ -395,7 +395,7 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
-	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
 	if (rc < 0)
 		return rc;
 
@@ -404,27 +404,24 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
 		stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
 		if (!stats)
-			goto nla_err;
+			goto err;
 
 		rc = fill_pid(pid, NULL, stats);
 		if (rc < 0)
-			goto nla_err;
+			goto err;
 	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
 		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
 		stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
 		if (!stats)
-			goto nla_err;
+			goto err;
 
 		rc = fill_tgid(tgid, NULL, stats);
 		if (rc < 0)
-			goto nla_err;
+			goto err;
 	} else
 		goto err;
 
 	return send_reply(rep_skb, info->snd_pid);
-
-nla_err:
-	genlmsg_cancel(rep_skb, reply);
 err:
 	nlmsg_free(rep_skb);
 	return rc;
@@ -461,7 +458,6 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	struct listener_list *listeners;
 	struct taskstats *stats;
 	struct sk_buff *rep_skb;
-	void *reply;
 	size_t size;
 	int is_thread_group;
 
@@ -486,17 +482,17 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	if (list_empty(&listeners->list))
 		return;
 
-	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
 	if (rc < 0)
 		return;
 
 	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
 	if (!stats)
-		goto nla_err;
+		goto err;
 
 	rc = fill_pid(tsk->pid, tsk, stats);
 	if (rc < 0)
-		goto nla_err;
+		goto err;
 
 	/*
 	 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -506,16 +502,14 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 
 	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
 	if (!stats)
-		goto nla_err;
+		goto err;
 
 	memcpy(stats, tsk->signal->stats, sizeof(*stats));
 
 send:
 	send_cpu_listeners(rep_skb, listeners);
 	return;
-
-nla_err:
-	genlmsg_cancel(rep_skb, reply);
+err:
 	nlmsg_free(rep_skb);
 }
 
-- 
cgit v1.2.3


From a4c410f00f7ca4bd448b0d63f6f882fd244dc991 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:37:21 -0800
Subject: [PATCH] lockdep: print current locks on in_atomic warnings

Add debug_show_held_locks(current) to __might_sleep() and schedule(); this
makes finding the offending lock leak easier.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1848e280504d..343e1794233e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3333,6 +3333,7 @@ asmlinkage void __sched schedule(void)
 		printk(KERN_ERR "BUG: scheduling while atomic: "
 			"%s/0x%08x/%d\n",
 			current->comm, preempt_count(), current->pid);
+		debug_show_held_locks(current);
 		dump_stack();
 	}
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -6872,6 +6873,7 @@ void __might_sleep(char *file, int line)
 				" context at %s:%d\n", file, line);
 		printk("in_atomic():%d, irqs_disabled():%d\n",
 			in_atomic(), irqs_disabled());
+		debug_show_held_locks(current);
 		dump_stack();
 	}
 #endif
-- 
cgit v1.2.3


From 6cfd76a26d9fe2ba54b9d496a48c1d9285e5c5ed Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:37:22 -0800
Subject: [PATCH] lockdep: name some old style locks

Name some of the remaning 'old_style_spin_init' locks

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c       | 3 ++-
 kernel/irq/handle.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca36a81..dc12db8600e7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,7 +89,8 @@ struct acct_glbs {
 	struct timer_list	timer;
 };
 
-static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
+static struct acct_glbs acct_globals __cacheline_aligned =
+	{__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
 
 /*
  * Called whenever the timer says to check the free space.
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a681912bc89a..aff1f0fabb0d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
 		.chip = &no_irq_chip,
 		.handle_irq = handle_bad_irq,
 		.depth = 1,
-		.lock = SPIN_LOCK_UNLOCKED,
+		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
 #ifdef CONFIG_SMP
 		.affinity = CPU_MASK_ALL
 #endif
-- 
cgit v1.2.3


From ece8a684c75df215320b4155944979e3f78c5c93 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:37:24 -0800
Subject: [PATCH] sleep profiling

Implement prof=sleep profiling.  TASK_UNINTERRUPTIBLE sleeps will be taken
as a profile hit, and every millisecond spent sleeping causes a profile-hit
for the call site that initiated the sleep.

Sample readprofile output on i386:

   306 ps2_sendbyte                               1.3973
   432 call_usermodehelper_keys                   1.9548
   484 ps2_command                                0.6453
   790 __driver_attach                            4.7879
  1593 msleep                                    44.2500
  3976 sync_buffer                               64.1290
  4076 do_lookup                                 12.4648
  8587 sync_page                                122.6714
 20820 total                                      0.0067

(NOTE: architectures need to check whether get_wchan() can be called from
deep within the wakeup path.)

akpm: we need to mark more functions __sched.  lock_sock(), msleep(), others..

akpm: the contention in do_lookup() is a surprise.  Presumably doing disk
reads for directory contents while holding i_mutex.

[akpm@osdl.org: various fixes]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/profile.c | 39 ++++++++++++++++++++++++++++++---------
 kernel/sched.c   | 11 +++++++++++
 2 files changed, 41 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index 15b012df4ff1..04fd84e8cdbe 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
 
 static atomic_t *prof_buffer;
 static unsigned long prof_len, prof_shift;
-static int prof_on __read_mostly;
+int prof_on __read_mostly;
 static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
 static int __init profile_setup(char * str)
 {
 	static char __initdata schedstr[] = "schedule";
+	static char __initdata sleepstr[] = "sleep";
 	int par;
 
-	if (!strncmp(str, schedstr, strlen(schedstr))) {
+	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
+		prof_on = SLEEP_PROFILING;
+		if (str[strlen(sleepstr)] == ',')
+			str += strlen(sleepstr) + 1;
+		if (get_option(&str, &par))
+			prof_shift = par;
+		printk(KERN_INFO
+			"kernel sleep profiling enabled (shift: %ld)\n",
+			prof_shift);
+	} else if (!strncmp(str, sleepstr, strlen(sleepstr))) {
 		prof_on = SCHED_PROFILING;
 		if (str[strlen(schedstr)] == ',')
 			str += strlen(schedstr) + 1;
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
  * positions to which hits are accounted during short intervals (e.g.
  * several seconds) is usually very small. Exclusion from buffer
  * flipping is provided by interrupt disablement (note that for
- * SCHED_PROFILING profile_hit() may be called from process context).
+ * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
+ * process context).
  * The hash function is meant to be lightweight as opposed to strong,
  * and was vaguely inspired by ppc64 firmware-supported inverted
  * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
 	mutex_unlock(&profile_flip_mutex);
 }
 
-void profile_hit(int type, void *__pc)
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
 	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
 	int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
 		put_cpu();
 		return;
 	}
+	/*
+	 * We buffer the global profiler buffer into a per-CPU
+	 * queue and thus reduce the number of global (and possibly
+	 * NUMA-alien) accesses. The write-queue is self-coalescing:
+	 */
 	local_irq_save(flags);
 	do {
 		for (j = 0; j < PROFILE_GRPSZ; ++j) {
 			if (hits[i + j].pc == pc) {
-				hits[i + j].hits++;
+				hits[i + j].hits += nr_hits;
 				goto out;
 			} else if (!hits[i + j].hits) {
 				hits[i + j].pc = pc;
-				hits[i + j].hits = 1;
+				hits[i + j].hits = nr_hits;
 				goto out;
 			}
 		}
 		i = (i + secondary) & (NR_PROFILE_HIT - 1);
 	} while (i != primary);
-	atomic_inc(&prof_buffer[pc]);
+
+	/*
+	 * Add the current hit(s) and flush the write-queue out
+	 * to the global buffer:
+	 */
+	atomic_add(nr_hits, &prof_buffer[pc]);
 	for (i = 0; i < NR_PROFILE_HIT; ++i) {
 		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
 		hits[i].pc = hits[i].hits = 0;
@@ -356,14 +377,14 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 #define profile_flip_buffers()		do { } while (0)
 #define profile_discard_flip_buffers()	do { } while (0)
 
-void profile_hit(int type, void *__pc)
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
 	unsigned long pc;
 
 	if (prof_on != type || !prof_buffer)
 		return;
 	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
-	atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);
+	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
 }
 #endif /* !CONFIG_SMP */
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 343e1794233e..75a005ed4eda 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 	}
 #endif
 
+	/*
+	 * Sleep time is in units of nanosecs, so shift by 20 to get a
+	 * milliseconds-range estimation of the amount of time that the task
+	 * spent sleeping:
+	 */
+	if (unlikely(prof_on == SLEEP_PROFILING)) {
+		if (p->state == TASK_UNINTERRUPTIBLE)
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+				     (now - p->timestamp) >> 20);
+	}
+
 	if (!rt_task(p))
 		p->prio = recalc_task_prio(p, now);
 
-- 
cgit v1.2.3


From d5abe669172f20a4129a711de0f250a4e07db298 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 6 Dec 2006 20:37:26 -0800
Subject: [PATCH] debug: workqueue locking sanity

Workqueue functions should not leak locks, assert so, printing the
last function ran.

Use macros in lockdep.h to avoid include dependency pains.

[akpm@osdl.org: build fix]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2945b094d871..5484d6e045c2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -30,6 +30,8 @@
 #include <linux/hardirq.h>
 #include <linux/mempolicy.h>
 #include <linux/freezer.h>
+#include <linux/kallsyms.h>
+#include <linux/debug_locks.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -253,6 +255,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 			work_release(work);
 		f(work);
 
+		if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
+			printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
+					"%s/0x%08x/%d\n",
+					current->comm, preempt_count(),
+				       	current->pid);
+			printk(KERN_ERR "    last function: ");
+			print_symbol("%s\n", (unsigned long)f);
+			debug_show_held_locks(current);
+			dump_stack();
+		}
+
 		spin_lock_irqsave(&cwq->lock, flags);
 		cwq->remove_sequence++;
 		wake_up(&cwq->work_done);
-- 
cgit v1.2.3


From 40fcfc87222e2e8af6379ec366f0cb2a411570cd Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Wed, 6 Dec 2006 20:37:27 -0800
Subject: [PATCH] HZ: 300Hz support

Fix two things.  Firstly the unit is "Hz" not "HZ".  Secondly it is useful
to have 300Hz support when doing multimedia work.  250 is fine for us in
Europe but the US frame rate is 30fps (29.99 blah for pedants).  300 gives
us a tick divisible by both 25 and 30, and for interlace work 50 and 60.
It's also giving similar performance to 250Hz.

I'd argue we should remove 250 and add 300, but that might be excess
disruption for now.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/Kconfig.hz | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c396f8b..4af15802ccd4 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -7,7 +7,7 @@ choice
 	default HZ_250
 	help
 	 Allows the configuration of the timer frequency. It is customary
-	 to have the timer interrupt run at 1000 HZ but 100 HZ may be more
+	 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
 	 beneficial for servers and NUMA systems that do not need to have
 	 a fast response for user interaction and that may experience bus
 	 contention and cacheline bounces as a result of timer interrupts.
@@ -19,21 +19,30 @@ choice
 	config HZ_100
 		bool "100 HZ"
 	help
-	  100 HZ is a typical choice for servers, SMP and NUMA systems
+	  100 Hz is a typical choice for servers, SMP and NUMA systems
 	  with lots of processors that may show reduced performance if
 	  too many timer interrupts are occurring.
 
 	config HZ_250
 		bool "250 HZ"
 	help
-	 250 HZ is a good compromise choice allowing server performance
+	 250 Hz is a good compromise choice allowing server performance
 	 while also showing good interactive responsiveness even
-	 on SMP and NUMA systems.
+	 on SMP and NUMA systems. If you are going to be using NTSC video
+	 or multimedia, selected 300Hz instead.
+
+	config HZ_300
+		bool "300 HZ"
+	help
+	 300 Hz is a good compromise choice allowing server performance
+	 while also showing good interactive responsiveness even
+	 on SMP and NUMA systems and exactly dividing by both PAL and
+	 NTSC frame rates for video and multimedia work.
 
 	config HZ_1000
 		bool "1000 HZ"
 	help
-	 1000 HZ is the preferred choice for desktop systems and other
+	 1000 Hz is the preferred choice for desktop systems and other
 	 systems requiring fast interactive responses to events.
 
 endchoice
@@ -42,5 +51,6 @@ config HZ
 	int
 	default 100 if HZ_100
 	default 250 if HZ_250
+	default 300 if HZ_300
 	default 1000 if HZ_1000
 
-- 
cgit v1.2.3


From c36264dfb2d6fa6383082de0a1bba8e12b477da1 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 6 Dec 2006 20:37:42 -0800
Subject: [PATCH] remove the syslog interface when printk is disabled

Attempts to read() from the non-existent dmesg buffer will return zero and
userspace tends to get stuck in a busyloop.

So just remove /dev/kmsg altogether if CONFIG_PRINTK=n.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 66426552fbfe..ba59c2a30ed0 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -631,12 +631,7 @@ EXPORT_SYMBOL(vprintk);
 
 asmlinkage long sys_syslog(int type, char __user *buf, int len)
 {
-	return 0;
-}
-
-int do_syslog(int type, char __user *buf, int len)
-{
-	return 0;
+	return -ENOSYS;
 }
 
 static void call_console_drivers(unsigned long start, unsigned long end)
-- 
cgit v1.2.3


From b4c6c34a530b4d1c626f4ac0a884e0a9b849378c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Wed, 6 Dec 2006 20:38:11 -0800
Subject: [PATCH] kprobes: enable booster on the preemptible kernel

When we are unregistering a kprobe-booster, we can't release its
instruction buffer immediately on the preemptive kernel, because some
processes might be preempted on the buffer.  The freeze_processes() and
thaw_processes() functions can clean most of processes up from the buffer.
There are still some non-frozen threads who have the PF_NOFREEZE flag.  If
those threads are sleeping (not preempted) at the known place outside the
buffer, we can ensure safety of freeing.

However, the processing of this check routine takes a long time.  So, this
patch introduces the garbage collection mechanism of insn_slot.  It also
introduces the "dirty" flag to free_insn_slot because of efficiency.

The "clean" instruction slots (dirty flag is cleared) are released
immediately.  But the "dirty" slots which are used by boosted kprobes, are
marked as garbages.  collect_garbage_slots() will be invoked to release
"dirty" slots if there are more than INSNS_PER_PAGE garbage slots or if
there are no unused slots.

Cc: "Keshavamurthy, Anil S" <anil.s.keshavamurthy@intel.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: "bibo,mao" <bibo.mao@intel.com>
Cc: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Cc: Yumiko Sugita <yumiko.sugita.yf@hitachi.com>
Cc: Satoshi Oshima <soshima@redhat.com>
Cc: Hideo Aoki <haoki@redhat.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 117 +++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 96 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837ad9e0..17ec4afb0994 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
 #include <linux/module.h>
 #include <linux/moduleloader.h>
 #include <linux/kallsyms.h>
+#include <linux/freezer.h>
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
 	kprobe_opcode_t *insns;		/* Page of instruction slots */
 	char slot_used[INSNS_PER_PAGE];
 	int nused;
+	int ngarbage;
 };
 
 static struct hlist_head kprobe_insn_pages;
+static int kprobe_garbage_slots;
+static int collect_garbage_slots(void);
+
+static int __kprobes check_safety(void)
+{
+	int ret = 0;
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
+	ret = freeze_processes();
+	if (ret == 0) {
+		struct task_struct *p, *q;
+		do_each_thread(p, q) {
+			if (p != current && p->state == TASK_RUNNING &&
+			    p->pid != 0) {
+				printk("Check failed: %s is running\n",p->comm);
+				ret = -1;
+				goto loop_end;
+			}
+		} while_each_thread(p, q);
+	}
+loop_end:
+	thaw_processes();
+#else
+	synchronize_sched();
+#endif
+	return ret;
+}
 
 /**
  * get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
 
+      retry:
 	hlist_for_each(pos, &kprobe_insn_pages) {
 		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
 		if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 		}
 	}
 
-	/* All out of space.  Need to allocate a new page. Use slot 0.*/
+	/* If there are any garbage slots, collect it and try again. */
+	if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
+		goto retry;
+	}
+	/* All out of space.  Need to allocate a new page. Use slot 0. */
 	kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
 	if (!kip) {
 		return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	memset(kip->slot_used, 0, INSNS_PER_PAGE);
 	kip->slot_used[0] = 1;
 	kip->nused = 1;
+	kip->ngarbage = 0;
 	return kip->insns;
 }
 
-void __kprobes free_insn_slot(kprobe_opcode_t *slot)
+/* Return 1 if all garbages are collected, otherwise 0. */
+static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+{
+	kip->slot_used[idx] = 0;
+	kip->nused--;
+	if (kip->nused == 0) {
+		/*
+		 * Page is no longer in use.  Free it unless
+		 * it's the last one.  We keep the last one
+		 * so as not to have to set it up again the
+		 * next time somebody inserts a probe.
+		 */
+		hlist_del(&kip->hlist);
+		if (hlist_empty(&kprobe_insn_pages)) {
+			INIT_HLIST_NODE(&kip->hlist);
+			hlist_add_head(&kip->hlist,
+				       &kprobe_insn_pages);
+		} else {
+			module_free(NULL, kip->insns);
+			kfree(kip);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static int __kprobes collect_garbage_slots(void)
+{
+	struct kprobe_insn_page *kip;
+	struct hlist_node *pos, *next;
+
+	/* Ensure no-one is preepmted on the garbages */
+	if (check_safety() != 0)
+		return -EAGAIN;
+
+	hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
+		int i;
+		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+		if (kip->ngarbage == 0)
+			continue;
+		kip->ngarbage = 0;	/* we will collect all garbages */
+		for (i = 0; i < INSNS_PER_PAGE; i++) {
+			if (kip->slot_used[i] == -1 &&
+			    collect_one_slot(kip, i))
+				break;
+		}
+	}
+	kprobe_garbage_slots = 0;
+	return 0;
+}
+
+void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 		if (kip->insns <= slot &&
 		    slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
 			int i = (slot - kip->insns) / MAX_INSN_SIZE;
-			kip->slot_used[i] = 0;
-			kip->nused--;
-			if (kip->nused == 0) {
-				/*
-				 * Page is no longer in use.  Free it unless
-				 * it's the last one.  We keep the last one
-				 * so as not to have to set it up again the
-				 * next time somebody inserts a probe.
-				 */
-				hlist_del(&kip->hlist);
-				if (hlist_empty(&kprobe_insn_pages)) {
-					INIT_HLIST_NODE(&kip->hlist);
-					hlist_add_head(&kip->hlist,
-						&kprobe_insn_pages);
-				} else {
-					module_free(NULL, kip->insns);
-					kfree(kip);
-				}
+			if (dirty) {
+				kip->slot_used[i] = -1;
+				kip->ngarbage++;
+			} else {
+				collect_one_slot(kip, i);
 			}
-			return;
+			break;
 		}
 	}
+	if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
+		collect_garbage_slots();
+	}
 }
 #endif
 
-- 
cgit v1.2.3


From 02316067852187b8bec781bec07410e91af79627 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:38:17 -0800
Subject: [PATCH] hotplug CPU: clean up hotcpu_notifier() use

There was lots of #ifdef noise in the kernel due to hotcpu_notifier(fn,
prio) not correctly marking 'fn' as used in the !HOTPLUG_CPU case, and thus
generating compiler warnings of unused symbols, hence forcing people to add
#ifdefs.

the compiler can skip truly unused functions just fine:

    text    data     bss     dec     hex filename
 1624412  728710 3674856 6027978  5bfaca vmlinux.before
 1624412  728710 3674856 6027978  5bfaca vmlinux.after

[akpm@osdl.org: topology.c fix]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c    | 4 ----
 kernel/profile.c   | 3 +--
 kernel/sched.c     | 3 ---
 kernel/workqueue.c | 2 --
 4 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index bd1e89c4c96a..9b62b4c03ad0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2044,7 +2044,6 @@ out:
 	return err;
 }
 
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 /*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2108,9 +2107,7 @@ static void common_cpu_mem_hotplug_unplug(void)
 	mutex_unlock(&callback_mutex);
 	mutex_unlock(&manage_mutex);
 }
-#endif
 
-#ifdef CONFIG_HOTPLUG_CPU
 /*
  * The top_cpuset tracks what CPUs and Memory Nodes are online,
  * period.  This is necessary in order to make cpusets transparent
@@ -2127,7 +2124,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
 	common_cpu_mem_hotplug_unplug();
 	return 0;
 }
-#endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
diff --git a/kernel/profile.c b/kernel/profile.c
index 04fd84e8cdbe..0961d93e1d91 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -319,7 +319,6 @@ out:
 	put_cpu();
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int __devinit profile_cpu_callback(struct notifier_block *info,
 					unsigned long action, void *__cpu)
 {
@@ -372,10 +371,10 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 #else /* !CONFIG_SMP */
 #define profile_flip_buffers()		do { } while (0)
 #define profile_discard_flip_buffers()	do { } while (0)
+#define profile_cpu_callback		NULL
 
 void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index 75a005ed4eda..c83f531c2886 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6740,8 +6740,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
 	    sched_smt_power_savings_store);
 #endif
 
-
-#ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy.  The domains
  * and groups cannot be updated in place without racing with the balancing
@@ -6774,7 +6772,6 @@ static int update_sched_domains(struct notifier_block *nfb,
 
 	return NOTIFY_OK;
 }
-#endif
 
 void __init sched_init_smp(void)
 {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5484d6e045c2..c5257316f4b9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -655,7 +655,6 @@ int current_is_keventd(void)
 
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 /* Take the work from this (downed) CPU. */
 static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
 {
@@ -738,7 +737,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 	return NOTIFY_OK;
 }
-#endif
 
 void init_workqueues(void)
 {
-- 
cgit v1.2.3


From ebe7e5fe4b41deeb2731c5b52d8c8e6ac08b1f74 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:21 -0800
Subject: [PATCH] remove kernel/lockdep.c:lockdep_internal

Remove the no longer used lockdep_internal().

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 300d61bb314b..3926c3674354 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -140,13 +140,6 @@ void lockdep_on(void)
 
 EXPORT_SYMBOL(lockdep_on);
 
-int lockdep_internal(void)
-{
-	return current->lockdep_recursion != 0;
-}
-
-EXPORT_SYMBOL(lockdep_internal);
-
 /*
  * Debugging switches:
  */
-- 
cgit v1.2.3


From d3228a887cae75ef2b8b1211c31c539bef5a5698 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:38:22 -0800
Subject: [PATCH] make kernel/signal.c:kill_proc_info() static

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index bc972d7e6319..ec81defde339 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1134,8 +1134,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
 	return error;
 }
 
-int
-kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 {
 	int error;
 	rcu_read_lock();
-- 
cgit v1.2.3


From 1c69d921ed9cc6593ad4f60c0f9951cb0d62b0b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 6 Dec 2006 20:38:44 -0800
Subject: [PATCH] rcu: add a prefetch() in rcu_do_batch()

On some workloads, (for example when lot of close() syscalls are done), RCU
qlen can be quite large, and RCU heads are no longer in cpu cache when
rcu_do_batch() is called.

This patch adds a prefetch() in rcu_do_batch() to give CPU a hint to bring
back cache lines containing 'struct rcu_head's.

Most list manipulations macros include prefetch(), but not open coded ones
(at least with current C compilers :) )

I got a nice speedup on a trivial benchmark (3.48 us per iteration instead
of 3.95 us on a 1.6 GHz Pentium-M)

while (1) { pipe(p); close(fd[0]); close(fd[1]);}

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcupdate.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 26bb5ffe1ef1..3554b76da84c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
 
 	list = rdp->donelist;
 	while (list) {
-		next = rdp->donelist = list->next;
+		next = list->next;
+		prefetch(next);
 		list->func(list);
 		list = next;
 		if (++count >= rdp->blimit)
 			break;
 	}
+	rdp->donelist = list;
 
 	local_irq_disable();
 	rdp->qlen -= count;
-- 
cgit v1.2.3


From 4668edc334ee90cf50c382c3e423cfc510b5a126 Mon Sep 17 00:00:00 2001
From: Burman Yan <yan_952@hotmail.com>
Date: Wed, 6 Dec 2006 20:38:51 -0800
Subject: [PATCH] kernel core: replace kmalloc+memset with kzalloc

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/auditfilter.c | 3 +--
 kernel/futex.c       | 3 +--
 kernel/kexec.c       | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d923af8e..2e896f8ae29e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
 	struct audit_rule *rule;
 	int i;
 
-	rule = kmalloc(sizeof(*rule), GFP_KERNEL);
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
 	if (unlikely(!rule))
 		return NULL;
-	memset(rule, 0, sizeof(*rule));
 
 	rule->flags = krule->flags | krule->listnr;
 	rule->action = krule->action;
diff --git a/kernel/futex.c b/kernel/futex.c
index af7b81cbde30..7c0d0d4fa7f7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
 	if (likely(current->pi_state_cache))
 		return 0;
 
-	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 
 	if (!pi_state)
 		return -ENOMEM;
 
-	memset(pi_state, 0, sizeof(*pi_state));
 	INIT_LIST_HEAD(&pi_state->list);
 	/* pi_mutex gets initialized later */
 	pi_state->owner = NULL;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index fcdd5d2bc3f4..d43692cf2321 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -108,11 +108,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 
 	/* Allocate a controlling structure */
 	result = -ENOMEM;
-	image = kmalloc(sizeof(*image), GFP_KERNEL);
+	image = kzalloc(sizeof(*image), GFP_KERNEL);
 	if (!image)
 		goto out;
 
-	memset(image, 0, sizeof(*image));
 	image->head = 0;
 	image->entry = &image->head;
 	image->last_entry = &image->head;
-- 
cgit v1.2.3


From 95362fa90312ff2d52c0b4d42412cd7ceeb3b89b Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 6 Dec 2006 20:39:03 -0800
Subject: [PATCH] futex: init error check

Check register_filesystem() and kern_mount() return values.

Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 7c0d0d4fa7f7..d60b7f7a8cc3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1857,10 +1857,16 @@ static struct file_system_type futex_fs_type = {
 
 static int __init init(void)
 {
-	unsigned int i;
+	int i = register_filesystem(&futex_fs_type);
+
+	if (i)
+		return i;
 
-	register_filesystem(&futex_fs_type);
 	futex_mnt = kern_mount(&futex_fs_type);
+	if (IS_ERR(futex_mnt)) {
+		unregister_filesystem(&futex_fs_type);
+		return PTR_ERR(futex_mnt);
+	}
 
 	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
 		INIT_LIST_HEAD(&futex_queues[i].chain);
-- 
cgit v1.2.3


From bd9b0bac6f601655044fc35978e26231dffee03e Mon Sep 17 00:00:00 2001
From: "BP, Praveen" <praveenbp@ti.com>
Date: Wed, 6 Dec 2006 20:39:09 -0800
Subject: [PATCH] sysctl: string length calculated is wrong if it contains
 negative numbers

In the functions do_proc_dointvec() and do_proc_doulongvec_minmax(),
there seems to be a bug in string length calculation if string contains
negative integer.

The console log given below explains the bug. Setting negative values
may not be a right thing to do for "console log level" but then the test
(given below) can be used to demonstrate the bug in the code.

# echo "-1 -1 -1 -123456" > /proc/sys/kernel/printk
# cat /proc/sys/kernel/printk
-1      -1      -1      -1234
#
# echo "-1 -1 -1 123456" > /proc/sys/kernel/printk
# cat /proc/sys/kernel/printk
-1      -1      -1      1234
#

(akpm: the bug is that 123456 gets truncated)

It works as expected if string contains all +ve integers

# echo "1 2 3 4" > /proc/sys/kernel/printk
# cat /proc/sys/kernel/printk
1       2       3       4
#

The patch given below fixes the issue.

Signed-off-by: Praveen BP <praveenbp@ti.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7abe9704e75a..6d7147cf66bb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1875,7 +1875,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
 			p = buf;
 			if (*p == '-' && left > 1) {
 				neg = 1;
-				left--, p++;
+				p++;
 			}
 			if (*p < '0' || *p > '9')
 				break;
@@ -2126,7 +2126,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
 			p = buf;
 			if (*p == '-' && left > 1) {
 				neg = 1;
-				left--, p++;
+				p++;
 			}
 			if (*p < '0' || *p > '9')
 				break;
-- 
cgit v1.2.3


From 301827acbe49d0ba7ec9770803970893ac9ded97 Mon Sep 17 00:00:00 2001
From: Chris Caputo <ccaputo@alt.net>
Date: Wed, 6 Dec 2006 20:39:11 -0800
Subject: [PATCH] sched: correct output of show_state()

At present show_state prints a header the does not match the output of
show_task, as follows:

-
                                               sibling
  task             PC      pid father child younger older
init          S 00000000     0     1      0     2               (NOTLB)
-

This patch corrects the output of show_state so that the header is
aligned with the data, ala:

-
                         free                        sibling
  task             PC    stack   pid father child younger older
init          S 00000000     0     1      0     2               (NOTLB)
-

Signed-off-by: Chris Caputo <ccaputo@alt.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c83f531c2886..b43cef02ce5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4822,12 +4822,12 @@ void show_state_filter(unsigned long state_filter)
 
 #if (BITS_PER_LONG == 32)
 	printk("\n"
-	       "                                               sibling\n");
-	printk("  task             PC      pid father child younger older\n");
+	       "                         free                        sibling\n");
+	printk("  task             PC    stack   pid father child younger older\n");
 #else
 	printk("\n"
-	       "                                                       sibling\n");
-	printk("  task                 PC          pid father child younger older\n");
+	       "                                 free                        sibling\n");
+	printk("  task                 PC        stack   pid father child younger older\n");
 #endif
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
-- 
cgit v1.2.3


From 50cc670aebf4fc64afaf533fb9fa1c8570f09d74 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:39:30 -0800
Subject: [PATCH] lockdep: more chains

Some have reported a chain-table overflow - double its size.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep_internals.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index eab043c83bb2..8ce09bc4613d 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -20,7 +20,7 @@
 #define MAX_LOCKDEP_KEYS_BITS	11
 #define MAX_LOCKDEP_KEYS	(1UL << MAX_LOCKDEP_KEYS_BITS)
 
-#define MAX_LOCKDEP_CHAINS_BITS	13
+#define MAX_LOCKDEP_CHAINS_BITS	14
 #define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)
 
 /*
-- 
cgit v1.2.3


From 2ee91f197c0bc654b24eed5831fd12aa0d566a7d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:39:32 -0800
Subject: [PATCH] lockdep: show more details about self-test failures

Make the locking self-test failures (of 'FAILURE' type) easier to debug by
printing more information.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/mutex-debug.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 18651641a7b5..841539d72c55 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 
 void debug_mutex_unlock(struct mutex *lock)
 {
+	if (unlikely(!debug_locks))
+		return;
+
 	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
 	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
-- 
cgit v1.2.3


From 3908fd2ed920af818aa596672da68ba26173ff27 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Wed, 6 Dec 2006 20:39:39 -0800
Subject: [PATCH] softirq: remove BUG_ONs which can incorrectly trigger

It is possible to have tasklets get scheduled before softirqd has had a chance
to spawn on all CPUs.  This is totally harmless; after success during action
CPU_UP_PREPARE, action CPU_ONLINE will be called, which immediately wakes
softirqd on the appropriate CPU to process the already pending tasklets.  So
there is no danger of having a missed wakeup for any tasklets that were
already pending.

In particular, i386 is affected by this during startup, and is visible when
using a very large initrd; during the time it takes for the initrd to be
decompressed, a timer IRQ can come in and schedule RCU callbacks.  It is also
possible that resending of a hardware IRQ via a softirq triggers the same bug.

Because of different timing conditions, this shows up in all emulators and
virtual machines tested, including Xen, VMware, Virtual PC, and Qemu.  It is
also possible to trigger on native hardware with a large enough initrd,
although I don't have a reliable case demonstrating that.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Cc: <caglar@pardus.org.tr>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/softirq.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf25015dce16..918e52df090e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
-		BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
-		BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
 		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
 		if (IS_ERR(p)) {
 			printk("ksoftirqd for %i failed\n", hotcpu);
-- 
cgit v1.2.3


From 045f147f3290395661b56b9231fc4d221e150963 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 6 Dec 2006 20:40:15 -0800
Subject: [PATCH] remove EXPORT_UNUSED_SYMBOL'ed symbols

In time for 2.6.20, we can get rid of this junk.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index ba59c2a30ed0..c3d90a58e4c5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,8 +53,6 @@ int console_printk[4] = {
 	DEFAULT_CONSOLE_LOGLEVEL,	/* default_console_loglevel */
 };
 
-EXPORT_UNUSED_SYMBOL(console_printk);  /*  June 2006  */
-
 /*
  * Low lever drivers may need that to know if they can schedule in
  * their unblank() callback or not. So let's export it.
@@ -772,7 +770,6 @@ int is_console_locked(void)
 {
 	return console_locked;
 }
-EXPORT_UNUSED_SYMBOL(is_console_locked);  /*  June 2006  */
 
 /**
  * release_console_sem - unlock the console system
-- 
cgit v1.2.3


From a09c17a6fdad9ae5b5ea1c3383080f84ec76ab20 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@cs.washington.edu>
Date: Wed, 6 Dec 2006 20:40:18 -0800
Subject: [PATCH] sys: remove unused variable

Remove unused 'new_ruid' variable.

Reported by David Binderman <dcb314@hotmail.com>.

Signed-off-by: David Rientjes <rientjes@cs.washington.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index c87b461de38d..a0c1a29a507f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 asmlinkage long sys_setuid(uid_t uid)
 {
 	int old_euid = current->euid;
-	int old_ruid, old_suid, new_ruid, new_suid;
+	int old_ruid, old_suid, new_suid;
 	int retval;
 
 	retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
 	if (retval)
 		return retval;
 
-	old_ruid = new_ruid = current->uid;
+	old_ruid = current->uid;
 	old_suid = current->suid;
 	new_suid = old_suid;
 	
-- 
cgit v1.2.3


From 012d3ca8d85bf3ae5278ba897dd1ca3999247107 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@freedesktop.org>
Date: Wed, 6 Dec 2006 20:40:19 -0800
Subject: [PATCH] Add Sparse annotations to SRCU wrapper functions in
 rcutorture

The SRCU wrapper functions srcu_torture_read_lock and
srcu_torture_read_unlock in rcutorture intentionally change the SRCU
context; annotate them accordingly, to avoid a warning.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e2bda18f6f42..c52f981ea008 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void)
 	cleanup_srcu_struct(&srcu_ctl);
 }
 
-static int srcu_torture_read_lock(void)
+static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
 {
 	return srcu_read_lock(&srcu_ctl);
 }
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
 		schedule_timeout_interruptible(longdelay);
 }
 
-static void srcu_torture_read_unlock(int idx)
+static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
 {
 	srcu_read_unlock(&srcu_ctl, idx);
 }
-- 
cgit v1.2.3


From ccdea2f88b5689f0fd29c3804be43a3acf0311e3 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 6 Dec 2006 20:40:26 -0800
Subject: [PATCH] futex: remove unneeded barrier

When disassembling a kernel I found around over 90 sync Instructions from
mb, rmb and wmb calls in the kernel and only few of those make any sense to
me.  So here's the first one - I think the wmb() in kernel/futex.c is not
needed on uniprocessors so should become an smb_wmb().

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index d60b7f7a8cc3..a8302a1620ea 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -552,7 +552,7 @@ static void wake_futex(struct futex_q *q)
 	 * at the end of wake_up_all() does not prevent this store from
 	 * moving.
 	 */
-	wmb();
+	smp_wmb();
 	q->lock_ptr = NULL;
 }
 
-- 
cgit v1.2.3


From 15ad7cdcfd76450d4beebc789ec646664238184d Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Wed, 6 Dec 2006 20:40:36 -0800
Subject: [PATCH] struct seq_operations and struct file_operations
 constification

 - move some file_operations structs into the .rodata section

 - move static strings from policy_types[] array into the .rodata section

 - fix generic seq_operations usages, so that those structs may be defined
   as "const" as well

[akpm@osdl.org: couple of fixes]
Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/configs.c      | 2 +-
 kernel/cpuset.c       | 4 ++--
 kernel/dma.c          | 2 +-
 kernel/futex.c        | 2 +-
 kernel/kallsyms.c     | 4 ++--
 kernel/lockdep_proc.c | 6 +++---
 kernel/module.c       | 2 +-
 kernel/power/user.c   | 2 +-
 kernel/profile.c      | 2 +-
 kernel/relay.c        | 2 +-
 kernel/resource.c     | 6 +++---
 kernel/sched.c        | 2 +-
 kernel/sysctl.c       | 2 +-
 13 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/configs.c b/kernel/configs.c
index f9e31974f4ad..8fa1fb28f8a7 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
 	return count;
 }
 
-static struct file_operations ikconfig_file_ops = {
+static const struct file_operations ikconfig_file_ops = {
 	.owner = THIS_MODULE,
 	.read = ikconfig_read_current,
 };
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9b62b4c03ad0..4ef1d29297ca 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1531,7 +1531,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
 }
 
-static struct file_operations cpuset_file_operations = {
+static const struct file_operations cpuset_file_operations = {
 	.read = cpuset_file_read,
 	.write = cpuset_file_write,
 	.llseek = generic_file_llseek,
@@ -2605,7 +2605,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_cpuset_show, pid);
 }
 
-struct file_operations proc_cpuset_operations = {
+const struct file_operations proc_cpuset_operations = {
 	.open		= cpuset_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644c938a..937b13ca33ba 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_dma_show, NULL);
 }
 
-static struct file_operations proc_dma_operations = {
+static const struct file_operations proc_dma_operations = {
 	.open		= proc_dma_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/kernel/futex.c b/kernel/futex.c
index a8302a1620ea..95989a3b4168 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1492,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp,
 	return ret;
 }
 
-static struct file_operations futex_fops = {
+static const struct file_operations futex_fops = {
 	.release	= futex_close,
 	.poll		= futex_poll,
 };
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 54befe36ee0b..ab63cfc42992 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -398,7 +398,7 @@ static int s_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-static struct seq_operations kallsyms_op = {
+static const struct seq_operations kallsyms_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
@@ -433,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file)
 	return seq_release(inode, file);
 }
 
-static struct file_operations kallsyms_operations = {
+static const struct file_operations kallsyms_operations = {
 	.open = kallsyms_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index f6e72eaab3fa..b554b40a4aa6 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations lockdep_ops = {
+static const struct seq_operations lockdep_ops = {
 	.start	= l_start,
 	.next	= l_next,
 	.stop	= l_stop,
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file)
 	return res;
 }
 
-static struct file_operations proc_lockdep_operations = {
+static const struct file_operations proc_lockdep_operations = {
 	.open		= lockdep_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file)
 	return single_open(file, lockdep_stats_show, NULL);
 }
 
-static struct file_operations proc_lockdep_stats_operations = {
+static const struct file_operations proc_lockdep_stats_operations = {
 	.open		= lockdep_stats_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/kernel/module.c b/kernel/module.c
index e2d09d604ca0..d9eae45d0145 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2209,7 +2209,7 @@ static int m_show(struct seq_file *m, void *p)
    Where refcount is a number or -, and deps is a comma-separated list
    of depends or -.
 */
-struct seq_operations modules_op = {
+const struct seq_operations modules_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 069732e5695c..89443b85163b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -383,7 +383,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 	return error;
 }
 
-static struct file_operations snapshot_fops = {
+static const struct file_operations snapshot_fops = {
 	.open = snapshot_open,
 	.release = snapshot_release,
 	.read = snapshot_read,
diff --git a/kernel/profile.c b/kernel/profile.c
index 0961d93e1d91..fb5e03d57e9d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -501,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
 	return count;
 }
 
-static struct file_operations proc_profile_operations = {
+static const struct file_operations proc_profile_operations = {
 	.read		= read_profile,
 	.write		= write_profile,
 };
diff --git a/kernel/relay.c b/kernel/relay.c
index 2b92e8ece85b..75a3a9a7efc2 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1013,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp,
 				       actor, &desc);
 }
 
-struct file_operations relay_file_operations = {
+const struct file_operations relay_file_operations = {
 	.open		= relay_file_open,
 	.poll		= relay_file_poll,
 	.mmap		= relay_file_mmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index 6de60c12143e..7b9a497419d9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations resource_op = {
+static const struct seq_operations resource_op = {
 	.start	= r_start,
 	.next	= r_next,
 	.stop	= r_stop,
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file)
 	return res;
 }
 
-static struct file_operations proc_ioports_operations = {
+static const struct file_operations proc_ioports_operations = {
 	.open		= ioports_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
 
-static struct file_operations proc_iomem_operations = {
+static const struct file_operations proc_iomem_operations = {
 	.open		= iomem_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/kernel/sched.c b/kernel/sched.c
index b43cef02ce5b..f385eff4682d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -505,7 +505,7 @@ static int schedstat_open(struct inode *inode, struct file *file)
 	return res;
 }
 
-struct file_operations proc_schedstat_operations = {
+const struct file_operations proc_schedstat_operations = {
 	.open    = schedstat_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6d7147cf66bb..758dbbf972a5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -170,7 +170,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
 static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
 static int proc_opensys(struct inode *, struct file *);
 
-struct file_operations proc_sys_file_operations = {
+const struct file_operations proc_sys_file_operations = {
 	.open		= proc_opensys,
 	.read		= proc_readsys,
 	.write		= proc_writesys,
-- 
cgit v1.2.3


From 85916f8166b59eeac63d2b4f7f1df8de849334b4 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus@valinux.co.jp>
Date: Wed, 6 Dec 2006 20:40:41 -0800
Subject: [PATCH] Kexec / Kdump: Unify elf note code

The elf note saving code is currently duplicated over several
architectures.  This cleanup patch simply adds code to a common file and
then replaces the arch-specific code with calls to the newly added code.

The only drawback with this approach is that s390 doesn't fully support
kexec-on-panic which for that arch leads to introduction of unused code.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Cc: Vivek Goyal <vgoyal@in.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kexec.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index d43692cf2321..afbbbe981be2 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
 #include <linux/syscalls.h>
 #include <linux/ioport.h>
 #include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -1066,6 +1068,60 @@ void crash_kexec(struct pt_regs *regs)
 	}
 }
 
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+			    size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
 static int __init crash_notes_memory_init(void)
 {
 	/* Allocate memory for saving cpu registers. */
-- 
cgit v1.2.3


From 70e4506765602cca047cfa31933836e354c61a63 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:40:50 -0800
Subject: [PATCH] lockdep: register_lock_class() fix

The hash_lock must only ever be taken with irqs disabled.  This happens in
all the important places, except one codepath: register_lock_class().  The
race should trigger rarely because register_lock_class() is quite rare and
single-threaded (happens during init most of the time).

The fix is to disable irqs.

( bug found live in -rt: there preemption is alot more agressive and
  preempting with the hash-lock held caused a lockup.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3926c3674354..62e73ce68197 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1182,6 +1182,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	struct lockdep_subclass_key *key;
 	struct list_head *hash_head;
 	struct lock_class *class;
+	unsigned long flags;
 
 	class = look_up_lock_class(lock, subclass);
 	if (likely(class))
@@ -1203,6 +1204,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	key = lock->key->subkeys + subclass;
 	hash_head = classhashentry(key);
 
+	raw_local_irq_save(flags);
 	__raw_spin_lock(&hash_lock);
 	/*
 	 * We have to do the hash-walk again, to avoid races
@@ -1217,6 +1219,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	 */
 	if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
 		__raw_spin_unlock(&hash_lock);
+		raw_local_irq_restore(flags);
 		debug_locks_off();
 		printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
 		printk("turning off the locking correctness validator.\n");
@@ -1239,15 +1242,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 	if (verbose(class)) {
 		__raw_spin_unlock(&hash_lock);
+		raw_local_irq_restore(flags);
 		printk("\nnew class %p: %s", class->key, class->name);
 		if (class->name_version > 1)
 			printk("#%d", class->name_version);
 		printk("\n");
 		dump_stack();
+		raw_local_irq_save(flags);
 		__raw_spin_lock(&hash_lock);
 	}
 out_unlock_set:
 	__raw_spin_unlock(&hash_lock);
+	raw_local_irq_restore(flags);
 
 	if (!subclass || force)
 		lock->class_cache = class;
-- 
cgit v1.2.3


From 792908225064b1d841a8990b9d1d1cfc4e0e5bb2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:40:51 -0800
Subject: [PATCH] add ignore_loglevel boot option

Sometimes the kernel prints something interesting while userspace bootup
keeps messages turned off via loglevel.  Enable the printing of /all/
kernel messages via the "ignore_loglevel" boot option.  Off by default.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index c3d90a58e4c5..185bb45eacf7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -333,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
 	}
 }
 
+static int __read_mostly ignore_loglevel;
+
+int __init ignore_loglevel_setup(char *str)
+{
+	ignore_loglevel = 1;
+	printk(KERN_INFO "debug: ignoring loglevel setting.\n");
+
+	return 1;
+}
+
+__setup("ignore_loglevel", ignore_loglevel_setup);
+
 /*
  * Write out chars from start to end - 1 inclusive
  */
 static void _call_console_drivers(unsigned long start,
 				unsigned long end, int msg_log_level)
 {
-	if (msg_log_level < console_loglevel &&
+	if ((msg_log_level < console_loglevel || ignore_loglevel) &&
 			console_drivers && start != end) {
 		if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
 			/* wrapped write */
-- 
cgit v1.2.3


From d3ed11c35635487d34ad476e1d6a66dd298ab2de Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Wed, 6 Dec 2006 20:41:37 -0800
Subject: [PATCH] cpuset: allow a larger buffer for writes to cpuset files

When using fake NUMA setup, the number of memory nodes can greatly exceed
the number of CPUs.  So the current limit in cpuset_common_file_write() is
insufficient.

Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4ef1d29297ca..0a6b4d89f9a0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1280,7 +1280,8 @@ typedef enum {
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
-static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf,
+static ssize_t cpuset_common_file_write(struct file *file,
+					const char __user *userbuf,
 					size_t nbytes, loff_t *unused_ppos)
 {
 	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1291,7 +1292,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	int retval = 0;
 
 	/* Crude upper limit on largest legitimate cpulist user might write. */
-	if (nbytes > 100 + 6 * NR_CPUS)
+	if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
 		return -E2BIG;
 
 	/* +1 for nul-terminator */
-- 
cgit v1.2.3


From 68380b581383c028830f79ec2670f4a193854aa6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.osdl.org>
Date: Thu, 7 Dec 2006 09:28:19 -0800
Subject: Add "run_scheduled_work()" workqueue function

This allows workqueue users to run just their own pending work, rather
than wait for the whole workqueue to finish running.  This solves the
deadlock with networking libphy that was due to other workqueue entries
possibly needing a lock that was held by the routine that wanted to
flush its own work.

It's not wonderful: if you absolutely need to synchronize with the work
function having been executed, any user strictly speaking should have
its own completion tracking logic, since when we run things explicitly
by hand, the generic workqueue layer can no longer help us synchronize.

Also, this is strictly only usable for work that has been scheduled
without any delayed timers.  You can not mix the new interface with
schedule_delayed_work().

But it's better than what we had currently.

Acked-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c5257316f4b9..6b186750e9be 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -108,6 +108,79 @@ static inline void *get_wq_data(struct work_struct *work)
 	return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
 }
 
+static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
+{
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cwq->lock, flags);
+	/*
+	 * We need to re-validate the work info after we've gotten
+	 * the cpu_workqueue lock. We can run the work now iff:
+	 *
+	 *  - the wq_data still matches the cpu_workqueue_struct
+	 *  - AND the work is still marked pending
+	 *  - AND the work is still on a list (which will be this
+	 *    workqueue_struct list)
+	 *
+	 * All these conditions are important, because we
+	 * need to protect against the work being run right
+	 * now on another CPU (all but the last one might be
+	 * true if it's currently running and has not been
+	 * released yet, for example).
+	 */
+	if (get_wq_data(work) == cwq
+	    && work_pending(work)
+	    && !list_empty(&work->entry)) {
+		work_func_t f = work->func;
+		list_del_init(&work->entry);
+		spin_unlock_irqrestore(&cwq->lock, flags);
+
+		if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
+			work_release(work);
+		f(work);
+
+		spin_lock_irqsave(&cwq->lock, flags);
+		cwq->remove_sequence++;
+		wake_up(&cwq->work_done);
+		ret = 1;
+	}
+	spin_unlock_irqrestore(&cwq->lock, flags);
+	return ret;
+}
+
+/**
+ * run_scheduled_work - run scheduled work synchronously
+ * @work: work to run
+ *
+ * This checks if the work was pending, and runs it
+ * synchronously if so. It returns a boolean to indicate
+ * whether it had any scheduled work to run or not.
+ *
+ * NOTE! This _only_ works for normal work_structs. You
+ * CANNOT use this for delayed work, because the wq data
+ * for delayed work will not point properly to the per-
+ * CPU workqueue struct, but will change!
+ */
+int fastcall run_scheduled_work(struct work_struct *work)
+{
+	for (;;) {
+		struct cpu_workqueue_struct *cwq;
+
+		if (!work_pending(work))
+			return 0;
+		if (list_empty(&work->entry))
+			return 0;
+		/* NOTE! This depends intimately on __queue_work! */
+		cwq = get_wq_data(work);
+		if (!cwq)
+			return 0;
+		if (__run_work(cwq, work))
+			return 1;
+	}
+}
+EXPORT_SYMBOL(run_scheduled_work);
+
 /* Preempt must be disabled. */
 static void __queue_work(struct cpu_workqueue_struct *cwq,
 			 struct work_struct *work)
-- 
cgit v1.2.3


From a79561134f38de12dce14ed72138f38e55ef53fc Mon Sep 17 00:00:00 2001
From: Zou Nan hai <nanhai.zou@intel.com>
Date: Thu, 7 Dec 2006 09:51:35 -0800
Subject: [IA64] IA64 Kexec/kdump

Changes and updates.

1. Remove fake rendz path and related code according to discuss with Khalid Aziz.
2. fc.i offset fix in relocate_kernel.S.
3. iospic shutdown code eoi and mask race fix from Fujitsu.
4. Warm boot hook in machine_kexec to SN SAL code from Jack Steiner.
5. Send slave to SAL slave loop patch from Jay Lan.
6. Kdump on non-recoverable MCA event patch from Jay Lan
7. Use CTL_UNNUMBERED in kdump_on_init sysctl.

Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 kernel/kexec.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index fcdd5d2bc3f4..05aada293592 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -851,6 +851,7 @@ static int kimage_load_crash_segment(struct kimage *image,
 			memset(ptr + uchunk, 0, mchunk - uchunk);
 		}
 		result = copy_from_user(ptr, buf, uchunk);
+		kexec_flush_icache_page(page);
 		kunmap(page);
 		if (result) {
 			result = (result < 0) ? result : -EIO;
-- 
cgit v1.2.3


From aad094701c6355cb2b3d74a07ec0496f4a48c787 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Fri, 8 Dec 2006 02:35:57 -0800
Subject: [PATCH] move kallsyms data to .rodata

Kallsyms data is never written to, so it can as well benefit from
CONFIG_DEBUG_RODATA.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kallsyms.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index ab63cfc42992..6f294ff4f9ee 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -31,14 +31,14 @@
 #endif
 
 /* These will be re-linked against their real values during the second link stage */
-extern unsigned long kallsyms_addresses[] __attribute__((weak));
-extern unsigned long kallsyms_num_syms __attribute__((weak,section("data")));
-extern u8 kallsyms_names[] __attribute__((weak));
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const unsigned long kallsyms_num_syms __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));
 
-extern u8 kallsyms_token_table[] __attribute__((weak));
-extern u16 kallsyms_token_index[] __attribute__((weak));
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));
 
-extern unsigned long kallsyms_markers[] __attribute__((weak));
+extern const unsigned long kallsyms_markers[] __attribute__((weak));
 
 static inline int is_kernel_inittext(unsigned long addr)
 {
@@ -84,7 +84,7 @@ static int is_ksym_addr(unsigned long addr)
 static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
 {
 	int len, skipped_first = 0;
-	u8 *tptr, *data;
+	const u8 *tptr, *data;
 
 	/* get the compressed symbol length from the first symbol byte */
 	data = &kallsyms_names[off];
@@ -132,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
  * kallsyms array */
 static unsigned int get_symbol_offset(unsigned long pos)
 {
-	u8 *name;
+	const u8 *name;
 	int i;
 
 	/* use the closest marker we have. We have markers every 256 positions,
-- 
cgit v1.2.3


From 6e2ac66470976ad7f57e0948572669b2bdfea2d0 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Fri, 8 Dec 2006 02:35:58 -0800
Subject: [PATCH] CPEI gets warning at
 kernel/irq/migration.c:27/move_masked_irq()

While running my MCA test (hardware error injection) on 2.6.19,
I got some warning like following:

> BUG: warning at kernel/irq/migration.c:27/move_masked_irq()
>
> Call Trace:
>  [<a000000100013d20>] show_stack+0x40/0xa0
>                                 sp=e00000006b2578d0 bsp=e00000006b2510b0
>  [<a000000100013db0>] dump_stack+0x30/0x60
>                                 sp=e00000006b257aa0 bsp=e00000006b251098
>  [<a0000001000de430>] move_masked_irq+0xb0/0x240
>                                 sp=e00000006b257aa0 bsp=e00000006b251070
>  [<a0000001000de6a0>] move_native_irq+0xe0/0x180
>                                 sp=e00000006b257aa0 bsp=e00000006b251040
>  [<a00000010004ff50>] iosapic_end_level_irq+0x30/0xe0
>                                 sp=e00000006b257aa0 bsp=e00000006b251020
>  [<a0000001000d94d0>] __do_IRQ+0x170/0x400
>                                 sp=e00000006b257aa0 bsp=e00000006b250fd8
>  [<a0000001000116f0>] ia64_handle_irq+0x1b0/0x260
>                                 sp=e00000006b257aa0 bsp=e00000006b250fa8
>  [<a00000010000c3a0>] ia64_leave_kernel+0x0/0x280
>                                 sp=e00000006b257aa0 bsp=e00000006b250fa8
>  [<a000000100690cf0>] _spin_unlock_irqrestore+0x30/0x60
>                                 sp=e00000006b257c70 bsp=e00000006b250f90

It comes from:

[kernel/irq/migration.c]
  26         if (CHECK_IRQ_PER_CPU(desc->status)) {
  27                 WARN_ON(1);
  28                 return;
  29         }

By putting some printk in kernel, I found that irqbalance is trying to
move CPEI which is handled as PER_CPU irq. That's why.

CPEI(Corrected Platform Error Interrupt) is ia64 specific irq, is
allowed to pin to particular processor which selected by the platform, and
even it is PER_CPU but it has set_affinity handler (=iosapic_set_affinity)
as same as other IO-SAPIC-level interrupts. (I don't know why, but
I guess that there would be typical situation where the handler for
migration is needed, such as hotplug - the processor going to be
offline/hot-removed.)

To shut up this warning, there are 2 way at least:
 a) fix CPEI stuff
 b) prohibit setting affinity to PER_CPU irq

I'm not sure what stuff of CPEI need to be fixed, but I think that
returning error to attempting move PER_CPU irq is useful for all
applications since it will never work.

Following small patch takes b) style.
It works, the warning disappeared and irqbalance still runs well.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/proc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9a352667007c..61f5c717a8f5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	unsigned int irq = (int)(long)data, full_count = count, err;
 	cpumask_t new_value, tmp;
 
-	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
+	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
+				CHECK_IRQ_PER_CPU(irq_desc[irq].status))
 		return -EIO;
 
 	err = cpumask_parse_user(buffer, count, new_value);
-- 
cgit v1.2.3


From 24ec839c431eb79bb8f6abc00c4e1eb3b8c4d517 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 8 Dec 2006 02:36:04 -0800
Subject: [PATCH] tty: ->signal->tty locking

Fix the locking of signal->tty.

Use ->sighand->siglock to protect ->signal->tty; this lock is already used
by most other members of ->signal/->sighand.  And unless we are 'current'
or the tasklist_lock is held we need ->siglock to access ->signal anyway.

(NOTE: sys_unshare() is broken wrt ->sighand locking rules)

Note that tty_mutex is held over tty destruction, so while holding
tty_mutex any tty pointer remains valid.  Otherwise the lifetime of ttys
are governed by their open file handles.  This leaves some holes for tty
access from signal->tty (or any other non file related tty access).

It solves the tty SLAB scribbles we were seeing.

(NOTE: the change from group_send_sig_info to __group_send_sig_info needs to
       be examined by someone familiar with the security framework, I think
       it is safe given the SEND_SIG_PRIV from other __group_send_sig_info
       invocations)

[schwidefsky@de.ibm.com: 3270 fix]
[akpm@osdl.org: various post-viro fixes]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Alan Cox <alan@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Jan Kara <jack@ucw.cz>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c    | 9 +++------
 kernel/auditsc.c | 2 ++
 kernel/exit.c    | 4 +---
 kernel/sys.c     | 6 ++++--
 4 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index dc12db8600e7..ca5619039367 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -428,6 +428,7 @@ static void do_acct_process(struct file *file)
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
+	struct tty_struct *tty;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -485,12 +486,8 @@ static void do_acct_process(struct file *file)
 #endif
 
 	mutex_lock(&tty_mutex);
-	/* FIXME: Whoever is responsible for current->signal locking needs
-	   to use the same locking all over the kernel and document it */
-	read_lock(&tasklist_lock);
-	ac.ac_tty = current->signal->tty ?
-		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
-	read_unlock(&tasklist_lock);
+	tty = get_current_tty();
+	ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
 	mutex_unlock(&tty_mutex);
 
 	spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 40722e26de98..b6cb802fbcd1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -826,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				 context->return_code);
 
 	mutex_lock(&tty_mutex);
+	read_lock(&tasklist_lock);
 	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
 		tty = tsk->signal->tty->name;
 	else
 		tty = "(none)";
+	read_unlock(&tasklist_lock);
 	audit_log_format(ab,
 		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
 		  " ppid=%d pid=%d auid=%u uid=%u gid=%u"
diff --git a/kernel/exit.c b/kernel/exit.c
index 4e3f919edc48..fa235779b6a3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -384,9 +384,7 @@ void daemonize(const char *name, ...)
 	exit_mm(current);
 
 	set_special_pids(1, 1);
-	mutex_lock(&tty_mutex);
-	current->signal->tty = NULL;
-	mutex_unlock(&tty_mutex);
+	proc_clear_tty(current);
 
 	/* Block and flush all signals */
 	sigfillset(&blocked);
diff --git a/kernel/sys.c b/kernel/sys.c
index a0c1a29a507f..1ac2d1c5d84e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1484,7 +1484,6 @@ asmlinkage long sys_setsid(void)
 	pid_t session;
 	int err = -EPERM;
 
-	mutex_lock(&tty_mutex);
 	write_lock_irq(&tasklist_lock);
 
 	/* Fail if I am already a session leader */
@@ -1504,12 +1503,15 @@ asmlinkage long sys_setsid(void)
 
 	group_leader->signal->leader = 1;
 	__set_special_pids(session, session);
+
+	spin_lock(&group_leader->sighand->siglock);
 	group_leader->signal->tty = NULL;
 	group_leader->signal->tty_old_pgrp = 0;
+	spin_unlock(&group_leader->sighand->siglock);
+
 	err = process_group(group_leader);
 out:
 	write_unlock_irq(&tasklist_lock);
-	mutex_unlock(&tty_mutex);
 	return err;
 }
 
-- 
cgit v1.2.3


From 7bcfa95e561f11a17720162935e4f704c5d6fda3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 8 Dec 2006 02:36:07 -0800
Subject: [PATCH] do_acct_process(): don't take tty_mutex

No need to take the global tty_mutex, signal->tty->driver can't go away while
we are holding ->siglock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index ca5619039367..69a40c9777a2 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -485,12 +485,9 @@ static void do_acct_process(struct file *file)
 	ac.ac_ppid = current->parent->tgid;
 #endif
 
-	mutex_lock(&tty_mutex);
-	tty = get_current_tty();
-	ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
-	mutex_unlock(&tty_mutex);
-
 	spin_lock_irq(&current->sighand->siglock);
+	tty = current->signal->tty;
+	ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
 	ac.ac_flag = pacct->ac_flag;
-- 
cgit v1.2.3


From ae424ae4b5bcd820ad6ee6f0b986c4e14ed4d6cf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 8 Dec 2006 02:36:08 -0800
Subject: [PATCH] make set_special_pids() static

Make set_special_pids() static, the only caller is daemonize().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index fa235779b6a3..fa0495e167e9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -314,7 +314,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
 	}
 }
 
-void set_special_pids(pid_t session, pid_t pgrp)
+static void set_special_pids(pid_t session, pid_t pgrp)
 {
 	write_lock_irq(&tasklist_lock);
 	__set_special_pids(session, pgrp);
-- 
cgit v1.2.3


From dae3c5a0b7052ad7dd9fa78c51ecfab828c5007b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 8 Dec 2006 02:36:09 -0800
Subject: [PATCH] sys_unshare: remove a broken CLONE_SIGHAND code

sys_unshare(CLONE_SIGHAND) is broken, the code under 'if (new_sigh)' is
never executed but very wrong. Just remove it to avoid a confusion,
task_lock() has nothing to do with ->sighand changing.

Also, change the comment in unshare_sighand(). Yes, CLONE_THREAD implies
CLONE_SIGHAND, but still it looks confusing. Also, we don't need to check
current->sighand != NULL.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7f2e31ba33af..f387a1393ca5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1544,15 +1544,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new
 }
 
 /*
- * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
- * supported yet
+ * Unsharing of sighand is not supported yet
  */
 static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
 {
 	struct sighand_struct *sigh = current->sighand;
 
-	if ((unshare_flags & CLONE_SIGHAND) &&
-	    (sigh && atomic_read(&sigh->count) > 1))
+	if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
 		return -EINVAL;
 	else
 		return 0;
@@ -1626,7 +1624,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	int err = 0;
 	struct fs_struct *fs, *new_fs = NULL;
 	struct namespace *ns, *new_ns = NULL;
-	struct sighand_struct *sigh, *new_sigh = NULL;
+	struct sighand_struct *new_sigh = NULL;
 	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
 	struct files_struct *fd, *new_fd = NULL;
 	struct sem_undo_list *new_ulist = NULL;
@@ -1671,7 +1669,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		}
 	}
 
-	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
+	if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
 				new_uts || new_ipc) {
 
 		task_lock(current);
@@ -1693,12 +1691,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 			new_ns = ns;
 		}
 
-		if (new_sigh) {
-			sigh = current->sighand;
-			rcu_assign_pointer(current->sighand, new_sigh);
-			new_sigh = sigh;
-		}
-
 		if (new_mm) {
 			mm = current->mm;
 			active_mm = current->active_mm;
-- 
cgit v1.2.3


From d63a5a74dee87883fda6b7d170244acaac5b05e8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 Dec 2006 02:36:17 -0800
Subject: [PATCH] lockdep: avoid lockdep warning in md

md_open takes ->reconfig_mutex which causes lockdep to complain.  This
(normally) doesn't have deadlock potential as the possible conflict is with a
reconfig_mutex in a different device.

I say "normally" because if a loop were created in the array->member hierarchy
a deadlock could happen.  However that causes bigger problems than a deadlock
and should be fixed independently.

So we flag the lock in md_open as a nested lock.  This requires defining
mutex_lock_interruptible_nested.

Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/mutex.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 8c71cf72a497..e7cbbb82765b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
 }
 
 EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+int __sched
+mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
+{
+	might_sleep();
+	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
 #endif
 
 /*
-- 
cgit v1.2.3


From f3a43f3f64bff8e205c3702f6b4804d66e306848 Mon Sep 17 00:00:00 2001
From: "Josef \"Jeff\" Sipek" <jsipek@cs.sunysb.edu>
Date: Fri, 8 Dec 2006 02:36:43 -0800
Subject: [PATCH] kernel: change uses of f_{dentry, vfsmnt} to use f_path

Change all the uses of f_{dentry,vfsmnt} to f_path.{dentry,mnt} in
linux/kernel/.

Signed-off-by: Josef "Jeff" Sipek <jsipek@cs.sunysb.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c   | 14 +++++++-------
 kernel/fork.c   |  2 +-
 kernel/futex.c  | 10 +++++-----
 kernel/relay.c  |  4 ++--
 kernel/sysctl.c |  2 +-
 5 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 69a40c9777a2..70d0d88e5554 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -118,7 +118,7 @@ static int check_free_space(struct file *file)
 	spin_unlock(&acct_globals.lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_dentry, &sbuf))
+	if (vfs_statfs(file->f_path.dentry, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
@@ -194,7 +194,7 @@ static void acct_file_reopen(struct file *file)
 		add_timer(&acct_globals.timer);
 	}
 	if (old_acct) {
-		mnt_unpin(old_acct->f_vfsmnt);
+		mnt_unpin(old_acct->f_path.mnt);
 		spin_unlock(&acct_globals.lock);
 		do_acct_process(old_acct);
 		filp_close(old_acct, NULL);
@@ -212,7 +212,7 @@ static int acct_on(char *name)
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
 		filp_close(file, NULL);
 		return -EACCES;
 	}
@@ -229,11 +229,11 @@ static int acct_on(char *name)
 	}
 
 	spin_lock(&acct_globals.lock);
-	mnt_pin(file->f_vfsmnt);
+	mnt_pin(file->f_path.mnt);
 	acct_file_reopen(file);
 	spin_unlock(&acct_globals.lock);
 
-	mntput(file->f_vfsmnt);	/* it's pinned, now give up active reference */
+	mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
 
 	return 0;
 }
@@ -283,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name)
 void acct_auto_close_mnt(struct vfsmount *m)
 {
 	spin_lock(&acct_globals.lock);
-	if (acct_globals.file && acct_globals.file->f_vfsmnt == m)
+	if (acct_globals.file && acct_globals.file->f_path.mnt == m)
 		acct_file_reopen(NULL);
 	spin_unlock(&acct_globals.lock);
 }
@@ -299,7 +299,7 @@ void acct_auto_close(struct super_block *sb)
 {
 	spin_lock(&acct_globals.lock);
 	if (acct_globals.file &&
-	    acct_globals.file->f_vfsmnt->mnt_sb == sb) {
+	    acct_globals.file->f_path.mnt->mnt_sb == sb) {
 		acct_file_reopen(NULL);
 	}
 	spin_unlock(&acct_globals.lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index f387a1393ca5..597707819327 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -252,7 +252,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		anon_vma_link(tmp);
 		file = tmp->vm_file;
 		if (file) {
-			struct inode *inode = file->f_dentry->d_inode;
+			struct inode *inode = file->f_path.dentry->d_inode;
 			get_file(file);
 			if (tmp->vm_flags & VM_DENYWRITE)
 				atomic_dec(&inode->i_writecount);
diff --git a/kernel/futex.c b/kernel/futex.c
index 95989a3b4168..5a737de857d3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
 /*
  * Get parameters which are the keys for a futex.
  *
- * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
+ * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
  * offset_within_page).  For private mappings, it's (uaddr, current->mm).
  * We can usually work out the index without swapping in the page.
  *
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	/*
 	 * Linear file mappings are also simple.
 	 */
-	key->shared.inode = vma->vm_file->f_dentry->d_inode;
+	key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
 	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
 	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
 		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
@@ -1528,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal)
 		goto out;
 	}
 	filp->f_op = &futex_fops;
-	filp->f_vfsmnt = mntget(futex_mnt);
-	filp->f_dentry = dget(futex_mnt->mnt_root);
-	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+	filp->f_path.mnt = mntget(futex_mnt);
+	filp->f_path.dentry = dget(futex_mnt->mnt_root);
+	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
 
 	if (signal) {
 		err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
diff --git a/kernel/relay.c b/kernel/relay.c
index 75a3a9a7efc2..818e514729cf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -959,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
 	if (!desc->count)
 		return 0;
 
-	mutex_lock(&filp->f_dentry->d_inode->i_mutex);
+	mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
 	do {
 		if (!relay_file_read_avail(buf, *ppos))
 			break;
@@ -979,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
 			*ppos = relay_file_read_end_pos(buf, read_start, ret);
 		}
 	} while (desc->count && ret);
-	mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
+	mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
 
 	return desc->written;
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8e9f00fd6d18..9846db93a595 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1614,7 +1614,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
 			  size_t count, loff_t *ppos)
 {
 	int op;
-	struct proc_dir_entry *de = PDE(file->f_dentry->d_inode);
+	struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
 	struct ctl_table *table;
 	size_t res;
 	ssize_t error = -ENOTDIR;
-- 
cgit v1.2.3


From a7a005fd12b84392becca311f2a20d5bf2a1b7af Mon Sep 17 00:00:00 2001
From: Josef Sipek <jsipek@fsl.cs.sunysb.edu>
Date: Fri, 8 Dec 2006 02:37:17 -0800
Subject: [PATCH] struct path: convert kernel

Signed-off-by: Josef Sipek <jsipek@fsl.cs.sunysb.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/auditsc.c |  4 ++--
 kernel/cpuset.c  | 22 +++++++++++-----------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b6cb802fbcd1..298897559ca4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -781,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
 			if ((vma->vm_flags & VM_EXECUTABLE) &&
 			    vma->vm_file) {
 				audit_log_d_path(ab, "exe=",
-						 vma->vm_file->f_dentry,
-						 vma->vm_file->f_vfsmnt);
+						 vma->vm_file->f_path.dentry,
+						 vma->vm_file->f_path.mnt);
 				break;
 			}
 			vma = vma->vm_next;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0a6b4d89f9a0..2c3b4431472b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = {
  *
  *
  * When reading/writing to a file:
- *	- the cpuset to use in file->f_dentry->d_parent->d_fsdata
- *	- the 'cftype' of the file is file->f_dentry->d_fsdata
+ *	- the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
+ *	- the 'cftype' of the file is file->f_path.dentry->d_fsdata
  */
 
 struct cftype {
@@ -1284,8 +1284,8 @@ static ssize_t cpuset_common_file_write(struct file *file,
 					const char __user *userbuf,
 					size_t nbytes, loff_t *unused_ppos)
 {
-	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
-	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
+	struct cftype *cft = __d_cft(file->f_path.dentry);
 	cpuset_filetype_t type = cft->private;
 	char *buffer;
 	char *pathbuf = NULL;
@@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
 						size_t nbytes, loff_t *ppos)
 {
 	ssize_t retval = 0;
-	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cftype *cft = __d_cft(file->f_path.dentry);
 	if (!cft)
 		return -ENODEV;
 
@@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 				size_t nbytes, loff_t *ppos)
 {
-	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+	struct cftype *cft = __d_cft(file->f_path.dentry);
+	struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
 	cpuset_filetype_t type = cft->private;
 	char *page;
 	ssize_t retval = 0;
@@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt
 								loff_t *ppos)
 {
 	ssize_t retval = 0;
-	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cftype *cft = __d_cft(file->f_path.dentry);
 	if (!cft)
 		return -ENODEV;
 
@@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
 	if (err)
 		return err;
 
-	cft = __d_cft(file->f_dentry);
+	cft = __d_cft(file->f_path.dentry);
 	if (!cft)
 		return -ENODEV;
 	if (cft->open)
@@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
 
 static int cpuset_file_release(struct inode *inode, struct file *file)
 {
-	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cftype *cft = __d_cft(file->f_path.dentry);
 	if (cft->release)
 		return cft->release(inode, file);
 	return 0;
@@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
  */
 static int cpuset_tasks_open(struct inode *unused, struct file *file)
 {
-	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+	struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
 	struct ctr_struct *ctr;
 	pid_t *pidarray;
 	int npids;
-- 
cgit v1.2.3


From 937949d9edbf4049bd41af6c9f92c26280584564 Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Fri, 8 Dec 2006 02:37:54 -0800
Subject: [PATCH] add process_session() helper routine

Replace occurences of task->signal->session by a new process_session() helper
routine.

It will be useful for pid namespaces to abstract the session pid number.

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c   | 19 ++++++++++---------
 kernel/fork.c   |  4 ++--
 kernel/signal.c |  2 +-
 kernel/sys.c    |  8 ++++----
 4 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index fa0495e167e9..8d289bfc13d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -193,14 +193,14 @@ int session_of_pgrp(int pgrp)
 
 	read_lock(&tasklist_lock);
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
-		if (p->signal->session > 0) {
-			sid = p->signal->session;
+		if (process_session(p) > 0) {
+			sid = process_session(p);
 			goto out;
 		}
 	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
 	p = find_task_by_pid(pgrp);
 	if (p)
-		sid = p->signal->session;
+		sid = process_session(p);
 out:
 	read_unlock(&tasklist_lock);
 	
@@ -225,8 +225,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
 				|| p->exit_state
 				|| is_init(p->real_parent))
 			continue;
-		if (process_group(p->real_parent) != pgrp
-			    && p->real_parent->signal->session == p->signal->session) {
+		if (process_group(p->real_parent) != pgrp &&
+		    process_session(p->real_parent) == process_session(p)) {
 			ret = 0;
 			break;
 		}
@@ -302,7 +302,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
 {
 	struct task_struct *curr = current->group_leader;
 
-	if (curr->signal->session != session) {
+	if (process_session(curr) != session) {
 		detach_pid(curr, PIDTYPE_SID);
 		curr->signal->session = session;
 		attach_pid(curr, PIDTYPE_SID, session);
@@ -647,10 +647,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	 * outside, so the child pgrp is now orphaned.
 	 */
 	if ((process_group(p) != process_group(father)) &&
-	    (p->signal->session == father->signal->session)) {
+	    (process_session(p) == process_session(father))) {
 		int pgrp = process_group(p);
 
-		if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
+		if (will_become_orphaned_pgrp(pgrp, NULL) &&
+		    has_stopped_jobs(pgrp)) {
 			__kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
 			__kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 		}
@@ -784,7 +785,7 @@ static void exit_notify(struct task_struct *tsk)
 	t = tsk->real_parent;
 	
 	if ((process_group(t) != process_group(tsk)) &&
-	    (t->signal->session == tsk->signal->session) &&
+	    (process_session(t) == process_session(tsk)) &&
 	    will_become_orphaned_pgrp(process_group(tsk), tsk) &&
 	    has_stopped_jobs(process_group(tsk))) {
 		__kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
diff --git a/kernel/fork.c b/kernel/fork.c
index 597707819327..298c4d6ab512 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1259,9 +1259,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		if (thread_group_leader(p)) {
 			p->signal->tty = current->signal->tty;
 			p->signal->pgrp = process_group(current);
-			p->signal->session = current->signal->session;
+			p->signal->session = process_session(current);
 			attach_pid(p, PIDTYPE_PGID, process_group(p));
-			attach_pid(p, PIDTYPE_SID, p->signal->session);
+			attach_pid(p, PIDTYPE_SID, process_session(p));
 
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
diff --git a/kernel/signal.c b/kernel/signal.c
index ec81defde339..9eac4db60eda 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -583,7 +583,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	error = -EPERM;
 	if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
 	    && ((sig != SIGCONT) ||
-		(current->signal->session != t->signal->session))
+		(process_session(current) != process_session(t)))
 	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
 	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
 	    && !capable(CAP_KILL))
diff --git a/kernel/sys.c b/kernel/sys.c
index 1ac2d1c5d84e..4f9d23a3095f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 
 	if (p->real_parent == group_leader) {
 		err = -EPERM;
-		if (p->signal->session != group_leader->signal->session)
+		if (process_session(p) != process_session(group_leader))
 			goto out;
 		err = -EACCES;
 		if (p->did_exec)
@@ -1400,7 +1400,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 		struct task_struct *p;
 
 		do_each_task_pid(pgid, PIDTYPE_PGID, p) {
-			if (p->signal->session == group_leader->signal->session)
+			if (process_session(p) == process_session(group_leader))
 				goto ok_pgid;
 		} while_each_task_pid(pgid, PIDTYPE_PGID, p);
 		goto out;
@@ -1459,7 +1459,7 @@ asmlinkage long sys_getpgrp(void)
 asmlinkage long sys_getsid(pid_t pid)
 {
 	if (!pid)
-		return current->signal->session;
+		return process_session(current);
 	else {
 		int retval;
 		struct task_struct *p;
@@ -1471,7 +1471,7 @@ asmlinkage long sys_getsid(pid_t pid)
 		if (p) {
 			retval = security_task_getsid(p);
 			if (!retval)
-				retval = p->signal->session;
+				retval = process_session(p);
 		}
 		read_unlock(&tasklist_lock);
 		return retval;
-- 
cgit v1.2.3


From 1ec320afdc9552c92191d5f89fcd1ebe588334ca Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Fri, 8 Dec 2006 02:37:55 -0800
Subject: [PATCH] add process_session() helper routine: deprecate old field

Add an anonymous union and ((deprecated)) to catch direct usage of the
session field.

[akpm@osdl.org: fix various missed conversions]
[jdike@addtoit.com: fix UML bug]
Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 2 +-
 kernel/fork.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 8d289bfc13d1..6267a6cc6113 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -304,7 +304,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
 
 	if (process_session(curr) != session) {
 		detach_pid(curr, PIDTYPE_SID);
-		curr->signal->session = session;
+		set_signal_session(curr->signal, session);
 		attach_pid(curr, PIDTYPE_SID, session);
 	}
 	if (process_group(curr) != pgrp) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 298c4d6ab512..60d2644bfe85 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1259,7 +1259,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		if (thread_group_leader(p)) {
 			p->signal->tty = current->signal->tty;
 			p->signal->pgrp = process_group(current);
-			p->signal->session = process_session(current);
+			set_signal_session(p->signal, process_session(current));
 			attach_pid(p, PIDTYPE_PGID, process_group(p));
 			attach_pid(p, PIDTYPE_SID, process_session(p));
 
-- 
cgit v1.2.3


From 6b3286ed1169d74fea401367d6d4d6c6ec758a81 Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@sw.ru>
Date: Fri, 8 Dec 2006 02:37:56 -0800
Subject: [PATCH] rename struct namespace to struct mnt_namespace

Rename 'struct namespace' to 'struct mnt_namespace' to avoid confusion with
other namespaces being developped for the containers : pid, uts, ipc, etc.
'namespace' variables and attributes are also renamed to 'mnt_ns'

Signed-off-by: Kirill Korotaev <dev@sw.ru>
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c    |  2 +-
 kernel/fork.c    | 21 +++++++++++----------
 kernel/kmod.c    |  2 +-
 kernel/nsproxy.c | 16 ++++++++--------
 4 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 6267a6cc6113..28d9feedfd27 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,7 +13,7 @@
 #include <linux/completion.h>
 #include <linux/personality.h>
 #include <linux/tty.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
 #include <linux/key.h>
 #include <linux/security.h>
 #include <linux/cpu.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 60d2644bfe85..8c859eef8e6a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,7 +18,7 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/completion.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
 #include <linux/personality.h>
 #include <linux/mempolicy.h>
 #include <linux/sem.h>
@@ -1525,17 +1525,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }
 
 /*
- * Unshare the namespace structure if it is being shared
+ * Unshare the mnt_namespace structure if it is being shared
  */
-static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
+static int unshare_mnt_namespace(unsigned long unshare_flags,
+		struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
 {
-	struct namespace *ns = current->nsproxy->namespace;
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 
 	if ((unshare_flags & CLONE_NEWNS) && ns) {
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
-		*new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
+		*new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
 		if (!*new_nsp)
 			return -ENOMEM;
 	}
@@ -1623,7 +1624,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 {
 	int err = 0;
 	struct fs_struct *fs, *new_fs = NULL;
-	struct namespace *ns, *new_ns = NULL;
+	struct mnt_namespace *ns, *new_ns = NULL;
 	struct sighand_struct *new_sigh = NULL;
 	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
 	struct files_struct *fd, *new_fd = NULL;
@@ -1645,7 +1646,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
 		goto bad_unshare_cleanup_thread;
-	if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
+	if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
 		goto bad_unshare_cleanup_fs;
 	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
 		goto bad_unshare_cleanup_ns;
@@ -1686,8 +1687,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		}
 
 		if (new_ns) {
-			ns = current->nsproxy->namespace;
-			current->nsproxy->namespace = new_ns;
+			ns = current->nsproxy->mnt_ns;
+			current->nsproxy->mnt_ns = new_ns;
 			new_ns = ns;
 		}
 
@@ -1748,7 +1749,7 @@ bad_unshare_cleanup_sigh:
 
 bad_unshare_cleanup_ns:
 	if (new_ns)
-		put_namespace(new_ns);
+		put_mnt_ns(new_ns);
 
 bad_unshare_cleanup_fs:
 	if (new_fs)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8d2bea09a4ec..3a7379aa31ca 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,7 +25,7 @@
 #include <linux/kmod.h>
 #include <linux/smp_lock.h>
 #include <linux/slab.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
 #include <linux/completion.h>
 #include <linux/file.h>
 #include <linux/workqueue.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 674aceb7335a..bd9cb435dfe0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -17,7 +17,7 @@
 #include <linux/version.h>
 #include <linux/nsproxy.h>
 #include <linux/init_task.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
 #include <linux/utsname.h>
 
 struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
@@ -60,8 +60,8 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
 	struct nsproxy *ns = clone_namespaces(orig);
 
 	if (ns) {
-		if (ns->namespace)
-			get_namespace(ns->namespace);
+		if (ns->mnt_ns)
+			get_mnt_ns(ns->mnt_ns);
 		if (ns->uts_ns)
 			get_uts_ns(ns->uts_ns);
 		if (ns->ipc_ns)
@@ -97,7 +97,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
 
 	tsk->nsproxy = new_ns;
 
-	err = copy_namespace(flags, tsk);
+	err = copy_mnt_ns(flags, tsk);
 	if (err)
 		goto out_ns;
 
@@ -117,8 +117,8 @@ out_ipc:
 	if (new_ns->uts_ns)
 		put_uts_ns(new_ns->uts_ns);
 out_uts:
-	if (new_ns->namespace)
-		put_namespace(new_ns->namespace);
+	if (new_ns->mnt_ns)
+		put_mnt_ns(new_ns->mnt_ns);
 out_ns:
 	tsk->nsproxy = old_ns;
 	kfree(new_ns);
@@ -127,8 +127,8 @@ out_ns:
 
 void free_nsproxy(struct nsproxy *ns)
 {
-		if (ns->namespace)
-			put_namespace(ns->namespace);
+		if (ns->mnt_ns)
+			put_mnt_ns(ns->mnt_ns);
 		if (ns->uts_ns)
 			put_uts_ns(ns->uts_ns);
 		if (ns->ipc_ns)
-- 
cgit v1.2.3


From 373beb35cd6b625e0ba4ad98baace12310a26aa8 Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Fri, 8 Dec 2006 02:37:57 -0800
Subject: [PATCH] identifier to nsproxy

Add an identifier to nsproxy.  The default init_ns_proxy has identifier 0 and
allocated nsproxies are given -1.

This identifier will be used by a new syscall sys_bind_ns.

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/nsproxy.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index bd9cb435dfe0..f223c15c18e9 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -45,8 +45,10 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
 	struct nsproxy *ns;
 
 	ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
-	if (ns)
+	if (ns) {
 		atomic_set(&ns->count, 1);
+		ns->id = -1;
+	}
 	return ns;
 }
 
-- 
cgit v1.2.3


From 61a58c6c238cc81f7742b8cc84212cc55fb57747 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Date: Fri, 8 Dec 2006 02:37:58 -0800
Subject: [PATCH] rename struct pspace to struct pid_namespace

Rename struct pspace to struct pid_namespace for consistency with other
namespaces (uts_namespace and ipc_namespace).  Also rename
include/linux/pspace.h to include/linux/pid_namespace.h and variables from
pspace to pid_ns.

Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 49 +++++++++++++++++++++++++------------------------
 1 file changed, 25 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index a48879b0b921..25807e1b98dd 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,7 +26,7 @@
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/hash.h>
-#include <linux/pspace.h>
+#include <linux/pid_namespace.h>
 
 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
 static struct hlist_head *pid_hash;
@@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT;
 #define BITS_PER_PAGE		(PAGE_SIZE*8)
 #define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
 
-static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
+static inline int mk_pid(struct pid_namespace *pid_ns,
+		struct pidmap *map, int off)
 {
-	return (map - pspace->pidmap)*BITS_PER_PAGE + off;
+	return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
 }
 
 #define find_next_offset(map, off)					\
@@ -57,7 +58,7 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
  * value does not cause lots of bitmaps to be allocated, but
  * the scheme scales to up to 4 million PIDs, runtime.
  */
-struct pspace init_pspace = {
+struct pid_namespace init_pid_ns = {
 	.pidmap = {
 		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
 	},
@@ -80,25 +81,25 @@ struct pspace init_pspace = {
 
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
-static fastcall void free_pidmap(struct pspace *pspace, int pid)
+static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
 {
-	struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE;
+	struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
 	int offset = pid & BITS_PER_PAGE_MASK;
 
 	clear_bit(offset, map->page);
 	atomic_inc(&map->nr_free);
 }
 
-static int alloc_pidmap(struct pspace *pspace)
+static int alloc_pidmap(struct pid_namespace *pid_ns)
 {
-	int i, offset, max_scan, pid, last = pspace->last_pid;
+	int i, offset, max_scan, pid, last = pid_ns->last_pid;
 	struct pidmap *map;
 
 	pid = last + 1;
 	if (pid >= pid_max)
 		pid = RESERVED_PIDS;
 	offset = pid & BITS_PER_PAGE_MASK;
-	map = &pspace->pidmap[pid/BITS_PER_PAGE];
+	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
 	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
 	for (i = 0; i <= max_scan; ++i) {
 		if (unlikely(!map->page)) {
@@ -120,11 +121,11 @@ static int alloc_pidmap(struct pspace *pspace)
 			do {
 				if (!test_and_set_bit(offset, map->page)) {
 					atomic_dec(&map->nr_free);
-					pspace->last_pid = pid;
+					pid_ns->last_pid = pid;
 					return pid;
 				}
 				offset = find_next_offset(map, offset);
-				pid = mk_pid(pspace, map, offset);
+				pid = mk_pid(pid_ns, map, offset);
 			/*
 			 * find_next_offset() found a bit, the pid from it
 			 * is in-bounds, and if we fell back to the last
@@ -135,34 +136,34 @@ static int alloc_pidmap(struct pspace *pspace)
 					(i != max_scan || pid < last ||
 					    !((last+1) & BITS_PER_PAGE_MASK)));
 		}
-		if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
 			++map;
 			offset = 0;
 		} else {
-			map = &pspace->pidmap[0];
+			map = &pid_ns->pidmap[0];
 			offset = RESERVED_PIDS;
 			if (unlikely(last == offset))
 				break;
 		}
-		pid = mk_pid(pspace, map, offset);
+		pid = mk_pid(pid_ns, map, offset);
 	}
 	return -1;
 }
 
-static int next_pidmap(struct pspace *pspace, int last)
+static int next_pidmap(struct pid_namespace *pid_ns, int last)
 {
 	int offset;
 	struct pidmap *map, *end;
 
 	offset = (last + 1) & BITS_PER_PAGE_MASK;
-	map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE];
-	end = &pspace->pidmap[PIDMAP_ENTRIES];
+	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
+	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
 	for (; map < end; map++, offset = 0) {
 		if (unlikely(!map->page))
 			continue;
 		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
 		if (offset < BITS_PER_PAGE)
-			return mk_pid(pspace, map, offset);
+			return mk_pid(pid_ns, map, offset);
 	}
 	return -1;
 }
@@ -192,7 +193,7 @@ fastcall void free_pid(struct pid *pid)
 	hlist_del_rcu(&pid->pid_chain);
 	spin_unlock_irqrestore(&pidmap_lock, flags);
 
-	free_pidmap(&init_pspace, pid->nr);
+	free_pidmap(&init_pid_ns, pid->nr);
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
@@ -206,7 +207,7 @@ struct pid *alloc_pid(void)
 	if (!pid)
 		goto out;
 
-	nr = alloc_pidmap(&init_pspace);
+	nr = alloc_pidmap(&init_pid_ns);
 	if (nr < 0)
 		goto out_free;
 
@@ -348,7 +349,7 @@ struct pid *find_ge_pid(int nr)
 		pid = find_pid(nr);
 		if (pid)
 			break;
-		nr = next_pidmap(&init_pspace, nr);
+		nr = next_pidmap(&init_pid_ns, nr);
 	} while (nr > 0);
 
 	return pid;
@@ -382,10 +383,10 @@ void __init pidhash_init(void)
 
 void __init pidmap_init(void)
 {
-	init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 	/* Reserve PID 0. We never call free_pidmap(0) */
-	set_bit(0, init_pspace.pidmap[0].page);
-	atomic_dec(&init_pspace.pidmap[0].nr_free);
+	set_bit(0, init_pid_ns.pidmap[0].page);
+	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
 
 	pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
 					__alignof__(struct pid),
-- 
cgit v1.2.3


From 9a575a92db3312a40cdf0b0406d88de88ad9741e Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Fri, 8 Dec 2006 02:37:59 -0800
Subject: [PATCH] to nsproxy

Add the pid namespace framework to the nsproxy object.  The copy of the pid
namespace only increases the refcount on the global pid namespace,
init_pid_ns, and unshare is not implemented.

There is no configuration option to activate or deactivate this feature
because this not relevant for the moment.

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/nsproxy.c | 26 +++++++++++++++++++-------
 kernel/pid.c     | 23 +++++++++++++++++++++++
 2 files changed, 42 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f223c15c18e9..e2ce748e96af 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -19,6 +19,7 @@
 #include <linux/init_task.h>
 #include <linux/mnt_namespace.h>
 #include <linux/utsname.h>
+#include <linux/pid_namespace.h>
 
 struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
 
@@ -68,6 +69,8 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
 			get_uts_ns(ns->uts_ns);
 		if (ns->ipc_ns)
 			get_ipc_ns(ns->ipc_ns);
+		if (ns->pid_ns)
+			get_pid_ns(ns->pid_ns);
 	}
 
 	return ns;
@@ -111,10 +114,17 @@ int copy_namespaces(int flags, struct task_struct *tsk)
 	if (err)
 		goto out_ipc;
 
+	err = copy_pid_ns(flags, tsk);
+	if (err)
+		goto out_pid;
+
 out:
 	put_nsproxy(old_ns);
 	return err;
 
+out_pid:
+	if (new_ns->ipc_ns)
+		put_ipc_ns(new_ns->ipc_ns);
 out_ipc:
 	if (new_ns->uts_ns)
 		put_uts_ns(new_ns->uts_ns);
@@ -129,11 +139,13 @@ out_ns:
 
 void free_nsproxy(struct nsproxy *ns)
 {
-		if (ns->mnt_ns)
-			put_mnt_ns(ns->mnt_ns);
-		if (ns->uts_ns)
-			put_uts_ns(ns->uts_ns);
-		if (ns->ipc_ns)
-			put_ipc_ns(ns->ipc_ns);
-		kfree(ns);
+	if (ns->mnt_ns)
+		put_mnt_ns(ns->mnt_ns);
+	if (ns->uts_ns)
+		put_uts_ns(ns->uts_ns);
+	if (ns->ipc_ns)
+		put_ipc_ns(ns->ipc_ns);
+	if (ns->pid_ns)
+		put_pid_ns(ns->pid_ns);
+	kfree(ns);
 }
diff --git a/kernel/pid.c b/kernel/pid.c
index 25807e1b98dd..5319b9f2fc5e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -59,6 +59,9 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
  * the scheme scales to up to 4 million PIDs, runtime.
  */
 struct pid_namespace init_pid_ns = {
+	.kref = {
+		.refcount       = ATOMIC_INIT(2),
+	},
 	.pidmap = {
 		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
 	},
@@ -356,6 +359,26 @@ struct pid *find_ge_pid(int nr)
 }
 EXPORT_SYMBOL_GPL(find_get_pid);
 
+int copy_pid_ns(int flags, struct task_struct *tsk)
+{
+	struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
+	int err = 0;
+
+	if (!old_ns)
+		return 0;
+
+	get_pid_ns(old_ns);
+	return err;
+}
+
+void free_pid_ns(struct kref *kref)
+{
+	struct pid_namespace *ns;
+
+	ns = container_of(kref, struct pid_namespace, kref);
+	kfree(ns);
+}
+
 /*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
-- 
cgit v1.2.3


From 6cc1b22a4acef3816eaa5f8c227d93d749b23195 Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Fri, 8 Dec 2006 02:38:00 -0800
Subject: [PATCH] use current->nsproxy->pid_ns

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 5319b9f2fc5e..1d9cc268b499 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -196,7 +196,7 @@ fastcall void free_pid(struct pid *pid)
 	hlist_del_rcu(&pid->pid_chain);
 	spin_unlock_irqrestore(&pidmap_lock, flags);
 
-	free_pidmap(&init_pid_ns, pid->nr);
+	free_pidmap(current->nsproxy->pid_ns, pid->nr);
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
@@ -210,7 +210,7 @@ struct pid *alloc_pid(void)
 	if (!pid)
 		goto out;
 
-	nr = alloc_pidmap(&init_pid_ns);
+	nr = alloc_pidmap(current->nsproxy->pid_ns);
 	if (nr < 0)
 		goto out_free;
 
@@ -352,7 +352,7 @@ struct pid *find_ge_pid(int nr)
 		pid = find_pid(nr);
 		if (pid)
 			break;
-		nr = next_pidmap(&init_pid_ns, nr);
+		nr = next_pidmap(current->nsproxy->pid_ns, nr);
 	} while (nr > 0);
 
 	return pid;
-- 
cgit v1.2.3


From 84d737866e2babdeab0c6b18ea155c6a649663b8 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Date: Fri, 8 Dec 2006 02:38:01 -0800
Subject: [PATCH] add child reaper to pid_namespace

Add a per pid_namespace child-reaper.  This is needed so processes are reaped
within the same pid space and do not spill over to the parent pid space.  Its
also needed so containers preserve existing semantic that pid == 1 would reap
orphaned children.

This is based on Eric Biederman's patch: http://lkml.org/lkml/2006/2/6/285

Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c   | 23 +++++++++++++++--------
 kernel/pid.c    |  3 ++-
 kernel/signal.c | 11 +++++++++--
 3 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 28d9feedfd27..fd0e067952ab 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -22,6 +22,7 @@
 #include <linux/file.h>
 #include <linux/binfmts.h>
 #include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/profile.h>
 #include <linux/mount.h>
@@ -48,7 +49,6 @@
 #include <asm/mmu_context.h>
 
 extern void sem_exit (void);
-extern struct task_struct *child_reaper;
 
 static void exit_mm(struct task_struct * tsk);
 
@@ -260,7 +260,8 @@ static int has_stopped_jobs(int pgrp)
 }
 
 /**
- * reparent_to_init - Reparent the calling kernel thread to the init task.
+ * reparent_to_init - Reparent the calling kernel thread to the init task
+ * of the pid space that the thread belongs to.
  *
  * If a kernel thread is launched as a result of a system call, or if
  * it ever exits, it should generally reparent itself to init so that
@@ -278,8 +279,8 @@ static void reparent_to_init(void)
 	ptrace_unlink(current);
 	/* Reparent to init */
 	remove_parent(current);
-	current->parent = child_reaper;
-	current->real_parent = child_reaper;
+	current->parent = child_reaper(current);
+	current->real_parent = child_reaper(current);
 	add_parent(current);
 
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -662,7 +663,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
  * When we die, we re-parent all our children.
  * Try to give them to another thread in our thread
  * group, and if no such member exists, give it to
- * the global child reaper process (ie "init")
+ * the child reaper process (ie "init") in our pid
+ * space.
  */
 static void
 forget_original_parent(struct task_struct *father, struct list_head *to_release)
@@ -673,7 +675,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
 	do {
 		reaper = next_thread(reaper);
 		if (reaper == father) {
-			reaper = child_reaper;
+			reaper = child_reaper(father);
 			break;
 		}
 	} while (reaper->exit_state);
@@ -859,8 +861,13 @@ fastcall NORET_TYPE void do_exit(long code)
 		panic("Aiee, killing interrupt handler!");
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
-	if (unlikely(tsk == child_reaper))
-		panic("Attempted to kill init!");
+	if (unlikely(tsk == child_reaper(tsk))) {
+		if (tsk->nsproxy->pid_ns != &init_pid_ns)
+			tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper;
+		else
+			panic("Attempted to kill init!");
+	}
+
 
 	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
 		current->ptrace_message = code;
diff --git a/kernel/pid.c b/kernel/pid.c
index 1d9cc268b499..2efe9d8d367b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -65,7 +65,8 @@ struct pid_namespace init_pid_ns = {
 	.pidmap = {
 		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
 	},
-	.last_pid = 0
+	.last_pid = 0,
+	.child_reaper = &init_task
 };
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index 9eac4db60eda..1921ffdc5e77 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -24,6 +24,9 @@
 #include <linux/signal.h>
 #include <linux/capability.h>
 #include <linux/freezer.h>
+#include <linux/pid_namespace.h>
+#include <linux/nsproxy.h>
+
 #include <asm/param.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1877,8 +1880,12 @@ relock:
 		if (sig_kernel_ignore(signr)) /* Default is nothing. */
 			continue;
 
-		/* Init gets no signals it doesn't want.  */
-		if (current == child_reaper)
+		/*
+		 * Init of a pid space gets no signals it doesn't want from
+		 * within that pid space. It can of course get signals from
+		 * its parent pid space.
+		 */
+		if (current == child_reaper(current))
 			continue;
 
 		if (sig_kernel_stop(signr)) {
-- 
cgit v1.2.3


From f020bc468fe4a91d32046d448511978c7b611315 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 8 Dec 2006 02:38:02 -0800
Subject: [PATCH] sys_setpgid: eliminate unnecessary
 do_each_task_pid(PIDTYPE_PGID)

All tasks in the process group have the same sid, we don't need to iterate
them all to check that the caller of sys_setpgid() doesn't change its
session.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 4f9d23a3095f..c7675c1bfdf2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 		goto out;
 
 	if (pgid != pid) {
-		struct task_struct *p;
+		struct task_struct *g =
+			find_task_by_pid_type(PIDTYPE_PGID, pgid);
 
-		do_each_task_pid(pgid, PIDTYPE_PGID, p) {
-			if (process_session(p) == process_session(group_leader))
-				goto ok_pgid;
-		} while_each_task_pid(pgid, PIDTYPE_PGID, p);
-		goto out;
+		if (!g || process_session(g) != process_session(group_leader))
+			goto out;
 	}
 
-ok_pgid:
 	err = security_task_setpgid(p, pgid);
 	if (err)
 		goto out;
-- 
cgit v1.2.3


From 62dfb5541a025b47df9405ff0219c7829a97d83b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 8 Dec 2006 02:38:03 -0800
Subject: [PATCH] session_of_pgrp: kill unnecessary
 do_each_task_pid(PIDTYPE_PGID)

All members of the process group have the same sid and it can't be == 0.

NOTE: this code (and a similar one in sys_setpgid) was needed because it
was possibe to have ->session == 0. It's not possible any longer since

	[PATCH] pidhash: don't use zero pids
	Commit: c7c6464117a02b0d54feb4ebeca4db70fa493678

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index fd0e067952ab..03e64fe4a14a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -189,21 +189,18 @@ repeat:
 int session_of_pgrp(int pgrp)
 {
 	struct task_struct *p;
-	int sid = -1;
+	int sid = 0;
 
 	read_lock(&tasklist_lock);
-	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
-		if (process_session(p) > 0) {
-			sid = process_session(p);
-			goto out;
-		}
-	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
-	p = find_task_by_pid(pgrp);
-	if (p)
+
+	p = find_task_by_pid_type(PIDTYPE_PGID, pgrp);
+	if (p == NULL)
+		p = find_task_by_pid(pgrp);
+	if (p != NULL)
 		sid = process_session(p);
-out:
+
 	read_unlock(&tasklist_lock);
-	
+
 	return sid;
 }
 
-- 
cgit v1.2.3


From cf9f151c7257683f489df85f94baf408d1d5694a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 Dec 2006 02:39:55 -0800
Subject: [PATCH] sysctl: simplify sysctl_uts_string

The binary interface to the namespace sysctls was never implemented resulting
in some really weird things if you attempted to use sys_sysctl to read your
hostname for example.

This patch series simples the code a little and implements the binary sysctl
interface.

In testing this patch series I discovered that our 32bit compatibility for the
binary sysctl interface is imperfect.  In particular KERN_SHMMAX and
KERN_SMMALL are size_t sized quantities and are returned as 8 bytes on to
32bit binaries using a x86_64 kernel.  However this has existing for a long
time so it is not a new regression with the namespace work.

Gads the whole sysctl thing needs work before it stops being easy to shoot
yourself in the foot.

Looking forward a little bit we need a better way to handle sysctls and
namespaces as our current technique will not work for the network namespace.
I think something based on the current overlapping sysctl trees will work but
the proc side needs to be redone before we can use it.

This patch:

Introduce get_uts() and put_uts() (used later) and remove most of the special
cases for when UTS namespace is compiled in.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 128 ++++++++++++--------------------------------------------
 1 file changed, 26 insertions(+), 102 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9846db93a595..c8bcbfb2e069 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -163,6 +163,28 @@ extern ctl_table inotify_table[];
 int sysctl_legacy_va_layout;
 #endif
 
+static void *get_uts(ctl_table *table, int write)
+{
+	char *which = table->data;
+#ifdef CONFIG_UTS_NS
+	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+	which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
+#endif
+	if (!write)
+		down_read(&uts_sem);
+	else
+		down_write(&uts_sem);
+	return which;
+}
+
+static void put_uts(ctl_table *table, int write, void *which)
+{
+	if (!write)
+		up_read(&uts_sem);
+	else
+		up_write(&uts_sem);
+}
+
 /* /proc declarations: */
 
 #ifdef CONFIG_PROC_SYSCTL
@@ -229,7 +251,6 @@ static ctl_table root_table[] = {
 };
 
 static ctl_table kern_table[] = {
-#ifndef CONFIG_UTS_NS
 	{
 		.ctl_name	= KERN_OSTYPE,
 		.procname	= "ostype",
@@ -275,54 +296,6 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_do_uts_string,
 		.strategy	= &sysctl_string,
 	},
-#else  /* !CONFIG_UTS_NS */
-	{
-		.ctl_name	= KERN_OSTYPE,
-		.procname	= "ostype",
-		.data		= NULL,
-		/* could maybe use __NEW_UTS_LEN here? */
-		.maxlen		= FIELD_SIZEOF(struct new_utsname, sysname),
-		.mode		= 0444,
-		.proc_handler	= &proc_do_uts_string,
-		.strategy	= &sysctl_string,
-	},
-	{
-		.ctl_name	= KERN_OSRELEASE,
-		.procname	= "osrelease",
-		.data		= NULL,
-		.maxlen		= FIELD_SIZEOF(struct new_utsname, release),
-		.mode		= 0444,
-		.proc_handler	= &proc_do_uts_string,
-		.strategy	= &sysctl_string,
-	},
-	{
-		.ctl_name	= KERN_VERSION,
-		.procname	= "version",
-		.data		= NULL,
-		.maxlen		= FIELD_SIZEOF(struct new_utsname, version),
-		.mode		= 0444,
-		.proc_handler	= &proc_do_uts_string,
-		.strategy	= &sysctl_string,
-	},
-	{
-		.ctl_name	= KERN_NODENAME,
-		.procname	= "hostname",
-		.data		= NULL,
-		.maxlen		= FIELD_SIZEOF(struct new_utsname, nodename),
-		.mode		= 0644,
-		.proc_handler	= &proc_do_uts_string,
-		.strategy	= &sysctl_string,
-	},
-	{
-		.ctl_name	= KERN_DOMAINNAME,
-		.procname	= "domainname",
-		.data		= NULL,
-		.maxlen		= FIELD_SIZEOF(struct new_utsname, domainname),
-		.mode		= 0644,
-		.proc_handler	= &proc_do_uts_string,
-		.strategy	= &sysctl_string,
-	},
-#endif /* !CONFIG_UTS_NS */
 	{
 		.ctl_name	= KERN_PANIC,
 		.procname	= "panic",
@@ -1753,66 +1726,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
  *	Special case of dostring for the UTS structure. This has locks
  *	to observe. Should this be in kernel/sys.c ????
  */
- 
-#ifndef CONFIG_UTS_NS
-static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
-		  void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int r;
 
-	if (!write) {
-		down_read(&uts_sem);
-		r=proc_dostring(table,0,filp,buffer,lenp, ppos);
-		up_read(&uts_sem);
-	} else {
-		down_write(&uts_sem);
-		r=proc_dostring(table,1,filp,buffer,lenp, ppos);
-		up_write(&uts_sem);
-	}
-	return r;
-}
-#else /* !CONFIG_UTS_NS */
 static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;
-	struct uts_namespace* uts_ns = current->nsproxy->uts_ns;
-	char* which;
-
-	switch (table->ctl_name) {
-	case KERN_OSTYPE:
-		which = uts_ns->name.sysname;
-		break;
-	case KERN_NODENAME:
-		which = uts_ns->name.nodename;
-		break;
-	case KERN_OSRELEASE:
-		which = uts_ns->name.release;
-		break;
-	case KERN_VERSION:
-		which = uts_ns->name.version;
-		break;
-	case KERN_DOMAINNAME:
-		which = uts_ns->name.domainname;
-		break;
-	default:
-		r = -EINVAL;
-		goto out;
-	}
-
-	if (!write) {
-		down_read(&uts_sem);
-		r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
-		up_read(&uts_sem);
-	} else {
-		down_write(&uts_sem);
-		r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
-		up_write(&uts_sem);
-	}
- out:
+	void *which;
+	which = get_uts(table, write);
+	r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos);
+	put_uts(table, write, which);
 	return r;
 }
-#endif /* !CONFIG_UTS_NS */
 
 static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
 				 int *valp,
-- 
cgit v1.2.3


From c4b8b769fa9051838d2772886ecd0ee2a926ddc3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 Dec 2006 02:39:55 -0800
Subject: [PATCH] sysctl: implement sysctl_uts_string()

The problem: When using sys_sysctl we don't read the proper values for the
variables exported from the uts namespace, nor do we do the proper locking.

This patch introduces sysctl_uts_string which properly fetches the values and
does the proper locking.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8bcbfb2e069..77ea4fc386ef 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -137,6 +137,10 @@ static int parse_table(int __user *, int, void __user *, size_t __user *,
 static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+		  void __user *oldval, size_t __user *oldlenp,
+		  void __user *newval, size_t newlen, void **context);
+
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -258,7 +262,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.sysname),
 		.mode		= 0444,
 		.proc_handler	= &proc_do_uts_string,
-		.strategy	= &sysctl_string,
+		.strategy	= &sysctl_uts_string,
 	},
 	{
 		.ctl_name	= KERN_OSRELEASE,
@@ -267,7 +271,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.release),
 		.mode		= 0444,
 		.proc_handler	= &proc_do_uts_string,
-		.strategy	= &sysctl_string,
+		.strategy	= &sysctl_uts_string,
 	},
 	{
 		.ctl_name	= KERN_VERSION,
@@ -276,7 +280,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.version),
 		.mode		= 0444,
 		.proc_handler	= &proc_do_uts_string,
-		.strategy	= &sysctl_string,
+		.strategy	= &sysctl_uts_string,
 	},
 	{
 		.ctl_name	= KERN_NODENAME,
@@ -285,7 +289,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.nodename),
 		.mode		= 0644,
 		.proc_handler	= &proc_do_uts_string,
-		.strategy	= &sysctl_string,
+		.strategy	= &sysctl_uts_string,
 	},
 	{
 		.ctl_name	= KERN_DOMAINNAME,
@@ -294,7 +298,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.domainname),
 		.mode		= 0644,
 		.proc_handler	= &proc_do_uts_string,
-		.strategy	= &sysctl_string,
+		.strategy	= &sysctl_uts_string,
 	},
 	{
 		.ctl_name	= KERN_PANIC,
@@ -2598,6 +2602,23 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 	return 1;
 }
 
+
+/* The generic string strategy routine: */
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+		  void __user *oldval, size_t __user *oldlenp,
+		  void __user *newval, size_t newlen, void **context)
+{
+	struct ctl_table uts_table;
+	int r, write;
+	write = newval && newlen;
+	memcpy(&uts_table, table, sizeof(uts_table));
+	uts_table.data = get_uts(table, write);
+	r = sysctl_string(&uts_table, name, nlen,
+		oldval, oldlenp, newval, newlen, context);
+	put_uts(table, write, uts_table.data);
+	return r;
+}
+
 #else /* CONFIG_SYSCTL_SYSCALL */
 
 
@@ -2662,6 +2683,12 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 	return -ENOSYS;
 }
 
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+		  void __user *oldval, size_t __user *oldlenp,
+		  void __user *newval, size_t newlen, void **context)
+{
+	return -ENOSYS;
+}
 #endif /* CONFIG_SYSCTL_SYSCALL */
 
 /*
-- 
cgit v1.2.3


From 9bc9a6bd3cf559bffe962c51efb062e8b5270ca9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 Dec 2006 02:39:56 -0800
Subject: [PATCH] sysctl: simplify ipc ns specific sysctls

Refactor the ipc sysctl support so that it is simpler, more readable, and
prepares for fixing the bug with the wrong values being returned in the
sys_sysctl interface.

The function proc_do_ipc_string() was misnamed as it never handled strings.
It's magic of when to work with strings and when to work with longs belonged
in the sysctl table.  I couldn't tell if the code would work if you disabled
the ipc namespace but it certainly looked like it would have problems.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 106 ++++++++++++++++++++++++++------------------------------
 1 file changed, 49 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 77ea4fc386ef..10474a63a111 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -92,7 +92,9 @@ extern char modprobe_path[];
 extern int sg_big_buff;
 #endif
 #ifdef CONFIG_SYSVIPC
-static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
+static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
 		void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
@@ -189,6 +191,18 @@ static void put_uts(ctl_table *table, int write, void *which)
 		up_write(&uts_sem);
 }
 
+#ifdef CONFIG_SYSVIPC
+static void *get_ipc(ctl_table *table, int write)
+{
+	char *which = table->data;
+	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
+	return which;
+}
+#else
+#define get_ipc(T,W) ((T)->data)
+#endif
+
 /* /proc declarations: */
 
 #ifdef CONFIG_PROC_SYSCTL
@@ -458,58 +472,58 @@ static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_SHMMAX,
 		.procname	= "shmmax",
-		.data		= NULL,
-		.maxlen		= sizeof (size_t),
+		.data		= &init_ipc_ns.shm_ctlmax,
+		.maxlen		= sizeof (init_ipc_ns.shm_ctlmax),
 		.mode		= 0644,
-		.proc_handler	= &proc_do_ipc_string,
+		.proc_handler	= &proc_ipc_doulongvec_minmax,
 	},
 	{
 		.ctl_name	= KERN_SHMALL,
 		.procname	= "shmall",
-		.data		= NULL,
-		.maxlen		= sizeof (size_t),
+		.data		= &init_ipc_ns.shm_ctlall,
+		.maxlen		= sizeof (init_ipc_ns.shm_ctlall),
 		.mode		= 0644,
-		.proc_handler	= &proc_do_ipc_string,
+		.proc_handler	= &proc_ipc_doulongvec_minmax,
 	},
 	{
 		.ctl_name	= KERN_SHMMNI,
 		.procname	= "shmmni",
-		.data		= NULL,
-		.maxlen		= sizeof (int),
+		.data		= &init_ipc_ns.shm_ctlmni,
+		.maxlen		= sizeof (init_ipc_ns.shm_ctlmni),
 		.mode		= 0644,
-		.proc_handler	= &proc_do_ipc_string,
+		.proc_handler	= &proc_ipc_dointvec,
 	},
 	{
 		.ctl_name	= KERN_MSGMAX,
 		.procname	= "msgmax",
-		.data		= NULL,
-		.maxlen		= sizeof (int),
+		.data		= &init_ipc_ns.msg_ctlmax,
+		.maxlen		= sizeof (init_ipc_ns.msg_ctlmax),
 		.mode		= 0644,
-		.proc_handler	= &proc_do_ipc_string,
+		.proc_handler	= &proc_ipc_dointvec,
 	},
 	{
 		.ctl_name	= KERN_MSGMNI,
 		.procname	= "msgmni",
-		.data		= NULL,
-		.maxlen		= sizeof (int),
+		.data		= &init_ipc_ns.msg_ctlmni,
+		.maxlen		= sizeof (init_ipc_ns.msg_ctlmni),
 		.mode		= 0644,
-		.proc_handler	= &proc_do_ipc_string,
+		.proc_handler	= &proc_ipc_dointvec,
 	},
 	{
 		.ctl_name	= KERN_MSGMNB,
 		.procname	=  "msgmnb",
-		.data		= NULL,
-		.maxlen		= sizeof (int),
+		.data		= &init_ipc_ns.msg_ctlmnb,
+		.maxlen		= sizeof (init_ipc_ns.msg_ctlmnb),
 		.mode		= 0644,
-		.proc_handler	= &proc_do_ipc_string,
+		.proc_handler	= &proc_ipc_dointvec,
 	},
 	{
 		.ctl_name	= KERN_SEM,
 		.procname	= "sem",
-		.data		= NULL,
+		.data		= &init_ipc_ns.sem_ctls,
 		.maxlen		= 4*sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_do_ipc_string,
+		.proc_handler	= &proc_ipc_dointvec,
 	},
 #endif
 #ifdef CONFIG_MAGIC_SYSRQ
@@ -2319,46 +2333,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
 }
 
 #ifdef CONFIG_SYSVIPC
-static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
-		void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	void *data;
-	struct ipc_namespace *ns;
-
-	ns = current->nsproxy->ipc_ns;
-
-	switch (table->ctl_name) {
-	case KERN_SHMMAX:
-		data = &ns->shm_ctlmax;
-		goto proc_minmax;
-	case KERN_SHMALL:
-		data = &ns->shm_ctlall;
-		goto proc_minmax;
-	case KERN_SHMMNI:
-		data = &ns->shm_ctlmni;
-		break;
-	case KERN_MSGMAX:
-		data = &ns->msg_ctlmax;
-		break;
-	case KERN_MSGMNI:
-		data = &ns->msg_ctlmni;
-		break;
-	case KERN_MSGMNB:
-		data = &ns->msg_ctlmnb;
-		break;
-	case KERN_SEM:
-		data = &ns->sem_ctls;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return __do_proc_dointvec(data, table, write, filp, buffer,
+	void *which;
+	which = get_ipc(table, write);
+	return __do_proc_dointvec(which, table, write, filp, buffer,
 			lenp, ppos, NULL, NULL);
-proc_minmax:
-	return __do_proc_doulongvec_minmax(data, table, write, filp, buffer,
+}
+
+static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
+	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	void *which;
+	which = get_ipc(table, write);
+	return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
 			lenp, ppos, 1l, 1l);
 }
+
 #endif
 
 static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
-- 
cgit v1.2.3


From 6b49a257850fb8ad91f4c76bb712e9213141a34a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 Dec 2006 02:39:57 -0800
Subject: [PATCH] sysctl: fix sys_sysctl interface of ipc sysctls

Currently there is a regression and the ipc sysctls don't show up in the
binary sysctl namespace.

This patch adds sysctl_ipc_data to read data/write from the appropriate
namespace and deliver it in the expected manner.

[akpm@osdl.org: warning fix]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 10474a63a111..025fcb3c66f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -143,6 +143,12 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
 		  void __user *newval, size_t newlen, void **context);
 
+#ifdef CONFIG_SYSVIPC
+static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
+		  void __user *oldval, size_t __user *oldlenp,
+		  void __user *newval, size_t newlen, void **context);
+#endif
+
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -476,6 +482,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof (init_ipc_ns.shm_ctlmax),
 		.mode		= 0644,
 		.proc_handler	= &proc_ipc_doulongvec_minmax,
+		.strategy	= sysctl_ipc_data,
 	},
 	{
 		.ctl_name	= KERN_SHMALL,
@@ -484,6 +491,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof (init_ipc_ns.shm_ctlall),
 		.mode		= 0644,
 		.proc_handler	= &proc_ipc_doulongvec_minmax,
+		.strategy	= sysctl_ipc_data,
 	},
 	{
 		.ctl_name	= KERN_SHMMNI,
@@ -492,6 +500,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof (init_ipc_ns.shm_ctlmni),
 		.mode		= 0644,
 		.proc_handler	= &proc_ipc_dointvec,
+		.strategy	= sysctl_ipc_data,
 	},
 	{
 		.ctl_name	= KERN_MSGMAX,
@@ -500,6 +509,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmax),
 		.mode		= 0644,
 		.proc_handler	= &proc_ipc_dointvec,
+		.strategy	= sysctl_ipc_data,
 	},
 	{
 		.ctl_name	= KERN_MSGMNI,
@@ -508,6 +518,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmni),
 		.mode		= 0644,
 		.proc_handler	= &proc_ipc_dointvec,
+		.strategy	= sysctl_ipc_data,
 	},
 	{
 		.ctl_name	= KERN_MSGMNB,
@@ -516,6 +527,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmnb),
 		.mode		= 0644,
 		.proc_handler	= &proc_ipc_dointvec,
+		.strategy	= sysctl_ipc_data,
 	},
 	{
 		.ctl_name	= KERN_SEM,
@@ -524,6 +536,7 @@ static ctl_table kern_table[] = {
 		.maxlen		= 4*sizeof (int),
 		.mode		= 0644,
 		.proc_handler	= &proc_ipc_dointvec,
+		.strategy	= sysctl_ipc_data,
 	},
 #endif
 #ifdef CONFIG_MAGIC_SYSRQ
@@ -2611,6 +2624,47 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 	return r;
 }
 
+#ifdef CONFIG_SYSVIPC
+/* The generic sysctl ipc data routine. */
+static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+	size_t len;
+	void *data;
+
+	/* Get out of I don't have a variable */
+	if (!table->data || !table->maxlen)
+		return -ENOTDIR;
+
+	data = get_ipc(table, 1);
+	if (!data)
+		return -ENOTDIR;
+
+	if (oldval && oldlenp) {
+		if (get_user(len, oldlenp))
+			return -EFAULT;
+		if (len) {
+			if (len > table->maxlen)
+				len = table->maxlen;
+			if (copy_to_user(oldval, data, len))
+				return -EFAULT;
+			if (put_user(len, oldlenp))
+				return -EFAULT;
+		}
+	}
+
+	if (newval && newlen) {
+		if (newlen > table->maxlen)
+			newlen = table->maxlen;
+
+		if (copy_from_user(data, newval, newlen))
+			return -EFAULT;
+	}
+	return 1;
+}
+#endif
+
 #else /* CONFIG_SYSCTL_SYSCALL */
 
 
@@ -2681,6 +2735,12 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 {
 	return -ENOSYS;
 }
+static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+	return -ENOSYS;
+}
 #endif /* CONFIG_SYSCTL_SYSCALL */
 
 /*
-- 
cgit v1.2.3


From 4594bf159f1962cec3b727954b7c598b07e2e737 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 7 Dec 2006 11:33:26 +0000
Subject: [PATCH] WorkStruct: Use direct assignment rather than cmpxchg()

Use direct assignment rather than cmpxchg() as the latter is unavailable
and unimplementable on some platforms and is actually unnecessary.

The use of cmpxchg() was to guard against two possibilities, neither of
which can actually occur:

 (1) The pending flag may have been unset or may be cleared.  However, given
     where it's called, the pending flag is _always_ set.  I don't think it
     can be unset whilst we're in set_wq_data().

     Once the work is enqueued to be actually run, the only way off the queue
     is for it to be actually run.

     If it's a delayed work item, then the bit can't be cleared by the timer
     because we haven't started the timer yet.  Also, the pending bit can't be
     cleared by cancelling the delayed work _until_ the work item has had its
     timer started.

 (2) The workqueue pointer might change.  This can only happen in two cases:

     (a) The work item has just been queued to actually run, and so we're
         protected by the appropriate workqueue spinlock.

     (b) A delayed work item is being queued, and so the timer hasn't been
     	 started yet, and so no one else knows about the work item or can
     	 access it (the pending bit protects us).

     Besides, set_wq_data() _sets_ the workqueue pointer unconditionally, so
     it can be assigned instead.

So, replacing the set_wq_data() with a straight assignment would be okay
in most cases.

The problem is where we end up tangling with test_and_set_bit() emulated
using spinlocks, and even then it's not a problem _provided_
test_and_set_bit() doesn't attempt to modify the word if the bit was
set.

If that's a problem, then a bitops-proofed assignment will be required -
equivalent to atomic_set() vs other atomic_xxx() ops.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6b186750e9be..db49886bfae1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -85,22 +85,19 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
 	return list_empty(&wq->list);
 }
 
+/*
+ * Set the workqueue on which a work item is to be run
+ * - Must *only* be called if the pending flag is set
+ */
 static inline void set_wq_data(struct work_struct *work, void *wq)
 {
-	unsigned long new, old, res;
+	unsigned long new;
+
+	BUG_ON(!work_pending(work));
 
-	/* assume the pending flag is already set and that the task has already
-	 * been queued on this workqueue */
 	new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
-	res = work->management;
-	if (res != new) {
-		do {
-			old = res;
-			new = (unsigned long) wq;
-			new |= (old & WORK_STRUCT_FLAG_MASK);
-			res = cmpxchg(&work->management, old, new);
-		} while (res != old);
-	}
+	new |= work->management & WORK_STRUCT_FLAG_MASK;
+	work->management = new;
 }
 
 static inline void *get_wq_data(struct work_struct *work)
-- 
cgit v1.2.3


From d53ef07ab45085c0b06b652d588aa49b8ba41458 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 10 Dec 2006 02:18:36 -0800
Subject: [PATCH] ipc-procfs-sysctl mixups

When CONFIG_PROC_FS=n and CONFIG_PROC_SYSCTL=n but CONFIG_SYSVIPC=y, we get
this build error:

kernel/built-in.o:(.data+0xc38): undefined reference to `proc_ipc_doulongvec_minmax'
kernel/built-in.o:(.data+0xc88): undefined reference to `proc_ipc_doulongvec_minmax'
kernel/built-in.o:(.data+0xcd8): undefined reference to `proc_ipc_dointvec'
kernel/built-in.o:(.data+0xd28): undefined reference to `proc_ipc_dointvec'
kernel/built-in.o:(.data+0xd78): undefined reference to `proc_ipc_dointvec'
kernel/built-in.o:(.data+0xdc8): undefined reference to `proc_ipc_dointvec'
kernel/built-in.o:(.data+0xe18): undefined reference to `proc_ipc_dointvec'
make: *** [vmlinux] Error 1

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 025fcb3c66f8..1c5697e3521e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2408,6 +2408,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
 {
 	return -ENOSYS;
 }
+static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
+		struct file *filp, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
 #endif
 
 int proc_dointvec(ctl_table *table, int write, struct file *filp,
-- 
cgit v1.2.3


From 98d7340c360993fdd703609ff7462051e03cc2fb Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 10 Dec 2006 02:19:09 -0800
Subject: [PATCH] sysctl: remove some OPs

kernel.cap-bound uses only OP_SET and OP_AND

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1c5697e3521e..c86445a62af2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1931,9 +1931,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp,
 
 #define OP_SET	0
 #define OP_AND	1
-#define OP_OR	2
-#define OP_MAX	3
-#define OP_MIN	4
 
 static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
 				      int *valp,
@@ -1945,13 +1942,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
 		switch(op) {
 		case OP_SET:	*valp = val; break;
 		case OP_AND:	*valp &= val; break;
-		case OP_OR:	*valp |= val; break;
-		case OP_MAX:	if(*valp < val)
-					*valp = val;
-				break;
-		case OP_MIN:	if(*valp > val)
-				*valp = val;
-				break;
 		}
 	} else {
 		int val = *valp;
-- 
cgit v1.2.3


From 1f29bcd739972f71f2fd5d5d265daf3e1208fa5e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 10 Dec 2006 02:19:10 -0800
Subject: [PATCH] sysctl: remove unused "context" param

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: David Howells <dhowells@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 47 ++++++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c86445a62af2..130c5ec9ee0b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -133,7 +133,7 @@ extern int max_lock_depth;
 
 #ifdef CONFIG_SYSCTL_SYSCALL
 static int parse_table(int __user *, int, void __user *, size_t __user *,
-		void __user *, size_t, ctl_table *, void **);
+		void __user *, size_t, ctl_table *);
 #endif
 
 static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
@@ -141,12 +141,12 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
 
 static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context);
+		  void __user *newval, size_t newlen);
 
 #ifdef CONFIG_SYSVIPC
 static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context);
+		  void __user *newval, size_t newlen);
 #endif
 
 #ifdef CONFIG_PROC_SYSCTL
@@ -1243,7 +1243,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 	do {
 		struct ctl_table_header *head =
 			list_entry(tmp, struct ctl_table_header, ctl_entry);
-		void *context = NULL;
 
 		if (!use_table(head))
 			continue;
@@ -1251,9 +1250,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 		spin_unlock(&sysctl_lock);
 
 		error = parse_table(name, nlen, oldval, oldlenp, 
-					newval, newlen, head->ctl_table,
-					&context);
-		kfree(context);
+					newval, newlen, head->ctl_table);
 
 		spin_lock(&sysctl_lock);
 		unuse_table(head);
@@ -1309,7 +1306,7 @@ static inline int ctl_perm(ctl_table *table, int op)
 static int parse_table(int __user *name, int nlen,
 		       void __user *oldval, size_t __user *oldlenp,
 		       void __user *newval, size_t newlen,
-		       ctl_table *table, void **context)
+		       ctl_table *table)
 {
 	int n;
 repeat:
@@ -1329,7 +1326,7 @@ repeat:
 					error = table->strategy(
 						table, name, nlen,
 						oldval, oldlenp,
-						newval, newlen, context);
+						newval, newlen);
 					if (error)
 						return error;
 				}
@@ -1340,7 +1337,7 @@ repeat:
 			}
 			error = do_sysctl_strategy(table, name, nlen,
 						   oldval, oldlenp,
-						   newval, newlen, context);
+						   newval, newlen);
 			return error;
 		}
 	}
@@ -1351,7 +1348,7 @@ repeat:
 int do_sysctl_strategy (ctl_table *table, 
 			int __user *name, int nlen,
 			void __user *oldval, size_t __user *oldlenp,
-			void __user *newval, size_t newlen, void **context)
+			void __user *newval, size_t newlen)
 {
 	int op = 0, rc;
 	size_t len;
@@ -1365,7 +1362,7 @@ int do_sysctl_strategy (ctl_table *table,
 
 	if (table->strategy) {
 		rc = table->strategy(table, name, nlen, oldval, oldlenp,
-				     newval, newlen, context);
+				     newval, newlen);
 		if (rc < 0)
 			return rc;
 		if (rc > 0)
@@ -2473,7 +2470,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
 /* The generic string strategy routine: */
 int sysctl_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context)
+		  void __user *newval, size_t newlen)
 {
 	if (!table->data || !table->maxlen) 
 		return -ENOTDIR;
@@ -2519,7 +2516,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
  */
 int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 
 	if (newval && newlen) {
@@ -2555,7 +2552,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
 /* Strategy function to convert jiffies to seconds */ 
 int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	if (oldval) {
 		size_t olen;
@@ -2583,7 +2580,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
 /* Strategy function to convert jiffies to seconds */ 
 int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	if (oldval) {
 		size_t olen;
@@ -2612,7 +2609,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 /* The generic string strategy routine: */
 static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context)
+		  void __user *newval, size_t newlen)
 {
 	struct ctl_table uts_table;
 	int r, write;
@@ -2620,7 +2617,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 	memcpy(&uts_table, table, sizeof(uts_table));
 	uts_table.data = get_uts(table, write);
 	r = sysctl_string(&uts_table, name, nlen,
-		oldval, oldlenp, newval, newlen, context);
+		oldval, oldlenp, newval, newlen);
 	put_uts(table, write, uts_table.data);
 	return r;
 }
@@ -2629,7 +2626,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 /* The generic sysctl ipc data routine. */
 static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	size_t len;
 	void *data;
@@ -2704,41 +2701,41 @@ out:
 
 int sysctl_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context)
+		  void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 
 int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 
 int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 
 int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 
 static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context)
+		  void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
-- 
cgit v1.2.3


From 7c3ab7381e79dfc7db14a67c6f4f3285664e1ec2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 10 Dec 2006 02:19:19 -0800
Subject: [PATCH] io-accounting: core statistics

The present per-task IO accounting isn't very useful.  It simply counts the
number of bytes passed into read() and write().  So if a process reads 1MB
from an already-cached file, it is accused of having performed 1MB of I/O,
which is wrong.

(David Wright had some comments on the applicability of the present logical IO accounting:

  For billing purposes it is useless but for workload analysis it is very
  useful

  read_bytes/read_calls  average read request size
  write_bytes/write_calls average write request size

  read_bytes/read_blocks ie logical/physical can indicate hit rate or thrashing
  write_bytes/write_blocks  ie logical/physical  guess since pdflush writes can
                                                be missed

  I often look for logical larger than physical to see filesystem cache
  problems.  And the bytes/cpusec can help find applications that are
  dominating the cache and causing slow interactive response from page cache
  contention.

  I want to find the IO intensive applications and make sure they are doing
  efficient IO.  Thus the acctcms(sysV) or csacms command would give the high
  IO commands).

This patchset adds new accounting which tries to be more accurate.  We account
for three things:

reads:

  attempt to count the number of bytes which this process really did cause
  to be fetched from the storage layer.  Done at the submit_bio() level, so it
  is accurate for block-backed filesystems.  I also attempt to wire up NFS and
  CIFS.

writes:

  attempt to count the number of bytes which this process caused to be sent
  to the storage layer.  This is done at page-dirtying time.

  The big inaccuracy here is truncate.  If a process writes 1MB to a file
  and then deletes the file, it will in fact perform no writeout.  But it will
  have been accounted as having caused 1MB of write.

  So...

cancelled_writes:

  account the number of bytes which this process caused to not happen, by
  truncating pagecache.

  We _could_ just subtract this from the process's `write' accounting.  But
  that means that some processes would be reported to have done negative
  amounts of write IO, which is silly.

  So we just report the raw number and punt this decision up to userspace.

Now, we _could_ account for writes at the physical I/O level.  But

- This would require that we track memory-dirtying tasks at the per-page
  level (would require a new pointer in struct page).

- It would mean that IO statistics for a process are usually only available
  long after that process has exitted.  Which means that we probably cannot
  communicate this info via taskstats.

This patch:

Wire up the kernel-private data structures and the accessor functions to
manipulate them.

Cc: Jay Lan <jlan@sgi.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Chris Sturtivant <csturtiv@sgi.com>
Cc: Tony Ernst <tee@sgi.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Cc: David Wright <daw@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8c859eef8e6a..086e172d0d3d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -36,6 +36,7 @@
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
 #include <linux/futex.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
 #include <linux/mount.h>
@@ -1055,6 +1056,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->wchar = 0;		/* I/O counter: bytes written */
 	p->syscr = 0;		/* I/O counter: read syscalls */
 	p->syscw = 0;		/* I/O counter: write syscalls */
+	task_io_accounting_init(p);
 	acct_clear_integrals(p);
 
  	p->it_virt_expires = cputime_zero;
-- 
cgit v1.2.3


From 4a7864ca638e0a38307962ee8ef122822a351b65 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 10 Dec 2006 02:19:53 -0800
Subject: [PATCH] io-accounting: via taskstats

Deliver IO accounting via taskstats.

Cc: Jay Lan <jlan@sgi.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Chris Sturtivant <csturtiv@sgi.com>
Cc: Tony Ernst <tee@sgi.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Cc: David Wright <daw@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/tsacct.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 96f77013d3f0..baacc3691415 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -96,6 +96,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 	stats->write_char	= p->wchar;
 	stats->read_syscalls	= p->syscr;
 	stats->write_syscalls	= p->syscw;
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	stats->read_bytes	= p->ioac.read_bytes;
+	stats->write_bytes	= p->ioac.write_bytes;
+	stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
+#else
+	stats->read_bytes	= 0;
+	stats->write_bytes	= 0;
+	stats->cancelled_write_bytes = 0;
+#endif
 }
 #undef KB
 #undef MB
-- 
cgit v1.2.3


From cc2a73b5caf065f8612fcb5df5bd2f5e25881d99 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Sun, 10 Dec 2006 02:20:00 -0800
Subject: [PATCH] sched.c: correct comment for this_rq_lock()

Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f385eff4682d..479199ab14ab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -547,7 +547,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
 #endif
 
 /*
- * rq_lock - lock a given runqueue and disable interrupts.
+ * this_rq_lock - lock this runqueue and disable interrupts.
  */
 static inline struct rq *this_rq_lock(void)
 	__acquires(rq->lock)
-- 
cgit v1.2.3


From 6711cab43ed5e60bf51e3dbbce6395e87d4e9805 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Sun, 10 Dec 2006 02:20:07 -0800
Subject: [PATCH] ched domain: move sched group allocations to percpu area

Move the sched group allocations to percpu area.  This will minimize cross
node memory references and also cleans up the sched groups allocation for
allnodes sched domain.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 132 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 64 insertions(+), 68 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 479199ab14ab..a08387b5f7fa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5528,28 +5528,27 @@ static int __init isolated_cpu_setup(char *str)
 __setup ("isolcpus=", isolated_cpu_setup);
 
 /*
- * init_sched_build_groups takes an array of groups, the cpumask we wish
- * to span, and a pointer to a function which identifies what group a CPU
- * belongs to. The return value of group_fn must be a valid index into the
- * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
- * keep track of groups covered with a cpumask_t).
+ * init_sched_build_groups takes the cpumask we wish to span, and a pointer
+ * to a function which identifies what group(along with sched group) a CPU
+ * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
+ * (due to the fact that we keep track of groups covered with a cpumask_t).
  *
  * init_sched_build_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
 static void
-init_sched_build_groups(struct sched_group groups[], cpumask_t span,
-			const cpumask_t *cpu_map,
-			int (*group_fn)(int cpu, const cpumask_t *cpu_map))
+init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
+			int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+					struct sched_group **sg))
 {
 	struct sched_group *first = NULL, *last = NULL;
 	cpumask_t covered = CPU_MASK_NONE;
 	int i;
 
 	for_each_cpu_mask(i, span) {
-		int group = group_fn(i, cpu_map);
-		struct sched_group *sg = &groups[group];
+		struct sched_group *sg;
+		int group = group_fn(i, cpu_map, &sg);
 		int j;
 
 		if (cpu_isset(i, covered))
@@ -5559,7 +5558,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 		sg->cpu_power = 0;
 
 		for_each_cpu_mask(j, span) {
-			if (group_fn(j, cpu_map) != group)
+			if (group_fn(j, cpu_map, NULL) != group)
 				continue;
 
 			cpu_set(j, covered);
@@ -6135,10 +6134,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
 
-static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
+			    struct sched_group **sg)
 {
+	if (sg)
+		*sg = &per_cpu(sched_group_cpus, cpu);
 	return cpu;
 }
 #endif
@@ -6148,39 +6150,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
  */
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group sched_group_core[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_core);
 #endif
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
-static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
+			     struct sched_group **sg)
 {
+	int group;
 	cpumask_t mask = cpu_sibling_map[cpu];
 	cpus_and(mask, mask, *cpu_map);
-	return first_cpu(mask);
+	group = first_cpu(mask);
+	if (sg)
+		*sg = &per_cpu(sched_group_core, group);
+	return group;
 }
 #elif defined(CONFIG_SCHED_MC)
-static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
+			     struct sched_group **sg)
 {
+	if (sg)
+		*sg = &per_cpu(sched_group_core, cpu);
 	return cpu;
 }
 #endif
 
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
 
-static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
+			     struct sched_group **sg)
 {
+	int group;
 #ifdef CONFIG_SCHED_MC
 	cpumask_t mask = cpu_coregroup_map(cpu);
 	cpus_and(mask, mask, *cpu_map);
-	return first_cpu(mask);
+	group = first_cpu(mask);
 #elif defined(CONFIG_SCHED_SMT)
 	cpumask_t mask = cpu_sibling_map[cpu];
 	cpus_and(mask, mask, *cpu_map);
-	return first_cpu(mask);
+	group = first_cpu(mask);
 #else
-	return cpu;
+	group = cpu;
 #endif
+	if (sg)
+		*sg = &per_cpu(sched_group_phys, group);
+	return group;
 }
 
 #ifdef CONFIG_NUMA
@@ -6193,12 +6208,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
 
-static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
+				 struct sched_group **sg)
 {
-	return cpu_to_node(cpu);
+	cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
+	int group;
+
+	cpus_and(nodemask, nodemask, *cpu_map);
+	group = first_cpu(nodemask);
+
+	if (sg)
+		*sg = &per_cpu(sched_group_allnodes, group);
+	return group;
 }
+
 static void init_numa_sched_groups_power(struct sched_group *group_head)
 {
 	struct sched_group *sg = group_head;
@@ -6234,16 +6259,9 @@ static void free_sched_groups(const cpumask_t *cpu_map)
 	int cpu, i;
 
 	for_each_cpu_mask(cpu, *cpu_map) {
-		struct sched_group *sched_group_allnodes
-			= sched_group_allnodes_bycpu[cpu];
 		struct sched_group **sched_group_nodes
 			= sched_group_nodes_bycpu[cpu];
 
-		if (sched_group_allnodes) {
-			kfree(sched_group_allnodes);
-			sched_group_allnodes_bycpu[cpu] = NULL;
-		}
-
 		if (!sched_group_nodes)
 			continue;
 
@@ -6337,7 +6355,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	struct sched_domain *sd;
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
-	struct sched_group *sched_group_allnodes = NULL;
+	int sd_allnodes = 0;
 
 	/*
 	 * Allocate the per-node list of sched groups
@@ -6355,7 +6373,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	 * Set up domains for cpus specified by the cpu_map.
 	 */
 	for_each_cpu_mask(i, *cpu_map) {
-		int group;
 		struct sched_domain *sd = NULL, *p;
 		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
 
@@ -6364,26 +6381,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #ifdef CONFIG_NUMA
 		if (cpus_weight(*cpu_map)
 				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
-			if (!sched_group_allnodes) {
-				sched_group_allnodes
-					= kmalloc_node(sizeof(struct sched_group)
-						  	* MAX_NUMNODES,
-						  GFP_KERNEL,
-						  cpu_to_node(i));
-				if (!sched_group_allnodes) {
-					printk(KERN_WARNING
-					"Can not alloc allnodes sched group\n");
-					goto error;
-				}
-				sched_group_allnodes_bycpu[i]
-						= sched_group_allnodes;
-			}
 			sd = &per_cpu(allnodes_domains, i);
 			*sd = SD_ALLNODES_INIT;
 			sd->span = *cpu_map;
-			group = cpu_to_allnodes_group(i, cpu_map);
-			sd->groups = &sched_group_allnodes[group];
+			cpu_to_allnodes_group(i, cpu_map, &sd->groups);
 			p = sd;
+			sd_allnodes = 1;
 		} else
 			p = NULL;
 
@@ -6398,36 +6401,33 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 
 		p = sd;
 		sd = &per_cpu(phys_domains, i);
-		group = cpu_to_phys_group(i, cpu_map);
 		*sd = SD_CPU_INIT;
 		sd->span = nodemask;
 		sd->parent = p;
 		if (p)
 			p->child = sd;
-		sd->groups = &sched_group_phys[group];
+		cpu_to_phys_group(i, cpu_map, &sd->groups);
 
 #ifdef CONFIG_SCHED_MC
 		p = sd;
 		sd = &per_cpu(core_domains, i);
-		group = cpu_to_core_group(i, cpu_map);
 		*sd = SD_MC_INIT;
 		sd->span = cpu_coregroup_map(i);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
-		sd->groups = &sched_group_core[group];
+		cpu_to_core_group(i, cpu_map, &sd->groups);
 #endif
 
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
 		sd = &per_cpu(cpu_domains, i);
-		group = cpu_to_cpu_group(i, cpu_map);
 		*sd = SD_SIBLING_INIT;
 		sd->span = cpu_sibling_map[i];
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
-		sd->groups = &sched_group_cpus[group];
+		cpu_to_cpu_group(i, cpu_map, &sd->groups);
 #endif
 	}
 
@@ -6439,8 +6439,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		if (i != first_cpu(this_sibling_map))
 			continue;
 
-		init_sched_build_groups(sched_group_cpus, this_sibling_map,
-					cpu_map, &cpu_to_cpu_group);
+		init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
 	}
 #endif
 
@@ -6451,8 +6450,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(this_core_map, this_core_map, *cpu_map);
 		if (i != first_cpu(this_core_map))
 			continue;
-		init_sched_build_groups(sched_group_core, this_core_map,
-					cpu_map, &cpu_to_core_group);
+		init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
 	}
 #endif
 
@@ -6465,15 +6463,13 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		if (cpus_empty(nodemask))
 			continue;
 
-		init_sched_build_groups(sched_group_phys, nodemask,
-					cpu_map, &cpu_to_phys_group);
+		init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
 	}
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	if (sched_group_allnodes)
-		init_sched_build_groups(sched_group_allnodes, *cpu_map,
-					cpu_map, &cpu_to_allnodes_group);
+	if (sd_allnodes)
+		init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
 
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		/* Set up node groups */
@@ -6565,10 +6561,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	for (i = 0; i < MAX_NUMNODES; i++)
 		init_numa_sched_groups_power(sched_group_nodes[i]);
 
-	if (sched_group_allnodes) {
-		int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map);
-		struct sched_group *sg = &sched_group_allnodes[group];
+	if (sd_allnodes) {
+		struct sched_group *sg;
 
+		cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
 		init_numa_sched_groups_power(sg);
 	}
 #endif
-- 
cgit v1.2.3


From 054b9108e01ef27e2e6b32b4226abb6024626f06 Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@openvz.org>
Date: Sun, 10 Dec 2006 02:20:11 -0800
Subject: [PATCH] move_task_off_dead_cpu() should be called with disabled ints

move_task_off_dead_cpu() requires interrupts to be disabled, while
migrate_dead() calls it with enabled interrupts.  Added appropriate
comments to functions and added BUG_ON(!irqs_disabled()) into
double_rq_lock() and double_lock_balance() which are the origin sources of
such bugs.

Signed-off-by: Kirill Korotaev <dev@openvz.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a08387b5f7fa..f04add905bdf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1952,6 +1952,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
 	__acquires(rq1->lock)
 	__acquires(rq2->lock)
 {
+	BUG_ON(!irqs_disabled());
 	if (rq1 == rq2) {
 		spin_lock(&rq1->lock);
 		__acquire(rq2->lock);	/* Fake it out ;) */
@@ -1991,6 +1992,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 {
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
 	if (unlikely(!spin_trylock(&busiest->lock))) {
 		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
@@ -5067,7 +5073,10 @@ wait_to_die:
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-/* Figure out where task on dead CPU should go, use force if neccessary. */
+/*
+ * Figure out where task on dead CPU should go, use force if neccessary.
+ * NOTE: interrupts should be disabled by the caller
+ */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
 	unsigned long flags;
@@ -5187,6 +5196,7 @@ void idle_task_exit(void)
 	mmdrop(mm);
 }
 
+/* called under rq->lock with disabled interrupts */
 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 {
 	struct rq *rq = cpu_rq(dead_cpu);
@@ -5203,10 +5213,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 	 * Drop lock around migration; if someone else moves it,
 	 * that's OK.  No task can be added to this CPU, so iteration is
 	 * fine.
+	 * NOTE: interrupts should be left disabled  --dev@
 	 */
-	spin_unlock_irq(&rq->lock);
+	spin_unlock(&rq->lock);
 	move_task_off_dead_cpu(dead_cpu, p);
-	spin_lock_irq(&rq->lock);
+	spin_lock(&rq->lock);
 
 	put_task_struct(p);
 }
-- 
cgit v1.2.3


From 571f6d2fb0b1c04798df783db2ba85e96bcce43d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 10 Dec 2006 02:20:13 -0800
Subject: [PATCH] sched: avoid taking rq lock in wake_priority_sleeper

Avoid taking the request queue lock in wake_priority_sleeper if there are no
running processes.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f04add905bdf..fdd26fffaa20 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2915,6 +2915,9 @@ static inline int wake_priority_sleeper(struct rq *rq)
 	int ret = 0;
 
 #ifdef CONFIG_SCHED_SMT
+	if (!rq->nr_running)
+		return 0;
+
 	spin_lock(&rq->lock);
 	/*
 	 * If an SMT sibling task has been put to sleep for priority
-- 
cgit v1.2.3


From 4211a9a2e94a34df8c02bc39b7ec10678ad5c2ab Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 10 Dec 2006 02:20:19 -0800
Subject: [PATCH] sched: remove staggering of load balancing

Timer interrupts already are staggered.  We do not need an additional layer of
time staggering for short load balancing actions that take a reasonably small
portion of the time slice.

For load balancing on large sched_domains we will add a serialization later
that avoids concurrent load balance operations and thus has the same effect as
load staggering.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index fdd26fffaa20..b5b350135002 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2841,16 +2841,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
  * Balancing parameters are set up in arch_init_sched_domains.
  */
 
-/* Don't have all balancing operations going off at once: */
-static inline unsigned long cpu_offset(int cpu)
-{
-	return jiffies + cpu * HZ / NR_CPUS;
-}
-
 static void
 rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
 {
-	unsigned long this_load, interval, j = cpu_offset(this_cpu);
+	unsigned long this_load, interval;
 	struct sched_domain *sd;
 	int i, scale;
 
@@ -2885,7 +2879,7 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
 		if (unlikely(!interval))
 			interval = 1;
 
-		if (j - sd->last_balance >= interval) {
+		if (jiffies - sd->last_balance >= interval) {
 			if (load_balance(this_cpu, this_rq, sd, idle)) {
 				/*
 				 * We've pulled tasks over so either we're no
-- 
cgit v1.2.3


From fe2eea3fafb3df2f5b8a55a48bcbb0d23b3b5618 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 10 Dec 2006 02:20:21 -0800
Subject: [PATCH] sched: disable interrupts for locking in load_balance()

Interrupts must be disabled for request queue locks if we want to run
load_balance() with interrupts enabled.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b5b350135002..d327511d268e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2546,8 +2546,6 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
- *
- * Called with this_rq unlocked.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum idle_type idle)
@@ -2557,6 +2555,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	cpumask_t cpus = CPU_MASK_ALL;
+	unsigned long flags;
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -2596,11 +2595,13 @@ redo:
 		 * still unbalanced. nr_moved simply stays zero, so it is
 		 * correctly treated as an imbalance.
 		 */
+		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
 				      minus_1_or_zero(busiest->nr_running),
 				      imbalance, sd, idle, &all_pinned);
 		double_rq_unlock(this_rq, busiest);
+		local_irq_restore(flags);
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(all_pinned)) {
@@ -2617,13 +2618,13 @@ redo:
 
 		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
 
-			spin_lock(&busiest->lock);
+			spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the migration_thread, if the curr
 			 * task on busiest cpu can't be moved to this_cpu
 			 */
 			if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
-				spin_unlock(&busiest->lock);
+				spin_unlock_irqrestore(&busiest->lock, flags);
 				all_pinned = 1;
 				goto out_one_pinned;
 			}
@@ -2633,7 +2634,7 @@ redo:
 				busiest->push_cpu = this_cpu;
 				active_balance = 1;
 			}
-			spin_unlock(&busiest->lock);
+			spin_unlock_irqrestore(&busiest->lock, flags);
 			if (active_balance)
 				wake_up_process(busiest->migration_thread);
 
-- 
cgit v1.2.3


From 7835b98bc6de2ca10afa45572d272304b000b048 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 10 Dec 2006 02:20:22 -0800
Subject: [PATCH] sched: extract load calculation from rebalance_tick

A load calculation is always done in rebalance_tick() in addition to the real
load balancing activities that only take place when certain jiffie counts have
been reached.  Move that processing into a separate function and call it
directly from scheduler_tick().

Also extract the time slice handling from scheduler_tick and put it into a
separate function.  Then we can clean up scheduler_tick significantly.  It
will no longer have any gotos.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 96 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 54 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d327511d268e..3c1b74aa1b07 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2833,20 +2833,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 	spin_unlock(&target_rq->lock);
 }
 
-/*
- * rebalance_tick will get called every timer tick, on every CPU.
- *
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in arch_init_sched_domains.
- */
-
-static void
-rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
+static void update_load(struct rq *this_rq)
 {
-	unsigned long this_load, interval;
-	struct sched_domain *sd;
+	unsigned long this_load;
 	int i, scale;
 
 	this_load = this_rq->raw_weighted_load;
@@ -2866,6 +2855,22 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
 			new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
 	}
+}
+
+/*
+ * rebalance_tick will get called every timer tick, on every CPU.
+ *
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+
+static void
+rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
+{
+	unsigned long interval;
+	struct sched_domain *sd;
 
 	for_each_domain(this_cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2897,12 +2902,15 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
 /*
  * on UP we do not need to balance between CPUs:
  */
-static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
+static inline void rebalance_tick(int cpu, struct rq *rq)
 {
 }
 static inline void idle_balance(int cpu, struct rq *rq)
 {
 }
+static inline void update_load(struct rq *this_rq)
+{
+}
 #endif
 
 static inline int wake_priority_sleeper(struct rq *rq)
@@ -3052,35 +3060,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 		cpustat->steal = cputime64_add(cpustat->steal, tmp);
 }
 
-/*
- * This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
- */
-void scheduler_tick(void)
+static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
-	unsigned long long now = sched_clock();
-	struct task_struct *p = current;
-	int cpu = smp_processor_id();
-	struct rq *rq = cpu_rq(cpu);
-
-	update_cpu_clock(p, rq, now);
-
-	rq->timestamp_last_tick = now;
-
-	if (p == rq->idle) {
-		if (wake_priority_sleeper(rq))
-			goto out;
-		rebalance_tick(cpu, rq, SCHED_IDLE);
-		return;
-	}
-
-	/* Task might have expired already, but not scheduled off yet */
 	if (p->array != rq->active) {
+		/* Task has expired but was not scheduled yet */
 		set_tsk_need_resched(p);
-		goto out;
+		return;
 	}
 	spin_lock(&rq->lock);
 	/*
@@ -3148,8 +3133,35 @@ void scheduler_tick(void)
 	}
 out_unlock:
 	spin_unlock(&rq->lock);
-out:
-	rebalance_tick(cpu, rq, NOT_IDLE);
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
+ */
+void scheduler_tick(void)
+{
+	unsigned long long now = sched_clock();
+	struct task_struct *p = current;
+	int cpu = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+	enum idle_type idle = NOT_IDLE;
+
+	update_cpu_clock(p, rq, now);
+
+	rq->timestamp_last_tick = now;
+
+	if (p == rq->idle) {
+		/* Task on the idle queue */
+		if (!wake_priority_sleeper(rq))
+			idle = SCHED_IDLE;
+	} else
+		task_running_tick(rq, p);
+	update_load(rq);
+	rebalance_tick(cpu, rq, idle);
 }
 
 #ifdef CONFIG_SCHED_SMT
-- 
cgit v1.2.3


From e418e1c2bf1a253916b569370653414eb28597b6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 10 Dec 2006 02:20:23 -0800
Subject: [PATCH] sched: move idle status calculation into rebalance_tick()

Perform the idle state determination in rebalance_tick.

If we separate balancing from sched_tick then we also need to determine the
idle state in rebalance_tick.

V2->V3
	Remove useless idlle != 0 check. Checking nr_running seems
	to be sufficient. Thanks Suresh.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3c1b74aa1b07..14a8d9050cd4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2867,10 +2867,16 @@ static void update_load(struct rq *this_rq)
  */
 
 static void
-rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
+rebalance_tick(int this_cpu, struct rq *this_rq)
 {
 	unsigned long interval;
 	struct sched_domain *sd;
+	/*
+	 * We are idle if there are no processes running. This
+	 * is valid even if we are the idle process (SMT).
+	 */
+	enum idle_type idle = !this_rq->nr_running ?
+				SCHED_IDLE : NOT_IDLE;
 
 	for_each_domain(this_cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2902,37 +2908,26 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
 /*
  * on UP we do not need to balance between CPUs:
  */
-static inline void rebalance_tick(int cpu, struct rq *rq)
-{
-}
 static inline void idle_balance(int cpu, struct rq *rq)
 {
 }
-static inline void update_load(struct rq *this_rq)
-{
-}
 #endif
 
-static inline int wake_priority_sleeper(struct rq *rq)
+static inline void wake_priority_sleeper(struct rq *rq)
 {
-	int ret = 0;
-
 #ifdef CONFIG_SCHED_SMT
 	if (!rq->nr_running)
-		return 0;
+		return;
 
 	spin_lock(&rq->lock);
 	/*
 	 * If an SMT sibling task has been put to sleep for priority
 	 * reasons reschedule the idle task to see if it can now run.
 	 */
-	if (rq->nr_running) {
+	if (rq->nr_running)
 		resched_task(rq->idle);
-		ret = 1;
-	}
 	spin_unlock(&rq->lock);
 #endif
-	return ret;
 }
 
 DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3148,20 +3143,20 @@ void scheduler_tick(void)
 	struct task_struct *p = current;
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
-	enum idle_type idle = NOT_IDLE;
 
 	update_cpu_clock(p, rq, now);
 
 	rq->timestamp_last_tick = now;
 
-	if (p == rq->idle) {
+	if (p == rq->idle)
 		/* Task on the idle queue */
-		if (!wake_priority_sleeper(rq))
-			idle = SCHED_IDLE;
-	} else
+		wake_priority_sleeper(rq);
+	else
 		task_running_tick(rq, p);
+#ifdef CONFIG_SMP
 	update_load(rq);
-	rebalance_tick(cpu, rq, idle);
+	rebalance_tick(cpu, rq);
+#endif
 }
 
 #ifdef CONFIG_SCHED_SMT
-- 
cgit v1.2.3


From c9819f4593e8d052b41a89f47140f5c5e7e30582 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 10 Dec 2006 02:20:25 -0800
Subject: [PATCH] sched: use softirq for load balancing

Call rebalance_tick (renamed to run_rebalance_domains) from a newly introduced
softirq.

We calculate the earliest time for each layer of sched domains to be rescanned
(this is the rescan time for idle) and use the earliest of those to schedule
the softirq via a new field "next_balance" added to struct rq.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 14a8d9050cd4..0a3e748d737d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -227,6 +227,7 @@ struct rq {
 	unsigned long expired_timestamp;
 	unsigned long long timestamp_last_tick;
 	struct task_struct *curr, *idle;
+	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 	struct prio_array *active, *expired, arrays[2];
 	int best_expired_prio;
@@ -2858,7 +2859,7 @@ static void update_load(struct rq *this_rq)
 }
 
 /*
- * rebalance_tick will get called every timer tick, on every CPU.
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
  *
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
@@ -2866,9 +2867,10 @@ static void update_load(struct rq *this_rq)
  * Balancing parameters are set up in arch_init_sched_domains.
  */
 
-static void
-rebalance_tick(int this_cpu, struct rq *this_rq)
+static void run_rebalance_domains(struct softirq_action *h)
 {
+	int this_cpu = smp_processor_id();
+	struct rq *this_rq = cpu_rq(this_cpu);
 	unsigned long interval;
 	struct sched_domain *sd;
 	/*
@@ -2877,6 +2879,8 @@ rebalance_tick(int this_cpu, struct rq *this_rq)
 	 */
 	enum idle_type idle = !this_rq->nr_running ?
 				SCHED_IDLE : NOT_IDLE;
+	/* Earliest time when we have to call run_rebalance_domains again */
+	unsigned long next_balance = jiffies + 60*HZ;
 
 	for_each_domain(this_cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2891,7 +2895,7 @@ rebalance_tick(int this_cpu, struct rq *this_rq)
 		if (unlikely(!interval))
 			interval = 1;
 
-		if (jiffies - sd->last_balance >= interval) {
+		if (time_after_eq(jiffies, sd->last_balance + interval)) {
 			if (load_balance(this_cpu, this_rq, sd, idle)) {
 				/*
 				 * We've pulled tasks over so either we're no
@@ -2902,7 +2906,10 @@ rebalance_tick(int this_cpu, struct rq *this_rq)
 			}
 			sd->last_balance += interval;
 		}
+		if (time_after(next_balance, sd->last_balance + interval))
+			next_balance = sd->last_balance + interval;
 	}
+	this_rq->next_balance = next_balance;
 }
 #else
 /*
@@ -3155,7 +3162,8 @@ void scheduler_tick(void)
 		task_running_tick(rq, p);
 #ifdef CONFIG_SMP
 	update_load(rq);
-	rebalance_tick(cpu, rq);
+	if (time_after_eq(jiffies, rq->next_balance))
+		raise_softirq(SCHED_SOFTIRQ);
 #endif
 }
 
@@ -6859,6 +6867,10 @@ void __init sched_init(void)
 
 	set_load_weight(&init_task);
 
+#ifdef CONFIG_SMP
+	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
+#endif
+
 #ifdef CONFIG_RT_MUTEXES
 	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
 #endif
-- 
cgit v1.2.3


From 1bd77f2da58e9cdd1f159217887343dadd9af417 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 10 Dec 2006 02:20:27 -0800
Subject: [PATCH] sched: call tasklet less frequently

Trigger softirq less frequently

We trigger the softirq before this patch using offset of sd->interval.
However, if the queue is busy then it is sufficient to schedule the softirq
with sd->interval * busy_factor.

So we modify the calculation of the next time to balance by taking
the interval added to last_balance again. This is only the
right value if the idle/busy situation continues as is.

There are two potential trouble spots:
- If the queue was idle and now gets busy then we call rebalance
  early. However, that is not a problem because we will then use
  the longer interval for the next period.

- If the queue was busy and becomes idle then we potentially
  wait too long before rebalancing. However, when the task
  goes idle then idle_balance is called. We add another calculation
  of the next balance time based on sd->interval in idle_balance
  so that we will rebalance soon.

V2->V3:
- Calculate rebalance time based on current jiffies and not
  based on the jiffies at the last time we load balanced.
  We no longer rely on staggering and therefore we can
  affort to do this now.

V3->V4:
- Use functions to do jiffy comparisons.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0a3e748d737d..0a4a26b21f69 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2774,14 +2774,28 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
 	struct sched_domain *sd;
+	int pulled_task = 0;
+	unsigned long next_balance = jiffies + 60 *  HZ;
 
 	for_each_domain(this_cpu, sd) {
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			/* If we've pulled tasks over stop searching: */
-			if (load_balance_newidle(this_cpu, this_rq, sd))
+			pulled_task = load_balance_newidle(this_cpu,
+							this_rq, sd);
+			if (time_after(next_balance,
+				  sd->last_balance + sd->balance_interval))
+				next_balance = sd->last_balance
+						+ sd->balance_interval;
+			if (pulled_task)
 				break;
 		}
 	}
+	if (!pulled_task)
+		/*
+		 * We are going idle. next_balance may be set based on
+		 * a busy processor. So reset next_balance.
+		 */
+		this_rq->next_balance = next_balance;
 }
 
 /*
@@ -2904,7 +2918,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 				 */
 				idle = NOT_IDLE;
 			}
-			sd->last_balance += interval;
+			sd->last_balance = jiffies;
 		}
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-- 
cgit v1.2.3


From 08c183f31bdbb709f177f6d3110d5f288ea33933 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 10 Dec 2006 02:20:29 -0800
Subject: [PATCH] sched: add option to serialize load balancing

Large sched domains can be very expensive to scan.  Add an option SD_SERIALIZE
to the sched domain flags.  If that flag is set then we make sure that no
other such domain is being balanced.

[akpm@osdl.org: build fix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0a4a26b21f69..2b2b780939c9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2880,6 +2880,7 @@ static void update_load(struct rq *this_rq)
  *
  * Balancing parameters are set up in arch_init_sched_domains.
  */
+static DEFINE_SPINLOCK(balancing);
 
 static void run_rebalance_domains(struct softirq_action *h)
 {
@@ -2909,6 +2910,11 @@ static void run_rebalance_domains(struct softirq_action *h)
 		if (unlikely(!interval))
 			interval = 1;
 
+		if (sd->flags & SD_SERIALIZE) {
+			if (!spin_trylock(&balancing))
+				goto out;
+		}
+
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
 			if (load_balance(this_cpu, this_rq, sd, idle)) {
 				/*
@@ -2920,6 +2926,9 @@ static void run_rebalance_domains(struct softirq_action *h)
 			}
 			sd->last_balance = jiffies;
 		}
+		if (sd->flags & SD_SERIALIZE)
+			spin_unlock(&balancing);
+out:
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
 	}
-- 
cgit v1.2.3


From b18ec80396834497933d77b81ec0918519f4e2a7 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sun, 10 Dec 2006 02:20:31 -0800
Subject: [PATCH] sched: improve migration accuracy

Co-opt rq->timestamp_last_tick to maintain a cache_hot_time evaluation
reference timestamp at both tick and sched times to prevent said reference,
formerly rq->timestamp_last_tick, from being behind task->last_ran at
evaluation time, and to move said reference closer to current time on the
remote processor, intent being to improve cache hot evaluation and
timestamp adjustment accuracy for task migration.

Fix minor sched_time double accounting error which occurs when a task
passing through schedule() does not schedule off, and takes the next timer
tick.

[kenneth.w.chen@intel.com: cleanup]
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: Don Mullis <dwm@meer.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2b2b780939c9..15ce772a471a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -225,7 +225,8 @@ struct rq {
 	unsigned long nr_uninterruptible;
 
 	unsigned long expired_timestamp;
-	unsigned long long timestamp_last_tick;
+	/* Cached timestamp set by update_cpu_clock() */
+	unsigned long long most_recent_timestamp;
 	struct task_struct *curr, *idle;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
@@ -944,8 +945,8 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 	if (!local) {
 		/* Compensate for drifting sched_clock */
 		struct rq *this_rq = this_rq();
-		now = (now - this_rq->timestamp_last_tick)
-			+ rq->timestamp_last_tick;
+		now = (now - this_rq->most_recent_timestamp)
+			+ rq->most_recent_timestamp;
 	}
 #endif
 
@@ -1689,8 +1690,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * Not the local CPU - must adjust timestamp. This should
 		 * get optimised away in the !CONFIG_SMP case.
 		 */
-		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
-					+ rq->timestamp_last_tick;
+		p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
+					+ rq->most_recent_timestamp;
 		__activate_task(p, rq);
 		if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
@@ -2068,8 +2069,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
 	set_task_cpu(p, this_cpu);
 	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
-	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
-				+ this_rq->timestamp_last_tick;
+	p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
+				+ this_rq->most_recent_timestamp;
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
@@ -2105,10 +2106,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	if (sd->nr_balance_failed > sd->cache_nice_tries)
+	if (sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+		if (task_hot(p, rq->most_recent_timestamp, sd))
+			schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
 		return 1;
+	}
 
-	if (task_hot(p, rq->timestamp_last_tick, sd))
+	if (task_hot(p, rq->most_recent_timestamp, sd))
 		return 0;
 	return 1;
 }
@@ -2206,11 +2212,6 @@ skip_queue:
 		goto skip_bitmap;
 	}
 
-#ifdef CONFIG_SCHEDSTATS
-	if (task_hot(tmp, busiest->timestamp_last_tick, sd))
-		schedstat_inc(sd, lb_hot_gained[idle]);
-#endif
-
 	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
 	pulled++;
 	rem_load_move -= tmp->load_weight;
@@ -2971,7 +2972,8 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-	p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick);
+	p->sched_time += now - p->last_ran;
+	p->last_ran = rq->most_recent_timestamp = now;
 }
 
 /*
@@ -2984,8 +2986,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
 	unsigned long flags;
 
 	local_irq_save(flags);
-	ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
-	ns = p->sched_time + sched_clock() - ns;
+	ns = p->sched_time + sched_clock() - p->last_ran;
 	local_irq_restore(flags);
 
 	return ns;
@@ -3176,8 +3177,6 @@ void scheduler_tick(void)
 
 	update_cpu_clock(p, rq, now);
 
-	rq->timestamp_last_tick = now;
-
 	if (p == rq->idle)
 		/* Task on the idle queue */
 		wake_priority_sleeper(rq);
@@ -5032,8 +5031,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 		 * afterwards, and pretending it was a local activate.
 		 * This way is cleaner and logically correct.
 		 */
-		p->timestamp = p->timestamp - rq_src->timestamp_last_tick
-				+ rq_dest->timestamp_last_tick;
+		p->timestamp = p->timestamp - rq_src->most_recent_timestamp
+				+ rq_dest->most_recent_timestamp;
 		deactivate_task(p, rq_src);
 		__activate_task(p, rq_dest);
 		if (TASK_PREEMPTS_CURR(p, rq_dest))
-- 
cgit v1.2.3


From 783609c6cb4eaa23f2ac5c968a44483584ec133f Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Sun, 10 Dec 2006 02:20:33 -0800
Subject: [PATCH] sched: decrease number of load balances

Currently at a particular domain, each cpu in the sched group will do a
load balance at the frequency of balance_interval.  More the cores and
threads, more the cpus will be in each sched group at SMP and NUMA domain.
And we endup spending quite a bit of time doing load balancing in those
domains.

Fix this by making only one cpu(first idle cpu or first cpu in the group if
all the cpus are busy) in the sched group do the load balance at that
particular sched domain and this load will slowly percolate down to the
other cpus with in that group(when they do load balancing at lower
domains).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 15ce772a471a..4e453431c61a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -428,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 12
+#define SCHEDSTAT_VERSION 13
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -466,7 +466,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 			seq_printf(seq, "domain%d %s", dcnt++, mask_str);
 			for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
 					itype++) {
-				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu",
 				    sd->lb_cnt[itype],
 				    sd->lb_balanced[itype],
 				    sd->lb_failed[itype],
@@ -474,7 +474,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
 				    sd->lb_gained[itype],
 				    sd->lb_hot_gained[itype],
 				    sd->lb_nobusyq[itype],
-				    sd->lb_nobusyg[itype]);
+				    sd->lb_nobusyg[itype],
+				    sd->lb_stopbalance[itype]);
 			}
 			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
 			    sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
@@ -2249,7 +2250,7 @@ out:
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
-		   cpumask_t *cpus)
+		   cpumask_t *cpus, int *balance)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2278,10 +2279,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		unsigned long load, group_capacity;
 		int local_group;
 		int i;
+		unsigned int balance_cpu = -1, first_idle_cpu = 0;
 		unsigned long sum_nr_running, sum_weighted_load;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
+		if (local_group)
+			balance_cpu = first_cpu(group->cpumask);
+
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
 
@@ -2297,9 +2302,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 				*sd_idle = 0;
 
 			/* Bias balancing toward cpus of our domain */
-			if (local_group)
+			if (local_group) {
+				if (idle_cpu(i) && !first_idle_cpu) {
+					first_idle_cpu = 1;
+					balance_cpu = i;
+				}
+
 				load = target_load(i, load_idx);
-			else
+			} else
 				load = source_load(i, load_idx);
 
 			avg_load += load;
@@ -2307,6 +2317,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			sum_weighted_load += rq->raw_weighted_load;
 		}
 
+		/*
+		 * First idle cpu or the first cpu(busiest) in this sched group
+		 * is eligible for doing load balancing at this and above
+		 * domains.
+		 */
+		if (local_group && balance_cpu != this_cpu && balance) {
+			*balance = 0;
+			goto ret;
+		}
+
 		total_load += avg_load;
 		total_pwr += group->cpu_power;
 
@@ -2498,8 +2518,8 @@ out_balanced:
 		*imbalance = min_load_per_task;
 		return group_min;
 	}
-ret:
 #endif
+ret:
 	*imbalance = 0;
 	return NULL;
 }
@@ -2550,7 +2570,8 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
  * tasks if there is an imbalance.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
-			struct sched_domain *sd, enum idle_type idle)
+			struct sched_domain *sd, enum idle_type idle,
+			int *balance)
 {
 	int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
 	struct sched_group *group;
@@ -2573,7 +2594,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 redo:
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
-							&cpus);
+				   &cpus, balance);
+
+	if (*balance == 0) {
+		schedstat_inc(sd, lb_stopbalance[idle]);
+		goto out_balanced;
+	}
+
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
@@ -2715,7 +2742,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
 redo:
 	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
-				&sd_idle, &cpus);
+				   &sd_idle, &cpus, NULL);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2885,7 +2912,7 @@ static DEFINE_SPINLOCK(balancing);
 
 static void run_rebalance_domains(struct softirq_action *h)
 {
-	int this_cpu = smp_processor_id();
+	int this_cpu = smp_processor_id(), balance = 1;
 	struct rq *this_rq = cpu_rq(this_cpu);
 	unsigned long interval;
 	struct sched_domain *sd;
@@ -2917,7 +2944,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(this_cpu, this_rq, sd, idle)) {
+			if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
 				/*
 				 * We've pulled tasks over so either we're no
 				 * longer idle, or one of our SMT siblings is
@@ -2932,6 +2959,14 @@ static void run_rebalance_domains(struct softirq_action *h)
 out:
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
+
+		/*
+		 * Stop the load balance at this level. There is another
+		 * CPU in our sched group which is doing load balancing more
+		 * actively.
+		 */
+		if (!balance)
+			break;
 	}
 	this_rq->next_balance = next_balance;
 }
-- 
cgit v1.2.3


From 06066714f6016cffcb249f6ab21b7919de1bc859 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Sun, 10 Dec 2006 02:20:35 -0800
Subject: [PATCH] sched: remove lb_stopbalance counter

Remove scheduler stats lb_stopbalance counter.  This counter can be
calculated by: lb_balanced - lb_nobusyg - lb_nobusyq.  There is no need to
create gazillion counters while we can derive the value.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4e453431c61a..66e44b5b53d2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -428,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 13
+#define SCHEDSTAT_VERSION 14
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -466,7 +466,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 			seq_printf(seq, "domain%d %s", dcnt++, mask_str);
 			for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
 					itype++) {
-				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
 				    sd->lb_cnt[itype],
 				    sd->lb_balanced[itype],
 				    sd->lb_failed[itype],
@@ -474,8 +474,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 				    sd->lb_gained[itype],
 				    sd->lb_hot_gained[itype],
 				    sd->lb_nobusyq[itype],
-				    sd->lb_nobusyg[itype],
-				    sd->lb_stopbalance[itype]);
+				    sd->lb_nobusyg[itype]);
 			}
 			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
 			    sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
@@ -2596,10 +2595,8 @@ redo:
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   &cpus, balance);
 
-	if (*balance == 0) {
-		schedstat_inc(sd, lb_stopbalance[idle]);
+	if (*balance == 0)
 		goto out_balanced;
-	}
 
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
-- 
cgit v1.2.3


From 62ab616d54371a65f595c199aad1e1755b837d25 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Sun, 10 Dec 2006 02:20:36 -0800
Subject: [PATCH] sched: optimize activate_task for RT task

RT task does not participate in interactiveness priority and thus shouldn't
be bothered with timestamp and p->sleep_type manipulation when task is
being put on run queue.  Bypass all of the them with a single if (rt_task)
test.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 66e44b5b53d2..48e35c916326 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -940,6 +940,9 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 {
 	unsigned long long now;
 
+	if (rt_task(p))
+		goto out;
+
 	now = sched_clock();
 #ifdef CONFIG_SMP
 	if (!local) {
@@ -961,8 +964,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 				     (now - p->timestamp) >> 20);
 	}
 
-	if (!rt_task(p))
-		p->prio = recalc_task_prio(p, now);
+	p->prio = recalc_task_prio(p, now);
 
 	/*
 	 * This checks to make sure it's not an uninterruptible task
@@ -987,7 +989,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 		}
 	}
 	p->timestamp = now;
-
+out:
 	__activate_task(p, rq);
 }
 
-- 
cgit v1.2.3


From 33859f7f9788da2ac9aa23be4dc8e948112809ca Mon Sep 17 00:00:00 2001
From: Miguel Ojeda Sandonis <maxextreme@gmail.com>
Date: Sun, 10 Dec 2006 02:20:38 -0800
Subject: [PATCH] kernel/sched.c: whitespace cleanups

[akpm@osdl.org: additional cleanups]
Signed-off-by: Miguel Ojeda Sandonis <maxextreme@gmail.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 95 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 55 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 48e35c916326..8a0afb97af71 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -466,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
 			seq_printf(seq, "domain%d %s", dcnt++, mask_str);
 			for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
 					itype++) {
-				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
+						"%lu",
 				    sd->lb_cnt[itype],
 				    sd->lb_balanced[itype],
 				    sd->lb_failed[itype],
@@ -476,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
 				    sd->lb_nobusyq[itype],
 				    sd->lb_nobusyg[itype]);
 			}
-			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
+			    " %lu %lu %lu\n",
 			    sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
 			    sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
 			    sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
-			    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
+			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
+			    sd->ttwu_move_balance);
 		}
 		preempt_enable();
 #endif
@@ -1454,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 		if (this_sd->flags & SD_WAKE_AFFINE) {
 			unsigned long tl = this_load;
-			unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+			unsigned long tl_per_task;
+
+			tl_per_task = cpu_avg_load_per_task(this_cpu);
 
 			/*
 			 * If sync wakeup then subtract the (maximum possible)
@@ -2487,18 +2492,21 @@ small_imbalance:
 		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
-		tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
+		tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
+			busiest->cpu_power;
 		if (max_load > tmp)
 			pwr_move += busiest->cpu_power *
 				min(busiest_load_per_task, max_load - tmp);
 
 		/* Amount of load we'd add */
-		if (max_load*busiest->cpu_power <
-				busiest_load_per_task*SCHED_LOAD_SCALE)
-			tmp = max_load*busiest->cpu_power/this->cpu_power;
+		if (max_load * busiest->cpu_power <
+				busiest_load_per_task * SCHED_LOAD_SCALE)
+			tmp = max_load * busiest->cpu_power / this->cpu_power;
 		else
-			tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
-		pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
+			tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
+				this->cpu_power;
+		pwr_move += this->cpu_power *
+			min(this_load_per_task, this_load + tmp);
 		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain throughput */
@@ -3366,7 +3374,8 @@ void fastcall add_preempt_count(int val)
 	/*
 	 * Spinlock count overflowing soon?
 	 */
-	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+				PREEMPT_MASK - 10);
 }
 EXPORT_SYMBOL(add_preempt_count);
 
@@ -5439,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 		if (!(sd->flags & SD_LOAD_BALANCE)) {
 			printk("does not load-balance\n");
 			if (sd->parent)
-				printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
+				printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+						" has parent");
 			break;
 		}
 
 		printk("span %s\n", str);
 
 		if (!cpu_isset(cpu, sd->span))
-			printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
+			printk(KERN_ERR "ERROR: domain->span does not contain "
+					"CPU%d\n", cpu);
 		if (!cpu_isset(cpu, group->cpumask))
-			printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+			printk(KERN_ERR "ERROR: domain->groups does not contain"
+					" CPU%d\n", cpu);
 
 		printk(KERN_DEBUG);
 		for (i = 0; i < level + 2; i++)
@@ -5463,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
 			if (!group->cpu_power) {
 				printk("\n");
-				printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
+				printk(KERN_ERR "ERROR: domain->cpu_power not "
+						"set\n");
 			}
 
 			if (!cpus_weight(group->cpumask)) {
@@ -5486,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 		printk("\n");
 
 		if (!cpus_equal(sd->span, groupmask))
-			printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+			printk(KERN_ERR "ERROR: groups don't span "
+					"domain->span\n");
 
 		level++;
 		sd = sd->parent;
+		if (!sd)
+			continue;
 
-		if (sd) {
-			if (!cpus_subset(groupmask, sd->span))
-				printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
-		}
+		if (!cpus_subset(groupmask, sd->span))
+			printk(KERN_ERR "ERROR: parent span is not a superset "
+				"of domain->span\n");
 
 	} while (sd);
 }
@@ -5812,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size);
  */
 static void touch_cache(void *__cache, unsigned long __size)
 {
-	unsigned long size = __size/sizeof(long), chunk1 = size/3,
-			chunk2 = 2*size/3;
+	unsigned long size = __size / sizeof(long);
+	unsigned long chunk1 = size / 3;
+	unsigned long chunk2 = 2 * size / 3;
 	unsigned long *cache = __cache;
 	int i;
 
@@ -5922,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
 	 */
 	measure_one(cache, size, cpu1, cpu2);
 	for (i = 0; i < ITERATIONS; i++)
-		cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+		cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
 
 	measure_one(cache, size, cpu2, cpu1);
 	for (i = 0; i < ITERATIONS; i++)
-		cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+		cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
 
 	/*
 	 * (We measure the non-migrating [cached] cost on both
@@ -5936,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
 
 	measure_one(cache, size, cpu1, cpu1);
 	for (i = 0; i < ITERATIONS; i++)
-		cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+		cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
 
 	measure_one(cache, size, cpu2, cpu2);
 	for (i = 0; i < ITERATIONS; i++)
-		cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+		cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
 
 	/*
 	 * Get the per-iteration migration cost:
 	 */
-	do_div(cost1, 2*ITERATIONS);
-	do_div(cost2, 2*ITERATIONS);
+	do_div(cost1, 2 * ITERATIONS);
+	do_div(cost2, 2 * ITERATIONS);
 
 	return cost1 - cost2;
 }
@@ -5984,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
 	 */
 	cache = vmalloc(max_size);
 	if (!cache) {
-		printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
+		printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
 		return 1000000; /* return 1 msec on very small boxen */
 	}
 
@@ -6009,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
 		avg_fluct = (avg_fluct + fluct)/2;
 
 		if (migration_debug)
-			printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+			printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
+				"(%8Ld %8Ld)\n",
 				cpu1, cpu2, size,
 				(long)cost / 1000000,
 				((long)cost / 100000) % 10,
@@ -6104,20 +6121,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
 			-1
 #endif
 		);
-	if (system_state == SYSTEM_BOOTING) {
-		if (num_online_cpus() > 1) {
-			printk("migration_cost=");
-			for (distance = 0; distance <= max_distance; distance++) {
-				if (distance)
-					printk(",");
-				printk("%ld", (long)migration_cost[distance] / 1000);
-			}
-			printk("\n");
+	if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
+		printk("migration_cost=");
+		for (distance = 0; distance <= max_distance; distance++) {
+			if (distance)
+				printk(",");
+			printk("%ld", (long)migration_cost[distance] / 1000);
 		}
+		printk("\n");
 	}
 	j1 = jiffies;
 	if (migration_debug)
-		printk("migration: %ld seconds\n", (j1-j0)/HZ);
+		printk("migration: %ld seconds\n", (j1-j0) / HZ);
 
 	/*
 	 * Move back to the original CPU. NUMA-Q gets confused
-- 
cgit v1.2.3


From f3d19c90fb117a5f080310a4592929aa8e1ad8e9 Mon Sep 17 00:00:00 2001
From: Vadim Lobanov <vlobanov@speakeasy.net>
Date: Sun, 10 Dec 2006 02:21:09 -0800
Subject: [PATCH] fdtable: Delete pointless code in dup_fd()

The dup_fd() function creates a new files_struct and fdtable embedded inside
that files_struct, and then possibly expands the fdtable using expand_files().

The out_release error path is invoked when expand_files() returns an error
code.  However, when this attempt to expand fails, the fdtable is left in its
original embedded form, so it is pointless to try to free the associated
fdarray and fdsets.

Signed-off-by: Vadim Lobanov <vlobanov@speakeasy.net>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 086e172d0d3d..30eab4f063cd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -711,8 +711,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	old_fds = old_fdt->fd;
 	new_fds = new_fdt->fd;
 
-	memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
-	memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
+	memcpy(new_fdt->open_fds->fds_bits,
+		old_fdt->open_fds->fds_bits, open_files/8);
+	memcpy(new_fdt->close_on_exec->fds_bits,
+		old_fdt->close_on_exec->fds_bits, open_files/8);
 
 	for (i = open_files; i != 0; i--) {
 		struct file *f = *old_fds++;
@@ -745,14 +747,11 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 	}
 
-out:
 	return newf;
 
 out_release:
-	free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
-	free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
-	free_fd_array(new_fdt->fd, new_fdt->max_fds);
 	kmem_cache_free(files_cachep, newf);
+out:
 	return NULL;
 }
 
-- 
cgit v1.2.3


From bbea9f69668a3d0cf9feba15a724cd02896f8675 Mon Sep 17 00:00:00 2001
From: Vadim Lobanov <vlobanov@speakeasy.net>
Date: Sun, 10 Dec 2006 02:21:12 -0800
Subject: [PATCH] fdtable: Make fdarray and fdsets equal in size

Currently, each fdtable supports three dynamically-sized arrays of data: the
fdarray and two fdsets.  The code allows the number of fds supported by the
fdarray (fdtable->max_fds) to differ from the number of fds supported by each
of the fdsets (fdtable->max_fdset).

In practice, it is wasteful for these two sizes to differ: whenever we hit a
limit on the smaller-capacity structure, we will reallocate the entire fdtable
and all the dynamic arrays within it, so any delta in the memory used by the
larger-capacity structure will never be touched at all.

Rather than hogging this excess, we shouldn't even allocate it in the first
place, and keep the capacities of the fdarray and the fdsets equal.  This
patch removes fdtable->max_fdset.  As an added bonus, most of the supporting
code becomes simpler.

Signed-off-by: Vadim Lobanov <vlobanov@speakeasy.net>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c |  2 +-
 kernel/fork.c | 24 ++++++------------------
 2 files changed, 7 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 03e64fe4a14a..5f77e76b4f97 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -425,7 +425,7 @@ static void close_files(struct files_struct * files)
 	for (;;) {
 		unsigned long set;
 		i = j * __NFDBITS;
-		if (i >= fdt->max_fdset || i >= fdt->max_fds)
+		if (i >= fdt->max_fds)
 			break;
 		set = fdt->open_fds->fds_bits[j++];
 		while (set) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 30eab4f063cd..aba595424f78 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -614,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
 
 static int count_open_files(struct fdtable *fdt)
 {
-	int size = fdt->max_fdset;
+	int size = fdt->max_fds;
 	int i;
 
 	/* Find the last open fd */
@@ -641,7 +641,6 @@ static struct files_struct *alloc_files(void)
 	newf->next_fd = 0;
 	fdt = &newf->fdtab;
 	fdt->max_fds = NR_OPEN_DEFAULT;
-	fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
 	fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
 	fdt->open_fds = (fd_set *)&newf->open_fds_init;
 	fdt->fd = &newf->fd_array[0];
@@ -662,7 +661,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 {
 	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
-	int open_files, size, i, expand;
+	int open_files, size, i;
 	struct fdtable *old_fdt, *new_fdt;
 
 	*errorp = -ENOMEM;
@@ -673,25 +672,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
 	new_fdt = files_fdtable(newf);
-	size = old_fdt->max_fdset;
 	open_files = count_open_files(old_fdt);
-	expand = 0;
 
 	/*
-	 * Check whether we need to allocate a larger fd array or fd set.
-	 * Note: we're not a clone task, so the open count won't  change.
+	 * Check whether we need to allocate a larger fd array and fd set.
+	 * Note: we're not a clone task, so the open count won't change.
 	 */
-	if (open_files > new_fdt->max_fdset) {
-		new_fdt->max_fdset = 0;
-		expand = 1;
-	}
 	if (open_files > new_fdt->max_fds) {
 		new_fdt->max_fds = 0;
-		expand = 1;
-	}
-
-	/* if the old fdset gets grown now, we'll only copy up to "size" fds */
-	if (expand) {
 		spin_unlock(&oldf->file_lock);
 		spin_lock(&newf->file_lock);
 		*errorp = expand_files(newf, open_files-1);
@@ -739,8 +727,8 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	/* This is long word aligned thus could use a optimized version */ 
 	memset(new_fds, 0, size); 
 
-	if (new_fdt->max_fdset > open_files) {
-		int left = (new_fdt->max_fdset-open_files)/8;
+	if (new_fdt->max_fds > open_files) {
+		int left = (new_fdt->max_fds-open_files)/8;
 		int start = open_files / (8 * sizeof(unsigned long));
 
 		memset(&new_fdt->open_fds->fds_bits[start], 0, left);
-- 
cgit v1.2.3


From 4fd45812cbe875a620c86a096a5d46c742694b7e Mon Sep 17 00:00:00 2001
From: Vadim Lobanov <vlobanov@speakeasy.net>
Date: Sun, 10 Dec 2006 02:21:17 -0800
Subject: [PATCH] fdtable: Remove the free_files field

An fdtable can either be embedded inside a files_struct or standalone (after
being expanded).  When an fdtable is being discarded after all RCU references
to it have expired, we must either free it directly, in the standalone case,
or free the files_struct it is contained within, in the embedded case.

Currently the free_files field controls this behavior, but we can get rid of
it entirely, as all the necessary information is already recorded.  We can
distinguish embedded and standalone fdtables using max_fds, and if it is
embedded we can divine the relevant files_struct using container_of().

Signed-off-by: Vadim Lobanov <vlobanov@speakeasy.net>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 6 ++----
 kernel/fork.c | 1 -
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 5f77e76b4f97..122fadb972fc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -466,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files)
 		 * you can free files immediately.
 		 */
 		fdt = files_fdtable(files);
-		if (fdt == &files->fdtab)
-			fdt->free_files = files;
-		else
+		if (fdt != &files->fdtab)
 			kmem_cache_free(files_cachep, files);
-		free_fdtable(fdt);
+		call_rcu(&fdt->rcu, free_fdtable_rcu);
 	}
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index aba595424f78..d16c566eb645 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -645,7 +645,6 @@ static struct files_struct *alloc_files(void)
 	fdt->open_fds = (fd_set *)&newf->open_fds_init;
 	fdt->fd = &newf->fd_array[0];
 	INIT_RCU_HEAD(&fdt->rcu);
-	fdt->free_files = NULL;
 	fdt->next = NULL;
 	rcu_assign_pointer(newf->fdt, fdt);
 out:
-- 
cgit v1.2.3


From 4c36a5dec25fb344ad76b11860da3a8b50bd1248 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 10 Dec 2006 02:21:24 -0800
Subject: [PATCH] round_jiffies infrastructure

Introduce a round_jiffies() function as well as a round_jiffies_relative()
function.  These functions round a jiffies value to the next whole second.
The primary purpose of this rounding is to cause all "we don't care exactly
when" timers to happen at the same jiffy.

This avoids multiple timers firing within the second for no real reason;
with dynamic ticks these extra timers cause wakeups from deep sleep CPU
sleep states and thus waste power.

The exact wakeup moment is skewed by the cpu number, to avoid all cpus from
waking up at the exact same time (and hitting the same lock/cachelines
there)

[akpm@osdl.org: fix variable type]
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index c1c7fbcffec1..b1f40f256eb0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
 static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
 
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+	int rem;
+	unsigned long original = j;
+
+	/*
+	 * We don't want all cpus firing their timers at once hitting the
+	 * same lock or cachelines, so we skew each extra cpu with an extra
+	 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
+	 * already did this.
+	 * The skew is done by adding 3*cpunr, then round, then subtract this
+	 * extra offset again.
+	 */
+	j += cpu * 3;
+
+	rem = j % HZ;
+
+	/*
+	 * If the target jiffie is just after a whole second (which can happen
+	 * due to delays of the timer irq, long irq off times etc etc) then
+	 * we should round down to the whole second, not up. Use 1/4th second
+	 * as cutoff for this rounding as an extreme upper bound for this.
+	 */
+	if (rem < HZ/4) /* round down */
+		j = j - rem;
+	else /* round up */
+		j = j - rem + HZ;
+
+	/* now that we have rounded, subtract the extra skew again */
+	j -= cpu * 3;
+
+	if (j <= jiffies) /* rounding ate our timeout entirely; */
+		return original;
+	return j;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies);
+
+/**
+ * __round_jiffies_relative - function to round jiffies to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies_relative rounds a time delta  in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long __round_jiffies_relative(unsigned long j, int cpu)
+{
+	/*
+	 * In theory the following code can skip a jiffy in case jiffies
+	 * increments right between the addition and the later subtraction.
+	 * However since the entire point of this function is to use approximate
+	 * timeouts, it's entirely ok to not handle that.
+	 */
+	return  __round_jiffies(j + jiffies, cpu) - jiffies;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_relative);
+
+/**
+ * round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * round_jiffies rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long round_jiffies(unsigned long j)
+{
+	return __round_jiffies(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies);
+
+/**
+ * round_jiffies_relative - function to round jiffies to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * round_jiffies_relative rounds a time delta  in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long round_jiffies_relative(unsigned long j)
+{
+	return __round_jiffies_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_relative);
+
+
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
 {
-- 
cgit v1.2.3


From 2b0137001de68153203dd3bc20e6d27eb7c9719c Mon Sep 17 00:00:00 2001
From: Daniel Walker <dwalker@mvista.com>
Date: Sun, 10 Dec 2006 02:21:30 -0800
Subject: [PATCH] clocksource: add usage of CONFIG_SYSFS

Simply adds some ifdefs to remove clocksoure sysfs code when CONFIG_SYSFS
isn't turn on.

Signed-off-by: Daniel Walker <dwalker@mvista.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/clocksource.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 74eca5939bd9..3651b34cc355 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -186,6 +186,7 @@ void clocksource_reselect(void)
 }
 EXPORT_SYMBOL(clocksource_reselect);
 
+#ifdef CONFIG_SYSFS
 /**
  * sysfs_show_current_clocksources - sysfs interface for current clocksource
  * @dev:	unused
@@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void)
 }
 
 device_initcall(init_clocksource_sysfs);
+#endif /* CONFIG_SYSFS */
 
 /**
  * boot_override_clocksource - boot clock override
-- 
cgit v1.2.3


From f5f1a24a2caa299bb7d294aee92d7dd3410d9ed7 Mon Sep 17 00:00:00 2001
From: Daniel Walker <dwalker@mvista.com>
Date: Sun, 10 Dec 2006 02:21:33 -0800
Subject: [PATCH] clocksource: small cleanup

Mostly changing alignment.  Just some general cleanup.

[akpm@osdl.org: build fix]
Signed-off-by: Daniel Walker <dwalker@mvista.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/clocksource.c |  6 +++---
 kernel/timer.c            | 16 +++++++++++-----
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 3651b34cc355..22504afc0d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c)
 	/* check if clocksource is already registered */
 	if (is_registered_source(c)) {
 		printk("register_clocksource: Cannot register %s. "
-			"Already registered!", c->name);
+		       "Already registered!", c->name);
 		ret = -EBUSY;
 	} else {
 		/* register it */
@@ -276,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
  * Sysfs setup bits:
  */
 static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
-			sysfs_override_clocksource);
+		   sysfs_override_clocksource);
 
 static SYSDEV_ATTR(available_clocksource, 0600,
-			sysfs_show_available_clocksources, NULL);
+		   sysfs_show_available_clocksources, NULL);
 
 static struct sysdev_class clocksource_sysclass = {
 	set_kset_name("clocksource"),
diff --git a/kernel/timer.c b/kernel/timer.c
index b1f40f256eb0..0256ab443d8a 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -846,7 +846,7 @@ static int change_clocksource(void)
 		clock = new;
 		clock->cycle_last = now;
 		printk(KERN_INFO "Time: %s clocksource has been installed.\n",
-					clock->name);
+		       clock->name);
 		return 1;
 	} else if (clock->update_callback) {
 		return clock->update_callback();
@@ -854,7 +854,10 @@ static int change_clocksource(void)
 	return 0;
 }
 #else
-#define change_clocksource() (0)
+static inline int change_clocksource(void)
+{
+	return 0;
+}
 #endif
 
 /**
@@ -952,7 +955,8 @@ device_initcall(timekeeping_init_device);
  * If the error is already larger, we look ahead even further
  * to compensate for late or lost adjustments.
  */
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+						 s64 *offset)
 {
 	s64 tick_error, i;
 	u32 look_ahead, adj;
@@ -976,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *
 	 * Now calculate the error in (1 << look_ahead) ticks, but first
 	 * remove the single look ahead already included in the error.
 	 */
-	tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
+	tick_error = current_tick_length() >>
+		(TICK_LENGTH_SHIFT - clock->shift + 1);
 	tick_error -= clock->xtime_interval >> 1;
 	error = ((error - tick_error) >> look_ahead) + tick_error;
 
@@ -1028,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
 	clock->mult += adj;
 	clock->xtime_interval += interval;
 	clock->xtime_nsec -= offset;
-	clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
+	clock->error -= (interval - offset) <<
+			(TICK_LENGTH_SHIFT - clock->shift);
 }
 
 /**
-- 
cgit v1.2.3