From 49c3df6aaa6a51071fc135273d1a2515d019099f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86: Move swsusp __pa() dependent code to arch portion o __pa() should be used only on kernel linearly mapped virtual addresses and not on kernel text and data addresses. o Hibernation code needs to determine the physical address associated with a kernel symbol to mark a section boundary; that section contains pages which don't have to be saved and restored during a hibernate/resume operation. o Move this piece of code into the arch-dependent section, so that architectures which don't have kernel text/data mapped into the kernel's linearly mapped region can come up with their own ways of determining the physical addresses associated with kernel text. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- kernel/power/power.h | 5 ++--- kernel/power/snapshot.c | 11 ----------- 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index eb461b816bf4..1c6eef8df4ad 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -23,6 +23,8 @@ static inline int pm_suspend_disk(void) } #endif +extern int pfn_is_nosave(unsigned long); + extern struct mutex pm_mutex; #define power_attr(_name) \ @@ -37,9 +39,6 @@ static struct subsys_attribute _name##_attr = { \ extern struct subsystem power_subsys; -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; extern int in_suspend; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index fc53ad068128..704c25a3ffec 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -650,17 +650,6 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } static inline unsigned int count_highmem_pages(void) { return 0; } #endif /* CONFIG_HIGHMEM */ -/** - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -static inline int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - /** * saveable - Determine whether a non-highmem page should be included in * the suspend image. -- cgit v1.2.3 From 1b29c1643c0d82512477ccd97dc290198fe23e22 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: do not use virt_to_page on kernel data address o virt_to_page() should be used only on kernel linearly mapped addresses and not on kernel text and data addresses. The swsusp code uses it on kernel data (the statically allocated swsusp_header). o Allocate swsusp_header dynamically so that virt_to_page() can be used safely. o I am changing this because in the next few patches, __pa() on x86_64 will no longer support kernel text and data addresses and hibernation breaks.
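For illustration only, a minimal sketch (not part of the patch) of the rule this changelog relies on; the names here are hypothetical:

#include <linux/gfp.h>
#include <linux/mm.h>

/* Kernel .bss/.data is not guaranteed to be covered by virt_to_page()
 * and __pa() once x86-64 stops mapping kernel text/data through the
 * linear range; page-allocator memory always is. */
static char static_header[PAGE_SIZE];

static void example(void)
{
	void *dyn = (void *)__get_free_page(GFP_KERNEL);

	if (!dyn)
		return;
	virt_to_page(dyn);		/* safe: linear mapping */
	virt_to_page(static_header);	/* unsafe after this series */
	free_page((unsigned long)dyn);
}
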
Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- kernel/power/swap.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3581f8f86acd..b18c155cbb60 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -33,12 +33,14 @@ extern char resume_file[]; #define SWSUSP_SIG "S1SUSPEND" -static struct swsusp_header { +struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; sector_t image; char orig_sig[10]; char sig[10]; -} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; +} __attribute__((packed)); + +static struct swsusp_header *swsusp_header; /* * General things @@ -141,14 +143,14 @@ static int mark_swapfiles(sector_t start) { int error; - bio_read_page(swsusp_resume_block, &swsusp_header, NULL); - if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || - !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { - memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); - memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - swsusp_header.image = start; + bio_read_page(swsusp_resume_block, swsusp_header, NULL); + if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || + !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { + memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); + memcpy(swsusp_header->sig,SWSUSP_SIG, 10); + swsusp_header->image = start; error = bio_write_page(swsusp_resume_block, - &swsusp_header, NULL); + swsusp_header, NULL); } else { printk(KERN_ERR "swsusp: Swap header not found!\n"); error = -ENODEV; @@ -564,7 +566,7 @@ int swsusp_read(void) if (error < PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, swsusp_header.image); + error = get_swap_reader(&handle, swsusp_header->image); if (!error) error = swap_read_page(&handle, header, NULL); if (!error) @@ -591,17 +593,17 @@ int swsusp_check(void) resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); - memset(&swsusp_header, 0, sizeof(swsusp_header)); + memset(swsusp_header, 0, PAGE_SIZE); error = bio_read_page(swsusp_resume_block, - &swsusp_header, NULL); + swsusp_header, NULL); if (error) return error; - if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { - memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ error = bio_write_page(swsusp_resume_block, - &swsusp_header, NULL); + swsusp_header, NULL); } else { return -EINVAL; } @@ -632,3 +634,13 @@ void swsusp_close(void) blkdev_put(resume_bdev); } + +static int swsusp_header_init(void) +{ + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); + if (!swsusp_header) + panic("Could not allocate memory for swsusp_header\n"); + return 0; +} + +core_initcall(swsusp_header_init); -- cgit v1.2.3 From b00742d399513a4100c24cc2accefdc1bb1e0b15 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH] x86-64: Account for module percpu space separately from kernel percpu Rather than using a single constant PERCPU_ENOUGH_ROOM, compute it as the sum of kernel_percpu + PERCPU_MODULE_RESERVE. This is now common to all architectures; if an architecture wants to set PERCPU_ENOUGH_ROOM to something special, then it may do so (ia64 is the only one which does).
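To make the new sizing concrete, a sketch of the resulting definition (assumed form; the actual macro lives in include/linux/percpu.h, and the reserve value shown is illustrative):

/* Assumed shape of the per-CPU sizing after this change: room for the
 * kernel's static per-CPU data plus a fixed reserve for modules,
 * instead of one magic constant. */
extern char __per_cpu_start[], __per_cpu_end[];

#ifndef PERCPU_MODULE_RESERVE
#define PERCPU_MODULE_RESERVE	8192	/* illustrative value */
#endif

#ifndef PERCPU_ENOUGH_ROOM
#define PERCPU_ENOUGH_ROOM \
	((__per_cpu_end - __per_cpu_start) + PERCPU_MODULE_RESERVE)
#endif
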
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Eric W. Biederman Cc: Andi Kleen --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9da5af668a20..cf49ca25fcce 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -430,7 +430,7 @@ static int percpu_modinit(void) pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, GFP_KERNEL); /* Static in-kernel percpu data (used). */ - pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES); + pcpu_size[0] = -(__per_cpu_end-__per_cpu_start); /* Free room. */ pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; if (pcpu_size[1] < 0) { -- cgit v1.2.3 From b6e3590f8145c77b8fcef3247e2412335221412f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH] x86: Allow percpu variables to be page-aligned Let's allow page-alignment in general for per-cpu data (wanted by Xen, and Ingo suggested KVM as well). Because larger alignments can use more room, we increase the max per-cpu memory to 64k rather than 32k: it's getting a little tight. Signed-off-by: Rusty Russell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Andrew Morton --- kernel/module.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index cf49ca25fcce..4dc4a257545c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -346,10 +346,10 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, unsigned int i; void *ptr; - if (align > SMP_CACHE_BYTES) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", - name, align, SMP_CACHE_BYTES); - align = SMP_CACHE_BYTES; + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; } ptr = __per_cpu_start; -- cgit v1.2.3 From d6dd61c831226f9cd7750885da04d360d6455101 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:14 +0200 Subject: [PATCH] x86: PARAVIRT: add hooks to intercept mm creation and destruction Add hooks to allow a paravirt implementation to track the lifetime of an mm. Paravirtualization requires three hooks, but only two are needed in common code. They are: arch_dup_mmap, which is called when a new mmap is created at fork arch_exit_mmap, which is called when the last process reference to an mm is dropped, which typically happens on exit and exec. The third hook is activate_mm, which is called from the arch-specific activate_mm() macro/function, and so doesn't need stub versions for other architectures. It's called when an mm is first used. 
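For architectures that do not paravirtualize, the two common-code hooks collapse to no-ops; a sketch of the stubs (assumed to match the generic fallback this series introduces for other architectures):

struct mm_struct;

/* No-op stubs for a non-paravirt architecture, so that dup_mmap() and
 * the mm teardown path can call the hooks unconditionally. */
static inline void arch_dup_mmap(struct mm_struct *oldmm,
				 struct mm_struct *mm)
{
}

static inline void arch_exit_mmap(struct mm_struct *mm)
{
}
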
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: linux-arch@vger.kernel.org Cc: James Bottomley Acked-by: Ingo Molnar --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 6af959c034d8..ffccefb28b6a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -286,6 +286,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); retval = 0; out: up_write(&mm->mmap_sem); -- cgit v1.2.3 From 823bccfc4002296ba88c3ad0f049e1abd8108d30 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 13 Apr 2007 13:15:19 -0700 Subject: remove "struct subsystem" as it is no longer needed We need to work on cleaning up the relationship between kobjects, ksets and ktypes. The removal of 'struct subsystem' is the first step of this, especially as it is not really needed at all. Thanks to Kay for fixing the bugs in this patch. Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 12 ++++++------ kernel/module.c | 8 +++++--- kernel/params.c | 2 ++ kernel/power/disk.c | 14 +++++++------- kernel/power/main.c | 10 +++++----- kernel/power/power.h | 2 +- 6 files changed, 26 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e0ffe4ab0917..559deca5ed15 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -24,18 +24,18 @@ static struct subsys_attribute _name##_attr = \ #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) /* current uevent sequence number */ -static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) +static ssize_t uevent_seqnum_show(struct kset *kset, char *page) { return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); } KERNEL_ATTR_RO(uevent_seqnum); /* uevent helper program, used during early boo */ -static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) +static ssize_t uevent_helper_show(struct kset *kset, char *page) { return sprintf(page, "%s\n", uevent_helper); } -static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) +static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count) { if (count+1 > UEVENT_HELPER_PATH_LEN) return -ENOENT; @@ -49,13 +49,13 @@ KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_KEXEC -static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) +static ssize_t kexec_loaded_show(struct kset *kset, char *page) { return sprintf(page, "%d\n", !!kexec_image); } KERNEL_ATTR_RO(kexec_loaded); -static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) +static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) { return sprintf(page, "%d\n", !!kexec_crash_image); } @@ -85,7 +85,7 @@ static int __init ksysfs_init(void) { int error = subsystem_register(&kernel_subsys); if (!error) - error = sysfs_create_group(&kernel_subsys.kset.kobj, + error = sysfs_create_group(&kernel_subsys.kobj, &kernel_attr_group); return error; diff --git a/kernel/module.c b/kernel/module.c index 9da5af668a20..ff982ec435fc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -45,6 +45,8 @@ #include #include +extern int module_sysfs_initialized; + #if 0 #define DEBUGP printk #else @@ -1117,8 +1119,8 @@ int mod_sysfs_init(struct module *mod) { int err; - if (!module_subsys.kset.subsys) { - printk(KERN_ERR "%s: module_subsys not initialized\n", + if (!module_sysfs_initialized) { + printk(KERN_ERR "%s: module sysfs not 
initialized\n", mod->name); err = -EINVAL; goto out; @@ -2385,7 +2387,7 @@ void module_add_driver(struct module *mod, struct device_driver *drv) struct kobject *mkobj; /* Lookup built-in module entry in /sys/modules */ - mkobj = kset_find_obj(&module_subsys.kset, drv->mod_name); + mkobj = kset_find_obj(&module_subsys, drv->mod_name); if (mkobj) { mk = container_of(mkobj, struct module_kobject, kobj); /* remember our module structure */ diff --git a/kernel/params.c b/kernel/params.c index 1fc4ac746cd8..312172320b4c 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -691,6 +691,7 @@ static struct kset_uevent_ops module_uevent_ops = { }; decl_subsys(module, &module_ktype, &module_uevent_ops); +int module_sysfs_initialized; static struct kobj_type module_ktype = { .sysfs_ops = &module_sysfs_ops, @@ -709,6 +710,7 @@ static int __init param_sysfs_init(void) __FILE__, __LINE__, ret); return ret; } + module_sysfs_initialized = 1; param_sysfs_builtin(); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 02e4fb69111a..8df51c23bba4 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -322,13 +322,13 @@ static const char * const pm_disk_modes[] = { * supports it (as determined from pm_ops->pm_disk_mode). */ -static ssize_t disk_show(struct subsystem * subsys, char * buf) +static ssize_t disk_show(struct kset *kset, char *buf) { return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]); } -static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) +static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) { int error = 0; int i; @@ -373,13 +373,13 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) power_attr(disk); -static ssize_t resume_show(struct subsystem * subsys, char *buf) +static ssize_t resume_show(struct kset *kset, char *buf) { return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } -static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) +static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) { unsigned int maj, min; dev_t res; @@ -405,12 +405,12 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) power_attr(resume); -static ssize_t image_size_show(struct subsystem * subsys, char *buf) +static ssize_t image_size_show(struct kset *kset, char *buf) { return sprintf(buf, "%lu\n", image_size); } -static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n) { unsigned long size; @@ -439,7 +439,7 @@ static struct attribute_group attr_group = { static int __init pm_disk_init(void) { - return sysfs_create_group(&power_subsys.kset.kobj,&attr_group); + return sysfs_create_group(&power_subsys.kobj, &attr_group); } core_initcall(pm_disk_init); diff --git a/kernel/power/main.c b/kernel/power/main.c index 72419a3b1beb..b21c2a56f960 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -285,7 +285,7 @@ decl_subsys(power,NULL,NULL); * proper enumerated value, and initiates a suspend transition. 
*/ -static ssize_t state_show(struct subsystem * subsys, char * buf) +static ssize_t state_show(struct kset *kset, char *buf) { int i; char * s = buf; @@ -298,7 +298,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) return (s - buf); } -static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t state_store(struct kset *kset, const char *buf, size_t n) { suspend_state_t state = PM_SUSPEND_STANDBY; const char * const *s; @@ -325,13 +325,13 @@ power_attr(state); #ifdef CONFIG_PM_TRACE int pm_trace_enabled; -static ssize_t pm_trace_show(struct subsystem * subsys, char * buf) +static ssize_t pm_trace_show(struct kset *kset, char *buf) { return sprintf(buf, "%d\n", pm_trace_enabled); } static ssize_t -pm_trace_store(struct subsystem * subsys, const char * buf, size_t n) +pm_trace_store(struct kset *kset, const char *buf, size_t n) { int val; @@ -365,7 +365,7 @@ static int __init pm_init(void) { int error = subsystem_register(&power_subsys); if (!error) - error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group); + error = sysfs_create_group(&power_subsys.kobj,&attr_group); return error; } diff --git a/kernel/power/power.h b/kernel/power/power.h index eb461b816bf4..5f842c3efc4b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -35,7 +35,7 @@ static struct subsys_attribute _name##_attr = { \ .store = _name##_store, \ } -extern struct subsystem power_subsys; +extern struct kset power_subsys; /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; -- cgit v1.2.3 From 7fe3730de729b758e9f69b862b9255d998671b5f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 18 Apr 2007 19:39:21 +1000 Subject: MSI: arch must connect the irq and the msi_desc set_irq_msi() currently connects an irq_desc to an msi_desc. The archs call it at some point in their setup routine, and then the generic code sets up the reverse mapping from the msi_desc back to the irq. set_irq_msi() should do both connections, making it the one and only call required to connect an irq with it's MSI desc and vice versa. The arch code MUST call set_irq_msi(), and it must do so only once it's sure it's not going to fail the irq allocation. Given that there's no need for the arch to return the irq anymore, the return value from the arch setup routine just becomes 0 for success and anything else for failure. Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- kernel/irq/chip.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0133f4f9e9f0..615ce97c6cfd 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -185,6 +186,8 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; + if (entry) + entry->irq = irq; spin_unlock_irqrestore(&desc->lock, flags); return 0; } -- cgit v1.2.3 From 543b9fd3528f64c4b20439de0edb453764482de7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 3 May 2007 22:31:38 +1000 Subject: [POWERPC] powermac: Suspend to disk on G5 Powermac G5 suspend to disk implementation. The code is platform agnostic but only tested on powermac, no other 64-bit powerpc machines. Because nvidiafb still breaks suspend I have marked it EXPERIMENTAL on powermac and because I can't test it and some lowlevel code will need changes it is BROKEN on all other 64-bit platforms. 
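Looking back at the set_irq_msi() change above: the contract it establishes for arch code can be sketched as follows; the arch-side helpers named here are hypothetical:

/* Hypothetical arch MSI setup routine under the new contract: allocate
 * the irq first, call set_irq_msi() only once allocation can no longer
 * fail, and return 0 for success instead of the irq number. */
static int example_arch_setup_msi_irq(struct pci_dev *pdev,
				      struct msi_desc *desc)
{
	int irq = example_alloc_irq_vector(pdev);	/* hypothetical */

	if (irq < 0)
		return irq;	/* nothing has been connected yet */

	set_irq_msi(irq, desc);	/* now links both irq_desc->msi_desc
				 * and desc->irq */
	example_write_msi_msg(pdev, irq);		/* hypothetical */
	return 0;
}
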
Signed-off-by: Johannes Berg Acked-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- kernel/power/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 51a4dd0f1b74..5001c652028c 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -79,7 +79,7 @@ config PM_SYSFS_DEPRECATED config SOFTWARE_SUSPEND bool "Software Suspend" - depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) + depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) ---help--- Enable the suspend to disk (STD) functionality. @@ -134,7 +134,7 @@ config PM_STD_PARTITION config SUSPEND_SMP bool - depends on HOTPLUG_CPU && X86 && PM + depends on HOTPLUG_CPU && (X86 || PPC64) && PM default y config APM_EMULATION -- cgit v1.2.3 From 476f35348eb8d2a827765992899fea78b7dcc46f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 6 May 2007 14:48:58 -0700 Subject: Safer nr_node_ids and nr_cpu_ids determination and initial values The nr_cpu_ids value is currently only calculated in smp_init. However, it may be needed earlier (SLUB needs it in kmem_cache_init!) and other kernel components may also want to allocate dynamically sized per-cpu arrays before smp_init. So move the determination of possible cpus into sched_init() where we already loop over all possible cpus early in boot. Also initialize both nr_node_ids and nr_cpu_ids with the highest value they could take. If we have accidental users before these values are determined then the current value of 0 may cause too small per cpu and per node arrays to be allocated. If it is set to the maximum possible then we only waste some memory for early boot users. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 960d7c5fca39..0227f1625a75 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5244,6 +5244,11 @@ int __init migration_init(void) #endif #ifdef CONFIG_SMP + +/* Number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + #undef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) @@ -6726,6 +6731,7 @@ int in_sched_functions(unsigned long addr) void __init sched_init(void) { int i, j, k; + int highest_cpu = 0; for_each_possible_cpu(i) { struct prio_array *array; @@ -6760,11 +6766,13 @@ void __init sched_init(void) // delimiter for bitsearch __set_bit(MAX_PRIO, array->bitmap); } + highest_cpu = i; } set_load_weight(&init_task); #ifdef CONFIG_SMP + nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); #endif -- cgit v1.2.3 From c596d9f320aaf30d28c1d793ff3a976dee1db8f5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 6 May 2007 14:49:32 -0700 Subject: cpusets: allow TIF_MEMDIE threads to allocate anywhere OOM killed tasks have access to memory reserves as specified by the TIF_MEMDIE flag in the hope that they will quickly exit. If such a task has memory allocations constrained by cpusets, we may encounter a deadlock if a blocking task cannot exit because it cannot allocate the necessary memory.
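As an aside on the nr_cpu_ids change above, a sketch of the early-boot usage it enables; the consumer shown here is hypothetical:

extern int nr_cpu_ids;	/* NR_CPUS until sched_init() refines it */

static void *example_table[NR_CPUS];	/* hypothetical early consumer */

static void __init example_early_init(void)
{
	int cpu;

	/* Safe even before smp_init(): nr_cpu_ids is never 0 anymore,
	 * only (at worst) pessimistically large. */
	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
		example_table[cpu] = NULL;
}
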
We allow tasks that have the TIF_MEMDIE flag to allocate memory anywhere, including outside its cpuset restriction, so that it can quickly die regardless of whether it is __GFP_HARDWALL. Cc: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f382b0f775e1..d240349cbf0f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2351,6 +2351,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * z's node is in our tasks mems_allowed, yes. If it's not a * __GFP_HARDWALL request and this zone's nodes is in the nearest * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * If the task has been OOM killed and has access to memory reserves + * as specified by the TIF_MEMDIE flag, yes. * Otherwise, no. * * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() @@ -2368,7 +2370,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * calls get to this routine, we should just shut up and say 'yes'. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, - * and do not allow allocations outside the current tasks cpuset. + * and do not allow allocations outside the current tasks cpuset + * unless the task has been OOM killed as is marked TIF_MEMDIE. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing mem_exclusive ancestor cpuset. * @@ -2392,6 +2395,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok + * TIF_MEMDIE - any node ok * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. * @@ -2413,6 +2417,12 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return 1; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return 0; @@ -2438,7 +2448,9 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) * * If we're in interrupt, yes, we can always allocate. * If __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. Otherwise, no. + * z's node is in our tasks mems_allowed, yes. If the task has been + * OOM killed and has access to memory reserves as specified by the + * TIF_MEMDIE flag, yes. Otherwise, no. * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by @@ -2462,6 +2474,12 @@ int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) node = zone_to_nid(z); if (node_isset(node, current->mems_allowed)) return 1; + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. 
+ */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return 1; return 0; } -- cgit v1.2.3 From 0a31bd5f2bbb6473ef9d24f0063ca91cfa678b64 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 6 May 2007 14:49:57 -0700 Subject: KMEM_CACHE(): simplify slab cache creation This patch provides a new macro KMEM_CACHE(struct, flags) to simplify slab creation. KMEM_CACHE creates a slab with the name of the struct, with the size of the struct and with the alignment of the struct. Additional slab flags may be specified if necessary. Example: struct test_slab { int a,b,c; struct list_head; } __cacheline_aligned_in_smp; test_slab_cache = KMEM_CACHE(test_slab, SLAB_PANIC) will create a new slab named "test_slab" of the size sizeof(struct test_slab) and aligned to the alignment of struct test_slab. If it fails then we panic. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 6 +----- kernel/pid.c | 4 +--- kernel/signal.c | 6 +----- kernel/taskstats.c | 4 +--- 4 files changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 766d5912b26a..c0148ae992c4 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -31,11 +31,7 @@ __setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { - delayacct_cache = kmem_cache_create("delayacct_cache", - sizeof(struct task_delay_info), - 0, - SLAB_PANIC, - NULL, NULL); + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); delayacct_tsk_init(&init_task); } diff --git a/kernel/pid.c b/kernel/pid.c index 78f2aee90f54..9c80bc23d6b8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -412,7 +412,5 @@ void __init pidmap_init(void) set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - pid_cachep = kmem_cache_create("pid", sizeof(struct pid), - __alignof__(struct pid), - SLAB_PANIC, NULL, NULL); + pid_cachep = KMEM_CACHE(pid, SLAB_PANIC); } diff --git a/kernel/signal.c b/kernel/signal.c index 3670225ecbc0..2b4087d545a3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2636,9 +2636,5 @@ __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { - sigqueue_cachep = - kmem_cache_create("sigqueue", - sizeof(struct sigqueue), - __alignof__(struct sigqueue), - SLAB_PANIC, NULL, NULL); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ad7d2392cb0e..906cae771585 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -524,9 +524,7 @@ void __init taskstats_init_early(void) { unsigned int i; - taskstats_cache = kmem_cache_create("taskstats_cache", - sizeof(struct taskstats), - 0, SLAB_PANIC, NULL, NULL); + taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); for_each_possible_cpu(i) { INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); init_rwsem(&(per_cpu(listener_array, i).sem)); } -- cgit v1.2.3 From 50953fe9e00ebbeffa032a565ab2f08312d51a87 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 6 May 2007 14:50:16 -0700 Subject: slab allocators: Remove SLAB_DEBUG_INITIAL flag I have never seen a use of SLAB_DEBUG_INITIAL. It is only supported by SLAB. I think its purpose was to have a callback after an object has been freed to verify that the state is the constructor state again? The callback is performed before each freeing of an object. I would think that it is much easier to check the object state manually before the free.
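As an aside, the KMEM_CACHE() macro introduced above can be sketched as follows (assumed form, using the kmem_cache_create() signature of this era, which still took constructor and destructor arguments):

#define KMEM_CACHE(__struct, __flags)					\
	kmem_cache_create(#__struct, sizeof(struct __struct),		\
			  __alignof__(struct __struct), (__flags),	\
			  NULL, NULL)

/* Usage, per the changelog example:
 *	test_slab_cache = KMEM_CACHE(test_slab, SLAB_PANIC);
 * Name, size and alignment all derive from the struct, so they cannot
 * drift out of sync. */
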
That also places the check near the code that manipulates the object. Also the SLAB_DEBUG_INITIAL callback is only performed if the kernel was compiled with SLAB debugging on. If there were code in a constructor handling SLAB_DEBUG_INITIAL then it would have to be conditional on SLAB_DEBUG; otherwise it would just be dead code. But there is no such code in the kernel. I think SLAB_DEBUG_INITIAL is too problematic to make real use of: it is difficult to understand and there are easier ways to accomplish the same effect (i.e. add debug code before kfree). There is a related flag SLAB_CTOR_VERIFY that is frequently checked to be clear in fs inode caches. Remove the pointless checks (they would even be pointless without the removal of SLAB_DEBUG_INITIAL) from the fs constructors. This is the last slab flag that SLUB did not support. Remove the check for unimplemented flags from SLUB. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ffccefb28b6a..b7d169def942 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1425,8 +1425,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long fl { struct sighand_struct *sighand = data; - if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) + if (flags & SLAB_CTOR_CONSTRUCTOR) spin_lock_init(&sighand->siglock); } -- cgit v1.2.3 From 73243284463a761e04d69d22c7516b2be7de096c Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 6 May 2007 14:50:20 -0700 Subject: Return EPERM not ECHILD on security_task_wait failure wait* syscalls return -ECHILD even when an individual PID of a live child was requested explicitly, when security_task_wait denies the operation. This means that something like a broken SELinux policy can produce an unexpected failure that looks just like a bug in wait or ptrace. This patch makes do_wait return -EACCES (or another appropriate error returned from security_task_wait()) instead of -ECHILD if some children were ruled out solely because security_task_wait failed. [jmorris@namei.org: switch error code to EACCES] Signed-off-by: Roland McGrath Acked-by: Stephen Smalley Cc: Chris Wright Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b55ed4cc9104..92369240d91d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1033,6 +1033,8 @@ asmlinkage void sys_exit_group(int error_code) static int eligible_child(pid_t pid, int options, struct task_struct *p) { + int err; + if (pid > 0) { if (p->pid != pid) return 0; @@ -1066,8 +1068,9 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p) if (delay_group_leader(p)) return 2; - if (security_task_wait(p)) - return 0; + err = security_task_wait(p); + if (err) + return err; return 1; } @@ -1449,6 +1452,7 @@ static long do_wait(pid_t pid, int options, struct siginfo __user *infop, DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int flag, retval; + int allowed, denied; add_wait_queue(&current->signal->wait_chldexit,&wait); repeat: /* * We will set this flag if we see any child that might later * match our criteria, even if we are not able to reap it yet.
*/ flag = 0; + allowed = denied = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); tsk = current; @@ -1472,6 +1477,12 @@ repeat: if (!ret) continue; + if (unlikely(ret < 0)) { + denied = ret; + continue; + } + allowed = 1; + switch (p->state) { case TASK_TRACED: /* @@ -1570,6 +1581,8 @@ check_continued: goto repeat; } retval = -ECHILD; + if (unlikely(denied) && !allowed) + retval = denied; end: current->state = TASK_RUNNING; remove_wait_queue(&current->signal->wait_chldexit,&wait); -- cgit v1.2.3 From 433ecb4ab312f873870b67ee374502e84f6dcf92 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 6 May 2007 14:50:40 -0700 Subject: fix refrigerator() vs thaw_process() race refrigerator() can miss a wakeup; its "wait event" loop needs proper memory ordering. Signed-off-by: Oleg Nesterov Acked-by: "Rafael J. Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 6d566bf7085c..0eb5c420e8ed 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -47,8 +47,10 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(&current->sighand->siglock); - while (frozen(current)) { - current->state = TASK_UNINTERRUPTIBLE; + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!frozen(current)) + break; schedule(); } pr_debug("%s left refrigerator\n", current->comm); -- cgit v1.2.3 From 7be9823491ecbaf9700d7d3502cb4b4dd0ed868a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 6 May 2007 14:50:42 -0700 Subject: swsusp: use inline functions for changing page flags Replace direct invocations of SetPageNosave(), SetPageNosaveFree() etc. with calls to inline functions that can be changed in subsequent patches without modifying the code calling them. Signed-off-by: Rafael J.
Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 704c25a3ffec..48fc7a35571b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -67,15 +67,15 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) - while (res && PageNosaveFree(virt_to_page(res))) { + while (res && swsusp_page_is_free(virt_to_page(res))) { /* The page is unsafe, mark it for swsusp_free() */ - SetPageNosave(virt_to_page(res)); + swsusp_set_page_forbidden(virt_to_page(res)); allocated_unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); } if (res) { - SetPageNosave(virt_to_page(res)); - SetPageNosaveFree(virt_to_page(res)); + swsusp_set_page_forbidden(virt_to_page(res)); + swsusp_set_page_free(virt_to_page(res)); } return res; } @@ -91,8 +91,8 @@ static struct page *alloc_image_page(gfp_t gfp_mask) page = alloc_page(gfp_mask); if (page) { - SetPageNosave(page); - SetPageNosaveFree(page); + swsusp_set_page_forbidden(page); + swsusp_set_page_free(page); } return page; } @@ -110,9 +110,9 @@ static inline void free_image_page(void *addr, int clear_nosave_free) page = virt_to_page(addr); - ClearPageNosave(page); + swsusp_unset_page_forbidden(page); if (clear_nosave_free) - ClearPageNosaveFree(page); + swsusp_unset_page_free(page); __free_page(page); } @@ -615,7 +615,8 @@ static struct page *saveable_highmem_page(unsigned long pfn) BUG_ON(!PageHighMem(page)); - if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || + PageReserved(page)) return NULL; return page; @@ -670,7 +671,7 @@ static struct page *saveable_page(unsigned long pfn) BUG_ON(PageHighMem(page)); - if (PageNosave(page) || PageNosaveFree(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; if (PageReserved(page) && pfn_is_nosave(pfn)) @@ -810,9 +811,10 @@ void swsusp_free(void) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); - if (PageNosave(page) && PageNosaveFree(page)) { - ClearPageNosave(page); - ClearPageNosaveFree(page); + if (swsusp_page_is_forbidden(page) && + swsusp_page_is_free(page)) { + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); __free_page(page); } } @@ -1135,7 +1137,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) - ClearPageNosaveFree(pfn_to_page(pfn)); + swsusp_unset_page_free(pfn_to_page(pfn)); } /* Mark pages that correspond to the "original" pfns as "unsafe" */ @@ -1144,7 +1146,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { if (likely(pfn_valid(pfn))) - SetPageNosaveFree(pfn_to_page(pfn)); + swsusp_set_page_free(pfn_to_page(pfn)); else return -EFAULT; } @@ -1310,14 +1312,14 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) struct page *page; page = alloc_page(__GFP_HIGHMEM); - if (!PageNosaveFree(page)) { + if (!swsusp_page_is_free(page)) { /* The page is "safe", set its bit the bitmap */ memory_bm_set_bit(bm, page_to_pfn(page)); safe_highmem_pages++; } /* Mark the page as allocated */ - 
SetPageNosave(page); - SetPageNosaveFree(page); + swsusp_set_page_forbidden(page); + swsusp_set_page_free(page); } memory_bm_position_reset(bm); safe_highmem_bm = bm; @@ -1349,7 +1351,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) struct highmem_pbe *pbe; void *kaddr; - if (PageNosave(page) && PageNosaveFree(page)) { + if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { /* We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ @@ -1511,14 +1513,14 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) error = -ENOMEM; goto Free; } - if (!PageNosaveFree(virt_to_page(lp))) { + if (!swsusp_page_is_free(virt_to_page(lp))) { /* The page is "safe", add it to the list */ lp->next = safe_pages_list; safe_pages_list = lp; } /* Mark the page as allocated */ - SetPageNosave(virt_to_page(lp)); - SetPageNosaveFree(virt_to_page(lp)); + swsusp_set_page_forbidden(virt_to_page(lp)); + swsusp_set_page_free(virt_to_page(lp)); nr_pages--; } /* Free the reserved safe pages so that chain_alloc() can use them */ @@ -1547,7 +1549,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) if (PageHighMem(page)) return get_highmem_page_buffer(page, ca); - if (PageNosave(page) && PageNosaveFree(page)) + if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) /* We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ -- cgit v1.2.3 From 74dfd666de861c97d47bdbd892f6d21b801d0247 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 6 May 2007 14:50:43 -0700 Subject: swsusp: do not use page flags Make swsusp use memory bitmaps instead of page flags for marking 'nosave' and free pages. This allows us to 'recycle' two page flags that can be used for other purposes. Also, the memory needed to store the bitmaps is allocated when necessary (ie. before the suspend) and freed after the resume which is more reasonable. The patch is designed to minimize the amount of changes and there are some nice simplifications and optimizations possible on top of it. I am going to implement them separately in the future. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 23 +++-- kernel/power/main.c | 2 + kernel/power/power.h | 2 + kernel/power/snapshot.c | 250 ++++++++++++++++++++++++++++++++++++++++++++---- kernel/power/user.c | 4 + 5 files changed, 259 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 8df51c23bba4..403bc3722fee 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -139,14 +139,19 @@ int pm_suspend_disk(void) mdelay(5000); goto Thaw; } + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; + /* Free memory before shutting down devices. 
*/ error = swsusp_shrink_memory(); if (error) - goto Thaw; + goto Finish; error = platform_prepare(); if (error) - goto Thaw; + goto Finish; suspend_console(); error = device_suspend(PMSG_FREEZE); @@ -181,7 +186,7 @@ int pm_suspend_disk(void) power_down(); else { swsusp_free(); - goto Thaw; + goto Finish; } } else { pr_debug("PM: Image restored successfully.\n"); @@ -194,6 +199,8 @@ int pm_suspend_disk(void) platform_finish(); device_resume(); resume_console(); + Finish: + free_basic_memory_bitmaps(); Thaw: unprepare_processes(); return error; @@ -239,13 +246,15 @@ static int software_resume(void) } pr_debug("PM: Checking swsusp image.\n"); - error = swsusp_check(); if (error) - goto Done; + goto Unlock; - pr_debug("PM: Preparing processes for restore.\n"); + error = create_basic_memory_bitmaps(); + if (error) + goto Unlock; + pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); if (error) { swsusp_close(); @@ -280,7 +289,9 @@ static int software_resume(void) printk(KERN_ERR "PM: Restore failed, recovering.\n"); unprepare_processes(); Done: + free_basic_memory_bitmaps(); /* For success case, the suspend path will release the lock */ + Unlock: mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return 0; diff --git a/kernel/power/main.c b/kernel/power/main.c index b21c2a56f960..5a779270cdbc 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -244,6 +244,7 @@ static int enter_state(suspend_state_t state) return error; } +#ifdef CONFIG_SOFTWARE_SUSPEND /* * This is main interface to the outside world. It needs to be * called from process context. @@ -252,6 +253,7 @@ int software_suspend(void) { return enter_state(PM_SUSPEND_DISK); } +#endif /** diff --git a/kernel/power/power.h b/kernel/power/power.h index 33bd94ceba32..1f8052bda0f7 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -48,6 +48,8 @@ extern sector_t swsusp_resume_block; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); +extern int create_basic_memory_bitmaps(void); +extern void free_basic_memory_bitmaps(void); extern unsigned int count_data_pages(void); /** diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 48fc7a35571b..f66e4411795b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,10 @@ #include "power.h" +static int swsusp_page_is_free(struct page *); +static void swsusp_set_page_forbidden(struct page *); +static void swsusp_unset_page_forbidden(struct page *); + /* List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been * allocated by the "resume" kernel, so their contents cannot be written @@ -224,11 +229,6 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * of type unsigned long each). It also contains the pfns that * correspond to the start and end of the represented memory area and * the number of bit chunks in the block. - * - * NOTE: Memory bitmaps are used for two types of operations only: - * "set a bit" and "find the next bit set". Moreover, the searching - * is always carried out after all of the "set a bit" operations - * on given bitmap. 
*/ #define BM_END_OF_MAP (~0UL) @@ -443,15 +443,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) } /** - * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds + * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. - * - * If the bit cannot be set, the function returns -EINVAL . */ -static int -memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) { struct zone_bitmap *zone_bm; struct bm_block *bb; @@ -463,8 +461,8 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) /* We don't assume that the zones are sorted by pfns */ while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { zone_bm = zone_bm->next; - if (unlikely(!zone_bm)) - return -EINVAL; + + BUG_ON(!zone_bm); } bm->cur.zone_bm = zone_bm; } @@ -475,13 +473,40 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) while (pfn >= bb->end_pfn) { bb = bb->next; - if (unlikely(!bb)) - return -EINVAL; + + BUG_ON(!bb); } zone_bm->cur_block = bb; pfn -= bb->start_pfn; - set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK); - return 0; + *bit_nr = pfn % BM_BITS_PER_CHUNK; + *addr = bb->data + pfn / BM_BITS_PER_CHUNK; +} + +static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + memory_bm_find_bit(bm, pfn, &addr, &bit); + set_bit(bit, addr); +} + +static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + memory_bm_find_bit(bm, pfn, &addr, &bit); + clear_bit(bit, addr); +} + +static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + memory_bm_find_bit(bm, pfn, &addr, &bit); + return test_bit(bit, addr); } /* Two auxiliary functions for memory_bm_next_pfn */ @@ -563,6 +588,199 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; } +/** + * This structure represents a range of page frames the contents of which + * should not be saved during the suspend. + */ + +struct nosave_region { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + +static LIST_HEAD(nosave_regions); + +/** + * register_nosave_region - register a range of page frames the contents + * of which should not be saved during the suspend (to be used in the early + * initialization code) + */ + +void __init +register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) +{ + struct nosave_region *region; + + if (start_pfn >= end_pfn) + return; + + if (!list_empty(&nosave_regions)) { + /* Try to extend the previous region (they should be sorted) */ + region = list_entry(nosave_regions.prev, + struct nosave_region, list); + if (region->end_pfn == start_pfn) { + region->end_pfn = end_pfn; + goto Report; + } + } + /* This allocation cannot fail */ + region = alloc_bootmem_low(sizeof(struct nosave_region)); + region->start_pfn = start_pfn; + region->end_pfn = end_pfn; + list_add_tail(®ion->list, &nosave_regions); + Report: + printk("swsusp: Registered nosave memory region: %016lx - %016lx\n", + start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); +} + +/* + * Set bits in this map correspond to the page frames the contents of which + * should not be saved during the suspend. 
+ */ +static struct memory_bitmap *forbidden_pages_map; + +/* Set bits in this map correspond to free page frames. */ +static struct memory_bitmap *free_pages_map; + +/* + * Each page frame allocated for creating the image is marked by setting the + * corresponding bits in forbidden_pages_map and free_pages_map simultaneously + */ + +void swsusp_set_page_free(struct page *page) +{ + if (free_pages_map) + memory_bm_set_bit(free_pages_map, page_to_pfn(page)); +} + +static int swsusp_page_is_free(struct page *page) +{ + return free_pages_map ? + memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0; +} + +void swsusp_unset_page_free(struct page *page) +{ + if (free_pages_map) + memory_bm_clear_bit(free_pages_map, page_to_pfn(page)); +} + +static void swsusp_set_page_forbidden(struct page *page) +{ + if (forbidden_pages_map) + memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page)); +} + +int swsusp_page_is_forbidden(struct page *page) +{ + return forbidden_pages_map ? + memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0; +} + +static void swsusp_unset_page_forbidden(struct page *page) +{ + if (forbidden_pages_map) + memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page)); +} + +/** + * mark_nosave_pages - set bits corresponding to the page frames the + * contents of which should not be saved in a given bitmap. + */ + +static void mark_nosave_pages(struct memory_bitmap *bm) +{ + struct nosave_region *region; + + if (list_empty(&nosave_regions)) + return; + + list_for_each_entry(region, &nosave_regions, list) { + unsigned long pfn; + + printk("swsusp: Marking nosave pages: %016lx - %016lx\n", + region->start_pfn << PAGE_SHIFT, + region->end_pfn << PAGE_SHIFT); + + for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) + memory_bm_set_bit(bm, pfn); + } +} + +/** + * create_basic_memory_bitmaps - create bitmaps needed for marking page + * frames that should not be saved and free page frames. The pointers + * forbidden_pages_map and free_pages_map are only modified if everything + * goes well, because we don't want the bits to be used before both bitmaps + * are set up. + */ + +int create_basic_memory_bitmaps(void) +{ + struct memory_bitmap *bm1, *bm2; + int error = 0; + + BUG_ON(forbidden_pages_map || free_pages_map); + + bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_ATOMIC); + if (!bm1) + return -ENOMEM; + + error = memory_bm_create(bm1, GFP_ATOMIC | __GFP_COLD, PG_ANY); + if (error) + goto Free_first_object; + + bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_ATOMIC); + if (!bm2) + goto Free_first_bitmap; + + error = memory_bm_create(bm2, GFP_ATOMIC | __GFP_COLD, PG_ANY); + if (error) + goto Free_second_object; + + forbidden_pages_map = bm1; + free_pages_map = bm2; + mark_nosave_pages(forbidden_pages_map); + + printk("swsusp: Basic memory bitmaps created\n"); + + return 0; + + Free_second_object: + kfree(bm2); + Free_first_bitmap: + memory_bm_free(bm1, PG_UNSAFE_CLEAR); + Free_first_object: + kfree(bm1); + return -ENOMEM; +} + +/** + * free_basic_memory_bitmaps - free memory bitmaps allocated by + * create_basic_memory_bitmaps(). The auxiliary pointers are necessary + * so that the bitmaps themselves are not referred to while they are being + * freed. 
+ */ + +void free_basic_memory_bitmaps(void) +{ + struct memory_bitmap *bm1, *bm2; + + BUG_ON(!(forbidden_pages_map && free_pages_map)); + + bm1 = forbidden_pages_map; + bm2 = free_pages_map; + forbidden_pages_map = NULL; + free_pages_map = NULL; + memory_bm_free(bm1, PG_UNSAFE_CLEAR); + kfree(bm1); + memory_bm_free(bm2, PG_UNSAFE_CLEAR); + kfree(bm2); + + printk("swsusp: Basic memory bitmaps freed\n"); +} + /** * snapshot_additional_pages - estimate the number of additional pages * be needed for setting up the suspend image data structures for given diff --git a/kernel/power/user.c b/kernel/power/user.c index 7cf6713b2325..845acd84cb23 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -52,6 +52,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) if ((filp->f_flags & O_ACCMODE) == O_RDWR) return -ENOSYS; + if(create_basic_memory_bitmaps()) + return -ENOMEM; + nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; @@ -77,6 +80,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) struct snapshot_data *data; swsusp_free(); + free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap, data->bitmap); free_bitmap(data->bitmap); -- cgit v1.2.3 From 1525a2ad76f991eba9755f75c9b6d4d97abad25e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 6 May 2007 14:50:44 -0700 Subject: swsusp: fix error paths in snapshot_open We forget to increase device_available if there's an error in snapshot_open(), so the snapshot device cannot be open at all after snapshot_open() has returned an error. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/user.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 845acd84cb23..bd1771f7a64e 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -49,12 +49,14 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (!atomic_add_unless(&device_available, -1, 0)) return -EBUSY; - if ((filp->f_flags & O_ACCMODE) == O_RDWR) + if ((filp->f_flags & O_ACCMODE) == O_RDWR) { + atomic_inc(&device_available); return -ENOSYS; - - if(create_basic_memory_bitmaps()) + } + if(create_basic_memory_bitmaps()) { + atomic_inc(&device_available); return -ENOMEM; - + } nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; -- cgit v1.2.3 From 0709db6072c2e799eba1aa61bd19e0d7f38aa2cd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 6 May 2007 14:50:45 -0700 Subject: swsusp: use GFP_KERNEL for creating basic data structures Make swsusp call create_basic_memory_bitmaps() before processes are frozen, so that GFP_KERNEL allocations can be made in it. Additionally, ensure that the swsusp's userland interface won't be used while either pm_suspend_disk() or software_resume() is being executed. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 37 ++++++++++++++++++++++++++----------- kernel/power/power.h | 3 +++ kernel/power/snapshot.c | 8 ++++---- kernel/power/user.c | 10 +++++----- 4 files changed, 38 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 403bc3722fee..e518379b667a 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -130,28 +130,33 @@ int pm_suspend_disk(void) { int error; + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) + return -EBUSY; + + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Exit; + error = prepare_processes(); if (error) - return error; + goto Finish; if (pm_disk_mode == PM_DISK_TESTPROC) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); goto Thaw; } - /* Allocate memory management structures */ - error = create_basic_memory_bitmaps(); - if (error) - goto Thaw; /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); if (error) - goto Finish; + goto Thaw; error = platform_prepare(); if (error) - goto Finish; + goto Thaw; suspend_console(); error = device_suspend(PMSG_FREEZE); @@ -186,7 +191,7 @@ int pm_suspend_disk(void) power_down(); else { swsusp_free(); - goto Finish; + goto Thaw; } } else { pr_debug("PM: Image restored successfully.\n"); @@ -199,10 +204,12 @@ int pm_suspend_disk(void) platform_finish(); device_resume(); resume_console(); - Finish: - free_basic_memory_bitmaps(); Thaw: unprepare_processes(); + Finish: + free_basic_memory_bitmaps(); + Exit: + atomic_inc(&snapshot_device_available); return error; } @@ -250,9 +257,15 @@ static int software_resume(void) if (error) goto Unlock; + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + error = create_basic_memory_bitmaps(); if (error) - goto Unlock; + goto Finish; pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); @@ -290,6 +303,8 @@ static int software_resume(void) unprepare_processes(); Done: free_basic_memory_bitmaps(); + Finish: + atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&pm_mutex); diff --git a/kernel/power/power.h b/kernel/power/power.h index 1f8052bda0f7..a64d3f22de97 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -140,6 +140,9 @@ struct resume_swap_area { #define PMOPS_ENTER 2 #define PMOPS_FINISH 3 +/* If unset, the snapshot device cannot be open. 
*/ +extern atomic_t snapshot_device_available; + /** * The bitmap is used for tracing allocated swap pages * diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f66e4411795b..128da11f01c2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -723,19 +723,19 @@ int create_basic_memory_bitmaps(void) BUG_ON(forbidden_pages_map || free_pages_map); - bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_ATOMIC); + bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); if (!bm1) return -ENOMEM; - error = memory_bm_create(bm1, GFP_ATOMIC | __GFP_COLD, PG_ANY); + error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); if (error) goto Free_first_object; - bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_ATOMIC); + bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); if (!bm2) goto Free_first_bitmap; - error = memory_bm_create(bm2, GFP_ATOMIC | __GFP_COLD, PG_ANY); + error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY); if (error) goto Free_second_object; diff --git a/kernel/power/user.c b/kernel/power/user.c index bd1771f7a64e..72dbfd01408e 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -40,21 +40,21 @@ static struct snapshot_data { char platform_suspend; } snapshot_state; -static atomic_t device_available = ATOMIC_INIT(1); +atomic_t snapshot_device_available = ATOMIC_INIT(1); static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; - if (!atomic_add_unless(&device_available, -1, 0)) + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) return -EBUSY; if ((filp->f_flags & O_ACCMODE) == O_RDWR) { - atomic_inc(&device_available); + atomic_inc(&snapshot_device_available); return -ENOSYS; } if(create_basic_memory_bitmaps()) { - atomic_inc(&device_available); + atomic_inc(&snapshot_device_available); return -ENOMEM; } nonseekable_open(inode, filp); @@ -92,7 +92,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) enable_nonboot_cpus(); mutex_unlock(&pm_mutex); } - atomic_inc(&device_available); + atomic_inc(&snapshot_device_available); return 0; } -- cgit v1.2.3 From d1d241cc2c5feec057c370aa71637380b1b945d5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 6 May 2007 14:50:47 -0700 Subject: swsusp: use rbtree for tracking allocated swap Make swsusp use extents instead of a bitmap to trace swap pages allocated for saving the image (the tracking is only needed in case there's an error, so that the allocated swap pages can be released). This should allow us to reduce the memory usage, practically always, and improve performance. Signed-off-by: Rafael J. Wysocki Cc: Nigel Cunningham Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 27 ++-------- kernel/power/swap.c | 18 ++----- kernel/power/swsusp.c | 137 +++++++++++++++++++++++++++----------------------- kernel/power/user.c | 22 ++------ 4 files changed, 86 insertions(+), 118 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index a64d3f22de97..a3e47cbdaf31 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -143,30 +143,9 @@ struct resume_swap_area { /* If unset, the snapshot device cannot be open. */ extern atomic_t snapshot_device_available; -/** - * The bitmap is used for tracing allocated swap pages - * - * The entire bitmap consists of a number of bitmap_page - * structures linked with the help of the .next member. 
- * Thus each page can be allocated individually, so we only - * need to make 0-order memory allocations to create - * the bitmap. - */ - -#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *)) -#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long)) -#define BITS_PER_CHUNK (sizeof(long) * 8) -#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK) - -struct bitmap_page { - unsigned long chunks[BITMAP_PAGE_CHUNKS]; - struct bitmap_page *next; -}; - -extern void free_bitmap(struct bitmap_page *bitmap); -extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); -extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap); -extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); +extern sector_t alloc_swapdev_block(int swap); +extern void free_all_swap_pages(int swap); +extern int swsusp_swap_in_use(void); extern int swsusp_check(void); extern int swsusp_shrink_memory(void); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b18c155cbb60..e83ed9945a80 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -243,7 +243,6 @@ struct swap_map_page { struct swap_map_handle { struct swap_map_page *cur; sector_t cur_swap; - struct bitmap_page *bitmap; unsigned int k; }; @@ -252,9 +251,6 @@ static void release_swap_writer(struct swap_map_handle *handle) if (handle->cur) free_page((unsigned long)handle->cur); handle->cur = NULL; - if (handle->bitmap) - free_bitmap(handle->bitmap); - handle->bitmap = NULL; } static int get_swap_writer(struct swap_map_handle *handle) @@ -262,12 +258,7 @@ static int get_swap_writer(struct swap_map_handle *handle) handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); if (!handle->cur) return -ENOMEM; - handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0)); - if (!handle->bitmap) { - release_swap_writer(handle); - return -ENOMEM; - } - handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap); + handle->cur_swap = alloc_swapdev_block(root_swap); if (!handle->cur_swap) { release_swap_writer(handle); return -ENOSPC; @@ -284,7 +275,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, if (!handle->cur) return -EINVAL; - offset = alloc_swapdev_block(root_swap, handle->bitmap); + offset = alloc_swapdev_block(root_swap); error = write_page(buf, offset, bio_chain); if (error) return error; @@ -293,7 +284,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, error = wait_on_bio_chain(bio_chain); if (error) goto out; - offset = alloc_swapdev_block(root_swap, handle->bitmap); + offset = alloc_swapdev_block(root_swap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; @@ -430,7 +421,8 @@ int swsusp_write(void) } } if (error) - free_all_swap_pages(root_swap, handle.bitmap); + free_all_swap_pages(root_swap); + release_swap_writer(&handle); out: swsusp_close(); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 175370824f37..1109023d8358 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "power.h" @@ -74,72 +75,69 @@ static inline unsigned int count_highmem_pages(void) { return 0; } /** * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. 
- * - * The functions operate on a linked bitmap structure defined - * in power.h */ -void free_bitmap(struct bitmap_page *bitmap) -{ - struct bitmap_page *bp; +struct swsusp_extent { + struct rb_node node; + unsigned long start; + unsigned long end; +}; - while (bitmap) { - bp = bitmap->next; - free_page((unsigned long)bitmap); - bitmap = bp; - } -} +static struct rb_root swsusp_extents = RB_ROOT; -struct bitmap_page *alloc_bitmap(unsigned int nr_bits) +static int swsusp_extents_insert(unsigned long swap_offset) { - struct bitmap_page *bitmap, *bp; - unsigned int n; - - if (!nr_bits) - return NULL; - - bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); - bp = bitmap; - for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) { - bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); - bp = bp->next; - if (!bp) { - free_bitmap(bitmap); - return NULL; + struct rb_node **new = &(swsusp_extents.rb_node); + struct rb_node *parent = NULL; + struct swsusp_extent *ext; + + /* Figure out where to put the new node */ + while (*new) { + ext = container_of(*new, struct swsusp_extent, node); + parent = *new; + if (swap_offset < ext->start) { + /* Try to merge */ + if (swap_offset == ext->start - 1) { + ext->start--; + return 0; + } + new = &((*new)->rb_left); + } else if (swap_offset > ext->end) { + /* Try to merge */ + if (swap_offset == ext->end + 1) { + ext->end++; + return 0; + } + new = &((*new)->rb_right); + } else { + /* It already is in the tree */ + return -EINVAL; } } - return bitmap; -} - -static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) -{ - unsigned int n; - - n = BITMAP_PAGE_BITS; - while (bitmap && n <= bit) { - n += BITMAP_PAGE_BITS; - bitmap = bitmap->next; - } - if (!bitmap) - return -EINVAL; - n -= BITMAP_PAGE_BITS; - bit -= n; - n = 0; - while (bit >= BITS_PER_CHUNK) { - bit -= BITS_PER_CHUNK; - n++; - } - bitmap->chunks[n] |= (1UL << bit); + /* Add the new node and rebalance the tree. */ + ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); + if (!ext) + return -ENOMEM; + + ext->start = swap_offset; + ext->end = swap_offset; + rb_link_node(&ext->node, parent, new); + rb_insert_color(&ext->node, &swsusp_extents); return 0; } -sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap) +/** + * alloc_swapdev_block - allocate a swap page and register that it has + * been allocated, so that it can be freed in case of an error. + */ + +sector_t alloc_swapdev_block(int swap) { unsigned long offset; offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { - if (bitmap_set(bitmap, offset)) + if (swsusp_extents_insert(offset)) swap_free(swp_entry(swap, offset)); else return swapdev_block(swap, offset); @@ -147,23 +145,34 @@ sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap) return 0; } -void free_all_swap_pages(int swap, struct bitmap_page *bitmap) +/** + * free_all_swap_pages - free swap pages allocated for saving image data. + * It also frees the extents used to register which swap entries had been + * allocated. 
+ */ + +void free_all_swap_pages(int swap) { - unsigned int bit, n; - unsigned long test; - - bit = 0; - while (bitmap) { - for (n = 0; n < BITMAP_PAGE_CHUNKS; n++) - for (test = 1UL; test; test <<= 1) { - if (bitmap->chunks[n] & test) - swap_free(swp_entry(swap, bit)); - bit++; - } - bitmap = bitmap->next; + struct rb_node *node; + + while ((node = swsusp_extents.rb_node)) { + struct swsusp_extent *ext; + unsigned long offset; + + ext = container_of(node, struct swsusp_extent, node); + rb_erase(node, &swsusp_extents); + for (offset = ext->start; offset <= ext->end; offset++) + swap_free(swp_entry(swap, offset)); + + kfree(ext); } } +int swsusp_swap_in_use(void) +{ + return (swsusp_extents.rb_node != NULL); +} + /** * swsusp_show_speed - print the time elapsed between two events represented by * @start and @stop diff --git a/kernel/power/user.c b/kernel/power/user.c index 72dbfd01408e..ad4e10208cde 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -33,7 +33,6 @@ static struct snapshot_data { struct snapshot_handle handle; int swap; - struct bitmap_page *bitmap; int mode; char frozen; char ready; @@ -69,7 +68,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; } - data->bitmap = NULL; data->frozen = 0; data->ready = 0; data->platform_suspend = 0; @@ -84,8 +82,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); free_basic_memory_bitmaps(); data = filp->private_data; - free_all_swap_pages(data->swap, data->bitmap); - free_bitmap(data->bitmap); + free_all_swap_pages(data->swap); if (data->frozen) { mutex_lock(&pm_mutex); thaw_processes(); @@ -300,14 +297,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -ENODEV; break; } - if (!data->bitmap) { - data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0)); - if (!data->bitmap) { - error = -ENOMEM; - break; - } - } - offset = alloc_swapdev_block(data->swap, data->bitmap); + offset = alloc_swapdev_block(data->swap); if (offset) { offset <<= PAGE_SHIFT; error = put_user(offset, (sector_t __user *)arg); @@ -321,13 +311,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -ENODEV; break; } - free_all_swap_pages(data->swap, data->bitmap); - free_bitmap(data->bitmap); - data->bitmap = NULL; + free_all_swap_pages(data->swap); break; case SNAPSHOT_SET_SWAP_FILE: - if (!data->bitmap) { + if (!swsusp_swap_in_use()) { /* * User space encodes device types as two-byte values, * so we need to recode them @@ -426,7 +414,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_SET_SWAP_AREA: - if (data->bitmap) { + if (swsusp_swap_in_use()) { error = -EPERM; } else { struct resume_swap_area swap_area; -- cgit v1.2.3 From ab3bfca7abf3fd0fe41d26d839610a787aa7e587 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 6 May 2007 14:50:49 -0700 Subject: remove software_suspend() Remove software_suspend() and all its users since pm_suspend(PM_SUSPEND_DISK) should be equivalent and there's no point in having two interfaces for the same thing. The patch also changes the valid_state function to return 0 (false) for PM_SUSPEND_DISK when SOFTWARE_SUSPEND is not configured instead of accepting it and having the whole thing fail later. Signed-off-by: Johannes Berg Acked-by: "Rafael J. 
Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 21 +++++++-------------- kernel/sys.c | 2 +- 2 files changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 5a779270cdbc..f6dda685e7e2 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -184,17 +184,21 @@ static void suspend_finish(suspend_state_t state) static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", -#ifdef CONFIG_SOFTWARE_SUSPEND [PM_SUSPEND_DISK] = "disk", -#endif }; static inline int valid_state(suspend_state_t state) { /* Suspend-to-disk does not really need low-level support. - * It can work with reboot if needed. */ + * It can work with shutdown/reboot if needed. If it isn't + * configured, then it cannot be supported. + */ if (state == PM_SUSPEND_DISK) +#ifdef CONFIG_SOFTWARE_SUSPEND return 1; +#else + return 0; +#endif /* all other states need lowlevel support and need to be * valid to the lowlevel implementation, no valid callback @@ -244,17 +248,6 @@ static int enter_state(suspend_state_t state) return error; } -#ifdef CONFIG_SOFTWARE_SUSPEND -/* - * This is main interface to the outside world. It needs to be - * called from process context. - */ -int software_suspend(void) -{ - return enter_state(PM_SUSPEND_DISK); -} -#endif - /** * pm_suspend - Externally visible function for suspending system. diff --git a/kernel/sys.c b/kernel/sys.c index 123b165080e6..fe1f3ab20477 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -881,7 +881,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { - int ret = software_suspend(); + int ret = pm_suspend(PM_SUSPEND_DISK); unlock_kernel(); return ret; } -- cgit v1.2.3 From f0ced9b229cfbc76b5db9837b4b256b602d56610 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 6 May 2007 14:50:50 -0700 Subject: power management: change /sys/power/disk display Change /sys/power/disk to display all valid modes as well as the currently selected one in a fashion known from the LED subsystem. This changes userspace API, but it is apparently not used much (we asked some userspace developers) Signed-off-by: Johannes Berg Acked-by: "Rafael J. 
Wysocki" Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e518379b667a..06331374d862 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -350,7 +350,34 @@ static const char * const pm_disk_modes[] = { static ssize_t disk_show(struct kset *kset, char *buf) { - return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]); + int i; + char *start = buf; + + for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { + if (!pm_disk_modes[i]) + continue; + switch (i) { + case PM_DISK_SHUTDOWN: + case PM_DISK_REBOOT: + case PM_DISK_TEST: + case PM_DISK_TESTPROC: + break; + default: + if (pm_ops && pm_ops->enter && + (i == pm_ops->pm_disk_mode)) + break; + /* not a valid mode, continue with loop */ + continue; + } + if (i == pm_disk_mode) + buf += sprintf(buf, "[%s]", pm_disk_modes[i]); + else + buf += sprintf(buf, "%s", pm_disk_modes[i]); + if (i+1 != PM_DISK_MAX) + buf += sprintf(buf, " "); + } + buf += sprintf(buf, "\n"); + return buf-start; } -- cgit v1.2.3 From a7ee2e5f5b4c9c72f4390c60ba7ea30306f47188 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Sun, 6 May 2007 14:50:50 -0700 Subject: kconfig: mention 'hibernation' not just swsusp Clarify that "software suspend" is what's called "hibernation" in most user interfaces, shrinking a terminology gap. (Examples include Gnome and MS-Windows.) Also provide a more succinct description of what it does, so you won't have to read the whole novel in Kconfig; and highlights just why the lack of BIOS requirements for swsusp are a big deal. Signed-off-by: David Brownell Acked-by: "Rafael J. Wysocki" Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 51a4dd0f1b74..877721708fa4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -78,17 +78,22 @@ config PM_SYSFS_DEPRECATED are likely to be bus or driver specific. config SOFTWARE_SUSPEND - bool "Software Suspend" + bool "Software Suspend (Hibernation)" depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) ---help--- - Enable the suspend to disk (STD) functionality. + Enable the suspend to disk (STD) functionality, which is usually + called "hibernation" in user interfaces. STD checkpoints the + system and powers it off; and restores that checkpoint on reboot. You can suspend your machine with 'echo disk > /sys/power/state'. Alternatively, you can use the additional userland tools available from . In principle it does not require ACPI or APM, although for example - ACPI will be used if available. + ACPI will be used for the final steps when it is available. One + of the reasons to use software suspend is that the firmware hooks + for suspend states like suspend-to-RAM (STR) often don't work very + well with Linux. It creates an image which is saved in your active swap. Upon the next boot, pass the 'resume=/dev/swappartition' argument to the kernel to -- cgit v1.2.3 From 9b95e43763cfdfebc1318d27e55712e7b6bfe098 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 6 May 2007 14:50:51 -0700 Subject: swsusp: fix snapshot_release Remove the leftover enable_nonboot_cpus() from snapshot_release(). Signed-off-by: Rafael J. 
Wysocki Cc: Nigel Cunningham Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/user.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index ad4e10208cde..040560d9c312 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -86,7 +86,6 @@ static int snapshot_release(struct inode *inode, struct file *filp) if (data->frozen) { mutex_lock(&pm_mutex); thaw_processes(); - enable_nonboot_cpus(); mutex_unlock(&pm_mutex); } atomic_inc(&snapshot_device_available); -- cgit v1.2.3 From 56f99bcb52d64d70078b41cc176dd8b6f5763108 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 6 May 2007 14:50:52 -0700 Subject: swsusp: free more memory Move the definition of PAGES_FOR_IO to kernel/power/power.h and introduce SPARE_PAGES representing the number of pages that should be freed by the swsusp's memory shrinker in addition to PAGES_FOR_IO so that device drivers can allocate some memory (up to 1 MB total) in their .suspend() routines without causing the suspend to fail. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 12 +++++++++++- kernel/power/swsusp.c | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index a3e47cbdaf31..34b43542785a 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -14,8 +14,18 @@ struct swsusp_info { #ifdef CONFIG_SOFTWARE_SUSPEND -extern int pm_suspend_disk(void); +/* + * Keep some memory free so that I/O operations can succeed without paging + * [Might this be more than 4 MB?] + */ +#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT) +/* + * Keep 1 MB of memory free so that device drivers can allocate some pages in + * their .suspend() routines without breaking the suspend to disk. + */ +#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) +extern int pm_suspend_disk(void); #else static inline int pm_suspend_disk(void) { diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 1109023d8358..5da304c8f1f6 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -233,7 +233,7 @@ int swsusp_shrink_memory(void) long size, highmem_size; highmem_size = count_highmem_pages(); - size = count_data_pages() + PAGES_FOR_IO; + size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; tmp = size; size += highmem_size; for_each_zone (zone) -- cgit v1.2.3 From ab1b6f03a10ba1f5638188ab06bf46e33ac3a160 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 May 2007 00:23:29 -0700 Subject: simplify the stacktrace code Simplify the stacktrace code: - remove the unused task argument to save_stack_trace, it's always current - remove the all_contexts flag, it's always 0 Signed-off-by: Christoph Hellwig Cc: Paul Mundt Cc: Ralf Baechle Cc: Martin Schwidefsky Cc: "David S. 
Miller" Cc: Andi Kleen Cc: Akinobu Mita Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7065a687ac54..c1e308a080b8 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -257,9 +257,8 @@ static int save_trace(struct stack_trace *trace) trace->entries = stack_trace + nr_stack_trace_entries; trace->skip = 3; - trace->all_contexts = 0; - save_stack_trace(trace, NULL); + save_stack_trace(trace); trace->max_entries = trace->nr_entries; -- cgit v1.2.3 From 1065d130dddc3241706c50a01ced7b03bcb657be Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 8 May 2007 00:24:01 -0700 Subject: freezer: task->exit_state should be treated as boolean Except for BUG_ON() checks, we should not use EXIT_XXXX defines outside of exit/wait paths. Signed-off-by: Oleg Nesterov Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 0eb5c420e8ed..179529dc3819 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -25,10 +25,9 @@ static inline int freezeable(struct task_struct * p) { - if ((p == current) || + if ((p == current) || (p->flags & PF_NOFREEZE) || - (p->exit_state == EXIT_ZOMBIE) || - (p->exit_state == EXIT_DEAD)) + (p->exit_state != 0)) return 0; return 1; } -- cgit v1.2.3 From 02fb6149f7a64c62934c035e7635321cb9a8cf2e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 8 May 2007 00:24:03 -0700 Subject: softlockup: s/99/MAX_RT_PRIO/ Don't use hardcoded 99 value, use MAX_RT_PRIO. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softlockup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 50afeb813305..81d43caa2012 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -82,7 +82,7 @@ void softlockup_tick(void) */ static int watchdog(void * __bind_cpu) { - struct sched_param param = { .sched_priority = 99 }; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; sched_setscheduler(current, SCHED_FIFO, &param); current->flags |= PF_NOFREEZE; -- cgit v1.2.3 From 6d4f9c55002544bac1c99d0bab46c89319ab876e Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 8 May 2007 00:24:58 -0700 Subject: module: use krealloc This converts an open-coded krealloc() to use the shiny new API. Signed-off-by: Pekka Enberg Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1eb8ca565ba0..9bdbd1217a6f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -310,14 +310,14 @@ static int split_block(unsigned int i, unsigned short size) { /* Reallocation required? 
*/ if (pcpu_num_used + 1 > pcpu_num_allocated) { - int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, - GFP_KERNEL); + int *new; + + new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2, + GFP_KERNEL); if (!new) return 0; - memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated); pcpu_num_allocated *= 2; - kfree(pcpu_size); pcpu_size = new; } -- cgit v1.2.3 From ee527cd3a20c2aeaac17d939e5d011f7a76d69f5 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 8 May 2007 00:25:08 -0700 Subject: Use stop_machine_run in the Intel RNG driver Replace call_smp_function with stop_machine_run in the Intel RNG driver. CPU A has done read_lock(&lock) CPU B has done write_lock_irq(&lock) and is waiting for A to release the lock. A third CPU calls call_smp_function and issues the IPI. CPU A takes CPU C's IPI. CPU B is waiting with interrupts disabled and does not see the IPI. CPU C is stuck waiting for CPU B to respond to the IPI. Deadlock. The solution is to use stop_machine_run instead of call_smp_function (call_smp_function should not be called in situations where the CPUs may be suspended). [haruo.tomita@toshiba.co.jp: fix a typo in mod_init()] [haruo.tomita@toshiba.co.jp: fix memory leak] Signed-off-by: Prarit Bhargava Cc: Jan Beulich Cc: "Tomita, Haruo" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 12458040e665..daabb74ee0bc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,11 +1,12 @@ /* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. * GPL v2 and any later version. */ -#include -#include -#include #include #include +#include +#include +#include +#include #include #include #include @@ -208,3 +209,4 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) return ret; } +EXPORT_SYMBOL_GPL(stop_machine_run); -- cgit v1.2.3 From e3222c4ecc649c4ae568e61dda9349482401b501 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Tue, 8 May 2007 00:25:21 -0700 Subject: Merge sys_clone()/sys_unshare() nsproxy and namespace handling sys_clone() and sys_unshare() both makes copies of nsproxy and its associated namespaces. But they have different code paths. This patch merges all the nsproxy and its associated namespace copy/clone handling (as much as possible). Posted on container list earlier for feedback. - Create a new nsproxy and its associated namespaces and pass it back to caller to attach it to right process. - Changed all copy_*_ns() routines to return a new copy of namespace instead of attaching it to task->nsproxy. - Moved the CAP_SYS_ADMIN checks out of copy_*_ns() routines. - Removed unnessary !ns checks from copy_*_ns() and added BUG_ON() just incase. - Get rid of all individual unshare_*_ns() routines and make use of copy_*_ns() instead. [akpm@osdl.org: cleanups, warning fix] [clg@fr.ibm.com: remove dup_namespaces() declaration] [serue@us.ibm.com: fix CONFIG_IPC_NS=n, clone(CLONE_NEWIPC) retval] [akpm@linux-foundation.org: fix build with CONFIG_SYSVIPC=n] Signed-off-by: Badari Pulavarty Signed-off-by: Serge Hallyn Cc: Cedric Le Goater Cc: "Eric W. 
Biederman" Cc: Signed-off-by: Cedric Le Goater Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 85 ++-------------------------------- kernel/nsproxy.c | 139 ++++++++++++++++++++++++++++++++++--------------------- kernel/pid.c | 11 ++--- kernel/utsname.c | 41 ++-------------- 4 files changed, 98 insertions(+), 178 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b7d169def942..fd211b9dddd4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1515,26 +1515,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) return 0; } -/* - * Unshare the mnt_namespace structure if it is being shared - */ -static int unshare_mnt_namespace(unsigned long unshare_flags, - struct mnt_namespace **new_nsp, struct fs_struct *new_fs) -{ - struct mnt_namespace *ns = current->nsproxy->mnt_ns; - - if ((unshare_flags & CLONE_NEWNS) && ns) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); - if (!*new_nsp) - return -ENOMEM; - } - - return 0; -} - /* * Unsharing of sighand is not supported yet */ @@ -1593,16 +1573,6 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n return 0; } -#ifndef CONFIG_IPC_NS -static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) -{ - if (flags & CLONE_NEWIPC) - return -EINVAL; - - return 0; -} -#endif - /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* @@ -1615,14 +1585,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct mnt_namespace *ns, *new_ns = NULL; struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; - struct uts_namespace *uts, *new_uts = NULL; - struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); @@ -1637,36 +1604,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) - goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_ns; + goto bad_unshare_cleanup_fs; if ((err = unshare_vm(unshare_flags, &new_mm))) goto bad_unshare_cleanup_sigh; if ((err = unshare_fd(unshare_flags, &new_fd))) goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; - if ((err = unshare_utsname(unshare_flags, &new_uts))) + if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_fs))) goto bad_unshare_cleanup_semundo; - if ((err = unshare_ipcs(unshare_flags, &new_ipc))) - goto bad_unshare_cleanup_uts; - - if (new_ns || new_uts || new_ipc) { - old_nsproxy = current->nsproxy; - new_nsproxy = dup_namespaces(old_nsproxy); - if (!new_nsproxy) { - err = -ENOMEM; - goto bad_unshare_cleanup_ipc; - } - } - if (new_fs || new_ns || new_mm || new_fd || new_ulist || - new_uts || new_ipc) { + if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { task_lock(current); if (new_nsproxy) { + old_nsproxy = current->nsproxy; current->nsproxy = new_nsproxy; new_nsproxy = old_nsproxy; } @@ -1677,12 +1632,6 @@ asmlinkage long sys_unshare(unsigned 
long unshare_flags) new_fs = fs; } - if (new_ns) { - ns = current->nsproxy->mnt_ns; - current->nsproxy->mnt_ns = new_ns; - new_ns = ns; - } - if (new_mm) { mm = current->mm; active_mm = current->active_mm; @@ -1698,32 +1647,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fd = fd; } - if (new_uts) { - uts = current->nsproxy->uts_ns; - current->nsproxy->uts_ns = new_uts; - new_uts = uts; - } - - if (new_ipc) { - ipc = current->nsproxy->ipc_ns; - current->nsproxy->ipc_ns = new_ipc; - new_ipc = ipc; - } - task_unlock(current); } if (new_nsproxy) put_nsproxy(new_nsproxy); -bad_unshare_cleanup_ipc: - if (new_ipc) - put_ipc_ns(new_ipc); - -bad_unshare_cleanup_uts: - if (new_uts) - put_uts_ns(new_uts); - bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) @@ -1738,10 +1667,6 @@ bad_unshare_cleanup_sigh: if (atomic_dec_and_test(&new_sigh->count)) kmem_cache_free(sighand_cachep, new_sigh); -bad_unshare_cleanup_ns: - if (new_ns) - put_mnt_ns(new_ns); - bad_unshare_cleanup_fs: if (new_fs) put_fs_struct(new_fs); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f5b9ee6f6bbb..1bc4b55241a8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -38,10 +38,8 @@ void get_task_namespaces(struct task_struct *tsk) /* * creates a copy of "orig" with refcount 1. - * This does not grab references to the contained namespaces, - * so that needs to be done by dup_namespaces. */ -static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) +static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) { struct nsproxy *ns; @@ -52,26 +50,49 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) } /* - * copies the nsproxy, setting refcount to 1, and grabbing a - * reference to all contained namespaces. Called from - * sys_unshare() + * Create new nsproxy and all of its the associated namespaces. + * Return the newly created nsproxy. Do not attach this to the task, + * leave it to the caller to do proper locking and attach it to task. 
*/ -struct nsproxy *dup_namespaces(struct nsproxy *orig) +static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, + struct fs_struct *new_fs) { - struct nsproxy *ns = clone_namespaces(orig); + struct nsproxy *new_nsp; - if (ns) { - if (ns->mnt_ns) - get_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - get_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - get_ipc_ns(ns->ipc_ns); - if (ns->pid_ns) - get_pid_ns(ns->pid_ns); - } + new_nsp = clone_nsproxy(tsk->nsproxy); + if (!new_nsp) + return ERR_PTR(-ENOMEM); - return ns; + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + if (IS_ERR(new_nsp->mnt_ns)) + goto out_ns; + + new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); + if (IS_ERR(new_nsp->uts_ns)) + goto out_uts; + + new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); + if (IS_ERR(new_nsp->ipc_ns)) + goto out_ipc; + + new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); + if (IS_ERR(new_nsp->pid_ns)) + goto out_pid; + + return new_nsp; + +out_pid: + if (new_nsp->ipc_ns) + put_ipc_ns(new_nsp->ipc_ns); +out_ipc: + if (new_nsp->uts_ns) + put_uts_ns(new_nsp->uts_ns); +out_uts: + if (new_nsp->mnt_ns) + put_mnt_ns(new_nsp->mnt_ns); +out_ns: + kfree(new_nsp); + return ERR_PTR(-ENOMEM); } /* @@ -92,47 +113,21 @@ int copy_namespaces(int flags, struct task_struct *tsk) if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) return 0; - new_ns = clone_namespaces(old_ns); - if (!new_ns) { - err = -ENOMEM; + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; goto out; } - tsk->nsproxy = new_ns; - - err = copy_mnt_ns(flags, tsk); - if (err) - goto out_ns; - - err = copy_utsname(flags, tsk); - if (err) - goto out_uts; - - err = copy_ipcs(flags, tsk); - if (err) - goto out_ipc; - - err = copy_pid_ns(flags, tsk); - if (err) - goto out_pid; + new_ns = create_new_namespaces(flags, tsk, tsk->fs); + if (IS_ERR(new_ns)) { + err = PTR_ERR(new_ns); + goto out; + } + tsk->nsproxy = new_ns; out: put_nsproxy(old_ns); return err; - -out_pid: - if (new_ns->ipc_ns) - put_ipc_ns(new_ns->ipc_ns); -out_ipc: - if (new_ns->uts_ns) - put_uts_ns(new_ns->uts_ns); -out_uts: - if (new_ns->mnt_ns) - put_mnt_ns(new_ns->mnt_ns); -out_ns: - tsk->nsproxy = old_ns; - kfree(new_ns); - goto out; } void free_nsproxy(struct nsproxy *ns) @@ -147,3 +142,41 @@ void free_nsproxy(struct nsproxy *ns) put_pid_ns(ns->pid_ns); kfree(ns); } + +/* + * Called from unshare. Unshare all the namespaces part of nsproxy. + * On sucess, returns the new nsproxy and a reference to old nsproxy + * to make sure it stays around. + */ +int unshare_nsproxy_namespaces(unsigned long unshare_flags, + struct nsproxy **new_nsp, struct fs_struct *new_fs) +{ + struct nsproxy *old_ns = current->nsproxy; + int err = 0; + + if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + return 0; + +#ifndef CONFIG_IPC_NS + if (unshare_flags & CLONE_NEWIPC) + return -EINVAL; +#endif + +#ifndef CONFIG_UTS_NS + if (unshare_flags & CLONE_NEWUTS) + return -EINVAL; +#endif + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + get_nsproxy(old_ns); + + *new_nsp = create_new_namespaces(unshare_flags, current, + new_fs ? 
new_fs : current->fs); + if (IS_ERR(*new_nsp)) { + err = PTR_ERR(*new_nsp); + put_nsproxy(old_ns); + } + return err; +} diff --git a/kernel/pid.c b/kernel/pid.c index 9c80bc23d6b8..d3ad724afa83 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -360,16 +360,11 @@ struct pid *find_ge_pid(int nr) } EXPORT_SYMBOL_GPL(find_get_pid); -int copy_pid_ns(int flags, struct task_struct *tsk) +struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) { - struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; - int err = 0; - - if (!old_ns) - return 0; - + BUG_ON(!old_ns); get_pid_ns(old_ns); - return err; + return old_ns; } void free_pid_ns(struct kref *kref) diff --git a/kernel/utsname.c b/kernel/utsname.c index c859164a6993..160c8c5136bd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -31,59 +31,26 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) return ns; } -/* - * unshare the current process' utsname namespace. - * called only in sys_unshare() - */ -int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) -{ - if (unshare_flags & CLONE_NEWUTS) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_uts = clone_uts_ns(current->nsproxy->uts_ns); - if (!*new_uts) - return -ENOMEM; - } - - return 0; -} - /* * Copy task tsk's utsname namespace, or clone it if flags * specifies CLONE_NEWUTS. In latter case, changes to the * utsname of this process won't be seen by parent, and vice * versa. */ -int copy_utsname(int flags, struct task_struct *tsk) +struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) { - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; - int err = 0; - - if (!old_ns) - return 0; + BUG_ON(!old_ns); get_uts_ns(old_ns); if (!(flags & CLONE_NEWUTS)) - return 0; - - if (!capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; - } + return old_ns; new_ns = clone_uts_ns(old_ns); - if (!new_ns) { - err = -ENOMEM; - goto out; - } - tsk->nsproxy->uts_ns = new_ns; -out: put_uts_ns(old_ns); - return err; + return new_ns; } void free_uts_ns(struct kref *kref) -- cgit v1.2.3 From 6f8bc500a10ab9cb3861e5bb71155d7bd2bbd2d5 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 8 May 2007 00:25:24 -0700 Subject: rcutorture: Mark rcu_torture_init as __init The corresponding rcu_torture_cleanup cannot get marked as __exit, because rcu_torture_init uses it to clean up if init fails. Signed-off-by: Josh Triplett Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index bcd14e83ef39..75ca2a740471 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -866,7 +866,7 @@ rcu_torture_cleanup(void) rcu_torture_print_module_parms("End of test: SUCCESS"); } -static int +static int __init rcu_torture_init(void) { int i; -- cgit v1.2.3 From 753e9c5cd9b123156152c66c816f751954b15e53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 May 2007 00:25:32 -0700 Subject: Optimize timespec_trunc() The first thing done by timespec_trunc() is: if (gran <= jiffies_to_usecs(1) * 1000) This should really be a test against a constant known at compile time. Alas, it isn't. jiffies_to_usecs() is not inlined, so the C compiler emits a function call and a multiply to compute: a CONSTANT. 
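The folding is easy to check with a standalone build. The sketch below is illustrative only: the demo names are invented, HZ is pinned to 250, and only the HZ <= USEC_PER_SEC branch of the conversion is shown.

#include <stdio.h>

#define HZ 250                  /* stand-in for the kernel config value */
#define USEC_PER_SEC 1000000L

/* Definition visible before its caller, as the patch arranges. */
static inline unsigned long jiffies_to_usecs_demo(unsigned long j)
{
        return (USEC_PER_SEC / HZ) * j;
}

static int gran_is_subjiffy(long gran)
{
        /* Folds at compile time to: return gran <= 4000000; */
        return gran <= jiffies_to_usecs_demo(1) * 1000;
}

int main(void)
{
        printf("%d\n", gran_is_subjiffy(2000));
        return 0;
}

Compiled with optimization, gran_is_subjiffy() reduces to a single compare against 4000000, which is the 0x3d0900 in the cmp quoted further down. With the helper out of line, the same test instead pays the call and multiply visible in the pre-patch assembly the commit message quotes next: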
mov $0x1,%edi mov %rbx,0xffffffffffffffe8(%rbp) mov %r12,0xfffffffffffffff0(%rbp) mov %edx,%ebx mov %rsi,0xffffffffffffffc8(%rbp) mov %rsi,%r12 callq ffffffff80232010 imul $0x3e8,%eax,%eax cmp %ebx,%eax This patch reorders kernel/time.c a bit so that jiffies_to_usecs() is defined before timespec_trunc(), so that the compiler now generates: cmp $0x3d0900,%edx (HZ=250 on my machine). This gives better code (timespec_trunc() becoming a leaf function), and shorter kernel size as well. Signed-off-by: Eric Dumazet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: john stultz Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 60 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index ba18ec4899bd..6d98ab72f38b 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -247,6 +247,36 @@ struct timespec current_fs_time(struct super_block *sb) } EXPORT_SYMBOL(current_fs_time); +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int inline jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int inline jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + /** * timespec_trunc - Truncate timespec to a granularity * @t: Timespec @@ -472,36 +502,6 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); -/* - * Convert jiffies to milliseconds and back. - * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -unsigned int jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else - return (j * MSEC_PER_SEC) / HZ; -#endif -} -EXPORT_SYMBOL(jiffies_to_msecs); - -unsigned int jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else - return (j * USEC_PER_SEC) / HZ; -#endif -} -EXPORT_SYMBOL(jiffies_to_usecs); - /* * When we convert to jiffies then we interpret incoming values * the following way: -- cgit v1.2.3 From 5096add84b9e96e2e0a9c72675c442fe5433388a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 8 May 2007 00:26:04 -0700 Subject: proc: maps protection The /proc/pid/ "maps", "smaps", and "numa_maps" files contain sensitive information about the memory location and usage of processes. Issues: - maps should not be world-readable, especially if programs expect any kind of ASLR protection from local attackers. - maps cannot just be 0400 because "-D_FORTIFY_SOURCE=2 -O2" makes glibc check the maps when %n is in a *printf call, and a setuid(getuid()) process wouldn't be able to read its own maps file. 
(For reference see http://lkml.org/lkml/2006/1/22/150) - a system-wide toggle is needed to allow prior behavior in the case of non-root applications that depend on access to the maps contents. This change implements a check using "ptrace_may_attach" before allowing access to read the maps contents. To control this protection, the new knob /proc/sys/kernel/maps_protect has been added, with corresponding updates to the procfs documentation. [akpm@linux-foundation.org: build fixes] [akpm@linux-foundation.org: New sysctl numbers are old hat] Signed-off-by: Kees Cook Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c904748f2290..f0664bd5011c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int maps_protect; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -603,6 +604,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_PROC_FS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "maps_protect", + .data = &maps_protect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; -- cgit v1.2.3 From 9adef58b1d4fbb58d7daed931b6790c5a3b7543a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 8 May 2007 00:26:42 -0700 Subject: futex: get_futex_key, get_key_refs and drop_key_refs lguest uses the convenient futex infrastructure for inter-domain I/O, so expose get_futex_key, get_key_refs (renamed get_futex_key_refs) and drop_key_refs (renamed drop_futex_key_refs). Also means we need to expose the union that these use. No code changes. Signed-off-by: Rusty Russell Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 50 ++++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5a270b5e3f95..7ae2f50641ed 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -48,38 +48,13 @@ #include #include #include +#include #include #include "rtmutex_common.h" #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) -/* - * Futexes are matched on equal values of this key. - * The key type depends on whether it's a shared or private mapping. - * Don't rearrange members without looking at hash_futex(). - * - * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. - * We set bit 0 to indicate if it's an inode-based key. - */ -union futex_key { - struct { - unsigned long pgoff; - struct inode *inode; - int offset; - } shared; - struct { - unsigned long address; - struct mm_struct *mm; - int offset; - } private; - struct { - unsigned long word; - void *ptr; - int offset; - } both; }; - /* * Priority Inheritance state: */ @@ -175,7 +150,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) * * Should be called with &current->mm->mmap_sem but NOT any spinlocks. 
*/ -static int get_futex_key(u32 __user *uaddr, union futex_key *key) +int get_futex_key(u32 __user *uaddr, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -246,6 +221,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) } return err; } +EXPORT_SYMBOL_GPL(get_futex_key); /* * Take a reference to the resource addressed by a key. @@ -254,7 +230,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) * NOTE: mmap_sem MUST be held between get_futex_key() and calling this * function, if it is called at all. mmap_sem keeps key->shared.inode valid. */ -static inline void get_key_refs(union futex_key *key) +inline void get_futex_key_refs(union futex_key *key) { if (key->both.ptr != 0) { if (key->both.offset & 1) @@ -263,12 +239,13 @@ static inline void get_key_refs(union futex_key *key) atomic_inc(&key->private.mm->mm_count); } } +EXPORT_SYMBOL_GPL(get_futex_key_refs); /* * Drop a reference to the resource addressed by a key. * The hash bucket spinlock must not be held. */ -static void drop_key_refs(union futex_key *key) +void drop_futex_key_refs(union futex_key *key) { if (key->both.ptr != 0) { if (key->both.offset & 1) @@ -277,6 +254,7 @@ static void drop_key_refs(union futex_key *key) mmdrop(key->private.mm); } } +EXPORT_SYMBOL_GPL(drop_futex_key_refs); static inline int get_futex_value_locked(u32 *dest, u32 __user *from) { @@ -873,7 +851,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, this->lock_ptr = &hb2->lock; } this->key = key2; - get_key_refs(&key2); + get_futex_key_refs(&key2); drop_count++; if (ret - nr_wake >= nr_requeue) @@ -886,9 +864,9 @@ out_unlock: if (hb1 != hb2) spin_unlock(&hb2->lock); - /* drop_key_refs() must be called outside the spinlocks. */ + /* drop_futex_key_refs() must be called outside the spinlocks. */ while (--drop_count >= 0) - drop_key_refs(&key1); + drop_futex_key_refs(&key1); out: up_read(&current->mm->mmap_sem); @@ -906,7 +884,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) init_waitqueue_head(&q->waiters); - get_key_refs(&q->key); + get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -925,7 +903,7 @@ static inline void queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) { spin_unlock(&hb->lock); - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); } /* @@ -980,7 +958,7 @@ static int unqueue_me(struct futex_q *q) ret = 1; } - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); return ret; } @@ -999,7 +977,7 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) spin_unlock(&hb->lock); - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); } static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) -- cgit v1.2.3 From 72c1bbf308c75a136803d2d76d0e18258be14c7a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 8 May 2007 00:26:43 -0700 Subject: futex: restartable futex_wait LTP test sigaction_16_24 fails because it expects sem_wait to be restarted if SA_RESTART is set. sem_wait is implemented with futex_wait, which currently doesn't support being restarted. Ulrich confirms that the call should be restartable. Implement a restart_block method to handle the relative timeout, and allow restarts. 
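The deadline trick is the same one userspace code uses to survive signals during a timed wait. The sketch below is only an analogue of the kernel change, not kernel code: the wrapper name is invented and a POSIX semaphore stands in for the futex. The relative timeout is converted to an absolute deadline once, so every retry after an interruption waits only for the time that remains.

#include <errno.h>
#include <semaphore.h>
#include <time.h>

/* Wait on 'sem' for at most rel_ms milliseconds, surviving signals. */
static int demo_wait_restartable(sem_t *sem, long rel_ms)
{
        struct timespec abs;
        int ret;

        /* Absolute deadline, computed once (the role of restart->arg3). */
        clock_gettime(CLOCK_REALTIME, &abs);
        abs.tv_sec += rel_ms / 1000;
        abs.tv_nsec += (rel_ms % 1000) * 1000000L;
        if (abs.tv_nsec >= 1000000000L) {
                abs.tv_sec++;
                abs.tv_nsec -= 1000000000L;
        }

        /* Each EINTR retry waits only for the time still left. */
        do {
                ret = sem_timedwait(sem, &abs);
        } while (ret == -1 && errno == EINTR);

        return ret;
}

In the patch the restart_block carries that saved deadline instead of a loop: the interrupted wait stores abs_time in restart->arg3, returns -ERESTART_RESTARTBLOCK, and the signal-return path re-enters futex_wait_abstime() through futex_wait_restart().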
Signed-off-by: Nick Piggin Cc: Ulrich Drepper Cc: Rusty Russell Cc: Roland McGrath Cc: Oleg Nesterov Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 7ae2f50641ed..600bc9d801f2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -980,12 +980,15 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) drop_futex_key_refs(&q->key); } -static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) +static long futex_wait_restart(struct restart_block *restart); +static int futex_wait_abstime(u32 __user *uaddr, u32 val, + int timed, unsigned long abs_time) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; + unsigned long time_left = 0; u32 uval; int ret; @@ -1065,8 +1068,21 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * !list_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ - if (likely(!list_empty(&q.list))) - time = schedule_timeout(time); + time_left = 0; + if (likely(!list_empty(&q.list))) { + unsigned long rel_time; + + if (timed) { + unsigned long now = jiffies; + if (time_after(now, abs_time)) + rel_time = 0; + else + rel_time = abs_time - now; + } else + rel_time = MAX_SCHEDULE_TIMEOUT; + + time_left = schedule_timeout(rel_time); + } __set_current_state(TASK_RUNNING); /* @@ -1077,13 +1093,25 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) /* If we were woken (and unqueued), we succeeded, whatever. */ if (!unqueue_me(&q)) return 0; - if (time == 0) + if (time_left == 0) return -ETIMEDOUT; + /* * We expect signal_pending(current), but another thread may * have handled it for us already. */ - return -EINTR; + if (time_left == MAX_SCHEDULE_TIMEOUT) + return -ERESTARTSYS; + else { + struct restart_block *restart; + restart = &current_thread_info()->restart_block; + restart->fn = futex_wait_restart; + restart->arg0 = (unsigned long)uaddr; + restart->arg1 = (unsigned long)val; + restart->arg2 = (unsigned long)timed; + restart->arg3 = abs_time; + return -ERESTART_RESTARTBLOCK; + } out_unlock_release_sem: queue_unlock(&q, hb); @@ -1093,6 +1121,24 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) return ret; } +static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time) +{ + int timed = (rel_time != MAX_SCHEDULE_TIMEOUT); + return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time); +} + +static long futex_wait_restart(struct restart_block *restart) +{ + u32 __user *uaddr = (u32 __user *)restart->arg0; + u32 val = (u32)restart->arg1; + int timed = (int)restart->arg2; + unsigned long abs_time = restart->arg3; + + restart->fn = do_no_restart_syscall; + return (long)futex_wait_abstime(uaddr, val, timed, abs_time); +} + + /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. The kernel side here does the whole locking operation: -- cgit v1.2.3 From 69331af79cf29e26d1231152a172a1a10c2df511 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 8 May 2007 00:26:49 -0700 Subject: Fixes and cleanups for earlyprintk aka boot console The console subsystem already has an idea of a boot console, using the CON_BOOT flag. The implementation has some flaws though. 
The major problem is that presence of a boot console makes register_console() ignore any other console devices (unless explicitly specified on the kernel command line). This patch fixes the console selection code to *not* consider a boot console a full-featured one, so the first non-boot console registering will become the default console instead. This way the unregister call for the boot console in the register_console() function actually triggers and the handover from the boot console to the real console device works smoothly. Added a printk for the handover, so you know which console device the output goes to when the boot console stops printing messages. The disable_early_printk() call is obsolete with that patch, explicitly disabling the early console isn't needed any more as it works automagically with that patch. I've walked through the tree, dropped all disable_early_printk() instances found below arch/ and tagged the consoles with CON_BOOT if needed. The code is tested on x86, sh (thanks to Paul) and mips (thanks to Ralf). Changes to last version: Rediffed against -rc3, adapted to mips cleanups by Ralf, fixed "udbg-immortal" cmd line arg on powerpc. Signed-off-by: Gerd Hoffmann Acked-by: Paul Mundt Acked-by: Ralf Baechle Cc: Andi Kleen Cc: Alan Cox Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 4b47e59248df..c4c5a29a7bed 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -931,8 +931,16 @@ void register_console(struct console *console) { int i; unsigned long flags; + struct console *bootconsole = NULL; - if (preferred_console < 0) + if (console_drivers) { + if (console->flags & CON_BOOT) + return; + if (console_drivers->flags & CON_BOOT) + bootconsole = console_drivers; + } + + if (preferred_console < 0 || bootconsole || !console_drivers) preferred_console = selected_console; /* @@ -978,8 +986,11 @@ void register_console(struct console *console) if (!(console->flags & CON_ENABLED)) return; - if (console_drivers && (console_drivers->flags & CON_BOOT)) { - unregister_console(console_drivers); + if (bootconsole) { + printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", + bootconsole->name, bootconsole->index, + console->name, console->index); + unregister_console(bootconsole); console->flags &= ~CON_PRINTBUFFER; } @@ -1030,16 +1041,11 @@ int unregister_console(struct console *console) } } - /* If last console is removed, we re-enable picking the first - * one that gets registered. Without that, pmac early boot console - * would prevent fbcon from taking over. - * + /* * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. 
*/ - if (console_drivers == NULL) - preferred_console = selected_console; - else if (console->flags & CON_CONSDEV) + if (console_drivers != NULL && console->flags & CON_CONSDEV) console_drivers->flags |= CON_CONSDEV; release_console_sem(); -- cgit v1.2.3 From e3869792990f708c97be5877499cada70d469bd3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 8 May 2007 00:27:01 -0700 Subject: kprobes: fix sparse NULL warning Fix sparse NULL warnings: kernel/kprobes.c:915:49: warning: Using plain integer as NULL pointer Signed-off-by: Randy Dunlap Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d25a9ada3f8e..6c86d67ed1a7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -912,7 +913,7 @@ static int __kprobes debugfs_kprobe_init(void) if (!dir) return -ENOMEM; - file = debugfs_create_file("list", 0444, dir , 0 , + file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_kprobes_operations); if (!file) { debugfs_remove(dir); -- cgit v1.2.3 From 1eeb66a1bb973534dc3d064920a5ca683823372e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 May 2007 00:27:03 -0700 Subject: move die notifier handling to common code This patch moves the die notifier handling to common code. Previously, various architectures had exactly the same code for it. Note that the new code is compiled unconditionally; this should be understood as an appeal to the other architecture maintainers to implement support for it as well (aka sprinkling a notify_die or two in the proper places). arm had a notify_die that did something totally different; I renamed it to arm_notify_die as part of the patch and made it static to the file it's declared and used in. avr32 used to pass slightly less information through this interface, and I brought it into line with the other architectures.
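For illustration, here is a minimal sketch of what a client of the now-common API might look like (the "sample" names are hypothetical and not part of the patch; the enum die_val event values remain per-architecture):

	#include <linux/init.h>
	#include <linux/kdebug.h>
	#include <linux/module.h>
	#include <linux/notifier.h>

	/* Hypothetical client that logs every die event it sees. */
	static int sample_die_handler(struct notifier_block *nb,
				      unsigned long val, void *data)
	{
		struct die_args *args = data;

		printk(KERN_INFO "sample: die event %lu (%s), err %ld, signal %d\n",
		       val, args->str, args->err, args->signr);
		return NOTIFY_DONE;	/* let the rest of the chain run */
	}

	static struct notifier_block sample_die_nb = {
		.notifier_call = sample_die_handler,
	};

	static int __init sample_init(void)
	{
		return register_die_notifier(&sample_die_nb);
	}

	static void __exit sample_exit(void)
	{
		unregister_die_notifier(&sample_die_nb);
	}

	module_init(sample_init);
	module_exit(sample_exit);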
[akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: fix vmalloc_sync_all bustage] [bryan.wu@analog.com: fix vmalloc_sync_all in nommu] Signed-off-by: Christoph Hellwig Cc: Cc: Russell King Signed-off-by: Bryan Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/die_notifier.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/kprobes.c | 2 +- 3 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 kernel/die_notifier.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index ac6b27abb1ad..642d4277c2ea 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c new file mode 100644 index 000000000000..0d98827887a7 --- /dev/null +++ b/kernel/die_notifier.c @@ -0,0 +1,38 @@ + +#include +#include +#include +#include + + +static ATOMIC_NOTIFIER_HEAD(die_chain); + +int notify_die(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + + return atomic_notifier_call_chain(&die_chain, val, &args); +} + +int register_die_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(register_die_notifier); + +int unregister_die_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_die_notifier); + + diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6c86d67ed1a7..0207045b4f6f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -42,10 +42,10 @@ #include #include #include +#include #include #include #include -#include #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) -- cgit v1.2.3 From dd9037a26a1e6ebec9121b4681c414dc77189a90 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 8 May 2007 00:27:25 -0700 Subject: Fix race between attach_task and cpuset_exit Currently cpuset_exit() changes the exiting task's ->cpuset pointer w/o taking task_lock(). This can lead to ugly races between attach_task and cpuset_exit. Details of the races are described at http://lkml.org/lkml/2007/3/24/132. Patch below closes those races. Signed-off-by: Srivatsa Vaddagiri Cc: Paul Jackson Cc: Balbir Singh Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d240349cbf0f..bde714db2b26 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2200,10 +2200,6 @@ void cpuset_fork(struct task_struct *child) * it is holding that mutex while calling check_for_release(), * which calls kmalloc(), so can't be called holding callback_mutex(). * - * We don't need to task_lock() this reference to tsk->cpuset, - * because tsk is already marked PF_EXITING, so attach_task() won't - * mess with it, or task is a failed fork, never visible to attach_task. 
- * * the_top_cpuset_hack: * * Set the exiting tasks cpuset to the root cpuset (top_cpuset). @@ -2242,8 +2238,10 @@ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; + task_lock(current); cs = tsk->cpuset; tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ + task_unlock(current); if (notify_on_release(cs)) { char *pathbuf = NULL; -- cgit v1.2.3 From d2d9433a4c84c9e7ed78d633fdbffb35d5afda17 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Tue, 8 May 2007 00:27:31 -0700 Subject: kernel/irq/proc.c: unprotected iteration over the IRQ action list in name_unique() setup_irq() releases a desc->lock before calling register_handler_proc(), so the iteration over the IRQ action list is not protected. (akpm: the check itself is still racy, but at least it probably won't oops now). Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/proc.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 2db91eb54ad8..ddde0ef9ccdc 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -66,12 +66,19 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) { struct irq_desc *desc = irq_desc + irq; struct irqaction *action; + unsigned long flags; + int ret = 1; - for (action = desc->action ; action; action = action->next) + spin_lock_irqsave(&desc->lock, flags); + for (action = desc->action ; action; action = action->next) { if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) - return 0; - return 1; + !strcmp(new_action->name, action->name)) { + ret = 0; + break; + } + } + spin_unlock_irqrestore(&desc->lock, flags); + return ret; } void register_handler_proc(unsigned int irq, struct irqaction *action) -- cgit v1.2.3 From 6e453a67510a17f01b63835f18569e8c3939a38c Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Tue, 8 May 2007 00:27:44 -0700 Subject: Add support for deferrable timers Introduce a new flag for timers - deferrable: timers that work normally when the system is busy, but will not cause the CPU to come out of idle (just to service this timer) when the CPU is idle. Instead, such a timer will be serviced when the CPU eventually wakes up for a subsequent non-deferrable timer. The main advantage of this is to avoid unnecessary timer interrupts when the CPU is idle. If the routine currently called by a timer can wait until the next event without any issues, this new timer can be used to set up the timer event for that routine. This, with dynticks, allows CPUs to be lazy, letting them stay in idle for extended periods of time by reducing unnecessary wakeups and thereby reducing power consumption. This patch: Builds this new timer on top of the existing timer infrastructure. It uses the last bit in the 'base' pointer of the timer_list structure to store this deferrable timer flag. The __next_timer_interrupt() function skips over these deferrable timers when the CPU looks for the next timer event for which it has to wake up. This is exported by a new interface init_timer_deferrable() that can be called in place of regular init_timer().
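As a usage sketch (hypothetical driver code, not part of the patch): a housekeeping routine that runs every few seconds and can tolerate being deferred while the CPU sleeps would simply swap init_timer() for the new call:

	#include <linux/jiffies.h>
	#include <linux/timer.h>

	static struct timer_list hk_timer;	/* hypothetical housekeeping timer */

	static void hk_timer_fn(unsigned long data)
	{
		/* ... non-urgent periodic work ... */

		/* re-arm; servicing may be deferred until the CPU wakes anyway */
		mod_timer(&hk_timer, jiffies + 10 * HZ);
	}

	static void hk_start(void)
	{
		init_timer_deferrable(&hk_timer);	/* instead of init_timer() */
		hk_timer.function = hk_timer_fn;
		hk_timer.data = 0;
		mod_timer(&hk_timer, jiffies + 10 * HZ);
	}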
[akpm@linux-foundation.org: Privatise a #define] Signed-off-by: Venkatesh Pallipadi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Oleg Nesterov Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index b22bd39740dd..dbe966feff2f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -74,7 +74,7 @@ struct tvec_t_base_s { tvec_t tv3; tvec_t tv4; tvec_t tv5; -} ____cacheline_aligned_in_smp; +} ____cacheline_aligned; typedef struct tvec_t_base_s tvec_base_t; @@ -82,6 +82,37 @@ tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; +/* + * Note that all tvec_bases is 2 byte aligned and lower bit of + * base in timer_list is guaranteed to be zero. Use the LSB for + * the new flag to indicate whether the timer is deferrable + */ +#define TBASE_DEFERRABLE_FLAG (0x1) + +/* Functions below help us manage 'deferrable' flag */ +static inline unsigned int tbase_get_deferrable(tvec_base_t *base) +{ + return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); +} + +static inline tvec_base_t *tbase_get_base(tvec_base_t *base) +{ + return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); +} + +static inline void timer_set_deferrable(struct timer_list *timer) +{ + timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | + TBASE_DEFERRABLE_FLAG)); +} + +static inline void +timer_set_base(struct timer_list *timer, tvec_base_t *new_base) +{ + timer->base = (tvec_base_t *)((unsigned long)(new_base) | + tbase_get_deferrable(timer->base)); +} + /** * __round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded @@ -295,6 +326,13 @@ void fastcall init_timer(struct timer_list *timer) } EXPORT_SYMBOL(init_timer); +void fastcall init_timer_deferrable(struct timer_list *timer) +{ + init_timer(timer); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL(init_timer_deferrable); + static inline void detach_timer(struct timer_list *timer, int clear_pending) { @@ -325,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer, tvec_base_t *base; for (;;) { - base = timer->base; + tvec_base_t *prelock_base = timer->base; + base = tbase_get_base(prelock_base); if (likely(base != NULL)) { spin_lock_irqsave(&base->lock, *flags); - if (likely(base == timer->base)) + if (likely(prelock_base == timer->base)) return base; /* The timer has migrated to another CPU */ spin_unlock_irqrestore(&base->lock, *flags); @@ -365,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ - timer->base = NULL; + timer_set_base(timer, NULL); spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer->base = base; + timer_set_base(timer, base); } } @@ -397,7 +436,7 @@ void add_timer_on(struct timer_list *timer, int cpu) timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); - timer->base = base; + timer_set_base(timer, base); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); } @@ -550,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * don't have to detach them individually. 
*/ list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(timer->base != base); + BUG_ON(tbase_get_base(timer->base) != base); internal_add_timer(base, timer); } @@ -636,6 +675,9 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base) index = slot = timer_jiffies & TVR_MASK; do { list_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + found = 1; expires = nte->expires; /* Look at the cascade bucket(s)? */ @@ -1617,6 +1659,13 @@ static int __devinit init_timers_cpu(int cpu) cpu_to_node(cpu)); if (!base) return -ENOMEM; + + /* Make sure that tvec_base is 2 byte aligned */ + if (tbase_get_deferrable(base)) { + WARN_ON(1); + kfree(base); + return -ENOMEM; + } memset(base, 0, sizeof(*base)); per_cpu(tvec_bases, cpu) = base; } else { @@ -1658,7 +1707,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) while (!list_empty(head)) { timer = list_entry(head->next, struct timer_list, entry); detach_timer(timer, 0); - timer->base = new_base; + timer_set_base(timer, new_base); internal_add_timer(new_base, timer); } } -- cgit v1.2.3 From f75d222b836f7febfab0954c7612b23059d748cb Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 8 May 2007 00:27:55 -0700 Subject: IRQ: check for PERCPU flag only when adding first irqaction An irqaction structure won't be added to an IRQ descriptor irqaction list if it doesn't agree with other irqactions on the IRQF_PERCPU flag. Don't check for this flag to change IRQ descriptor `status' for every irqaction added to the list, Doing the check only for the first irqaction added is enough. Signed-off-by: Ahmed S. Darwish Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5597c157442a..203a518b6f14 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -317,10 +317,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) } *p = new; -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif + /* Exclude IRQ from balancing */ if (new->flags & IRQF_NOBALANCING) desc->status |= IRQ_NO_BALANCING; @@ -328,6 +325,11 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (!shared) { irq_chip_set_defaults(desc->chip); +#if defined(CONFIG_IRQ_PER_CPU) + if (new->flags & IRQF_PERCPU) + desc->status |= IRQ_PER_CPU; +#endif + /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { if (desc->chip && desc->chip->set_type) -- cgit v1.2.3 From 8524070b7982d76258942275908b7434cfcab4b4 Mon Sep 17 00:00:00 2001 From: john stultz Date: Tue, 8 May 2007 00:27:59 -0700 Subject: Move timekeeping code to timekeeping.c Move the timekeeping code out of kernel/timer.c and into kernel/time/timekeeping.c. I made no cleanups or other changes in transit. 
[akpm@linux-foundation.org: build fix] Signed-off-by: John Stultz Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/Makefile | 2 +- kernel/time/timekeeping.c | 476 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 459 +------------------------------------------- 3 files changed, 478 insertions(+), 459 deletions(-) create mode 100644 kernel/time/timekeeping.c (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 93bccba1f265..99b6034fc86b 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c new file mode 100644 index 000000000000..f9217bf644f6 --- /dev/null +++ b/kernel/time/timekeeping.c @@ -0,0 +1,476 @@ +/* + * linux/kernel/time/timekeeping.c + * + * Kernel timekeeping code and accessor functions + * + * This code was moved from linux/kernel/timer.c. + * Please see that file for copyright and history logs. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * This read-write spinlock protects us from races in SMP while + * playing with xtime and avenrun. + */ +__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); + +EXPORT_SYMBOL(xtime_lock); + + +/* + * The current time + * wall_to_monotonic is what we need to add to xtime (or xtime corrected + * for sub jiffie times) to get to monotonic time. Monotonic is pegged + * at zero at system boot time, so wall_to_monotonic will be negative, + * however, we will ALWAYS keep the tv_nsec part positive so we can use + * the usual normalization. + */ +struct timespec xtime __attribute__ ((aligned (16))); +struct timespec wall_to_monotonic __attribute__ ((aligned (16))); + +EXPORT_SYMBOL(xtime); + + +static struct clocksource *clock; /* pointer to current clocksource */ + + +#ifdef CONFIG_GENERIC_TIME +/** + * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * + * private function, must hold xtime_lock lock when being + * called. Returns the number of nanoseconds since the + * last call to update_wall_time() (adjusted by NTP scaling) + */ +static inline s64 __get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + +/** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. Used by + * do_gettimeofday() and get_realtime_clock_ts(). + */ +static inline void __get_realtime_clock_ts(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + + do { + seq = read_seqbegin(&xtime_lock); + + *ts = xtime; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} + +/** + * getnstimeofday - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. 
+ */ +void getnstimeofday(struct timespec *ts) +{ + __get_realtime_clock_ts(ts); +} + +EXPORT_SYMBOL(getnstimeofday); + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now; + + __get_realtime_clock_ts(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); +/** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + nsec -= __get_nsec_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + clock->error = 0; + ntp_clear(); + + update_vsyscall(&xtime, clock); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static void change_clocksource(void) +{ + struct clocksource *new; + cycle_t now; + u64 nsec; + + new = clocksource_get_next(); + + if (clock == new) + return; + + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + clock->cycle_last = now; + + clock->error = 0; + clock->xtime_nsec = 0; + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + + tick_clock_notify(); + + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); +} +#else +static inline void change_clocksource(void) { } +#endif + +/** + * timekeeping_is_continuous - check to see if timekeeping is free running + */ +int timekeeping_is_continuous(void) +{ + unsigned long seq; + int ret; + + do { + seq = read_seqbegin(&xtime_lock); + + ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + + } while (read_seqretry(&xtime_lock, seq)); + + return ret; +} + +/** + * read_persistent_clock - Return time in seconds from the persistent clock. + * + * Weak dummy function for arches that do not yet support it. + * Returns seconds from epoch using the battery backed persistent clock. + * Returns zero if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. 
+ */ +unsigned long __attribute__((weak)) read_persistent_clock(void) +{ + return 0; +} + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + unsigned long flags; + unsigned long sec = read_persistent_clock(); + + write_seqlock_irqsave(&xtime_lock, flags); + + ntp_clear(); + + clock = clocksource_get_next(); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + clock->cycle_last = clocksource_read(clock); + + xtime.tv_sec = sec; + xtime.tv_nsec = 0; + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + write_sequnlock_irqrestore(&xtime_lock, flags); +} + +/* flag for if timekeeping is suspended */ +static int timekeeping_suspended; +/* time in seconds when suspend began */ +static unsigned long timekeeping_suspend_time; + +/** + * timekeeping_resume - Resumes the generic timekeeping subsystem. + * @dev: unused + * + * This is for the generic clocksource timekeeping. + * xtime/wall_to_monotonic/jiffies/etc are + * still managed by arch specific suspend/resume code. + */ +static int timekeeping_resume(struct sys_device *dev) +{ + unsigned long flags; + unsigned long now = read_persistent_clock(); + + write_seqlock_irqsave(&xtime_lock, flags); + + if (now && (now > timekeeping_suspend_time)) { + unsigned long sleep_length = now - timekeeping_suspend_time; + + xtime.tv_sec += sleep_length; + wall_to_monotonic.tv_sec -= sleep_length; + } + /* re-base the last cycle value */ + clock->cycle_last = clocksource_read(clock); + clock->error = 0; + timekeeping_suspended = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + + touch_softlockup_watchdog(); + + clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); + + /* Resume hrtimers */ + hres_timers_resume(); + + return 0; +} + +static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + timekeeping_suspended = 1; + timekeeping_suspend_time = read_persistent_clock(); + write_sequnlock_irqrestore(&xtime_lock, flags); + + clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct sysdev_class timekeeping_sysclass = { + .resume = timekeeping_resume, + .suspend = timekeeping_suspend, + set_kset_name("timekeeping"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timekeeping_sysclass, +}; + +static int __init timekeeping_init_device(void) +{ + int error = sysdev_class_register(&timekeeping_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(timekeeping_init_device); + +/* + * If the error is already larger, we look ahead even further + * to compensate for late or lost adjustments. + */ +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, + s64 *offset) +{ + s64 tick_error, i; + u32 look_ahead, adj; + s32 error2, mult; + + /* + * Use the current error value to determine how much to look ahead. + * The larger the error the slower we adjust for it to avoid problems + * with losing too many ticks, otherwise we would overadjust and + * produce an even larger error. The smaller the adjustment the + * faster we try to adjust for it, as lost ticks can do less harm + * here. This is tuned so that an error of about 1 msec is adusted + * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 
+ */ + error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = abs(error2); + for (look_ahead = 0; error2 > 0; look_ahead++) + error2 >>= 2; + + /* + * Now calculate the error in (1 << look_ahead) ticks, but first + * remove the single look ahead already included in the error. + */ + tick_error = current_tick_length() >> + (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error -= clock->xtime_interval >> 1; + error = ((error - tick_error) >> look_ahead) + tick_error; + + /* Finally calculate the adjustment shift value. */ + i = *interval; + mult = 1; + if (error < 0) { + error = -error; + *interval = -*interval; + *offset = -*offset; + mult = -1; + } + for (adj = 0; error > i; adj++) + error >>= 1; + + *interval <<= adj; + *offset <<= adj; + return mult << adj; +} + +/* + * Adjust the multiplier to reduce the error value, + * this is optimized for the most common adjustments of -1,0,1, + * for other values we can do a bit more work. + */ +static void clocksource_adjust(struct clocksource *clock, s64 offset) +{ + s64 error, interval = clock->cycle_interval; + int adj; + + error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); + if (error > interval) { + error >>= 2; + if (likely(error <= interval)) + adj = 1; + else + adj = clocksource_bigadjust(error, &interval, &offset); + } else if (error < -interval) { + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else + adj = clocksource_bigadjust(error, &interval, &offset); + } else + return; + + clock->mult += adj; + clock->xtime_interval += interval; + clock->xtime_nsec -= offset; + clock->error -= (interval - offset) << + (TICK_LENGTH_SHIFT - clock->shift); +} + +/** + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +void update_wall_time(void) +{ + cycle_t offset; + + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + return; + +#ifdef CONFIG_GENERIC_TIME + offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; +#else + offset = clock->cycle_interval; +#endif + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + + /* normally this loop will run just once, however in the + * case of lost or late ticks, it will accumulate correctly. 
+ */ + while (offset >= clock->cycle_interval) { + /* accumulate one interval */ + clock->xtime_nsec += clock->xtime_interval; + clock->cycle_last += clock->cycle_interval; + offset -= clock->cycle_interval; + + if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { + clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; + xtime.tv_sec++; + second_overflow(); + } + + /* interpolator bits */ + time_interpolator_update(clock->xtime_interval + >> clock->shift); + + /* accumulate error between NTP and clock interval */ + clock->error += current_tick_length(); + clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); + } + + /* correct the clock when NTP error is too big */ + clocksource_adjust(clock, offset); + + /* store full nanoseconds into xtime */ + xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; + clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + + /* check to see if there is a new clocksource to use */ + change_clocksource(); + update_vsyscall(&xtime, clock); +} diff --git a/kernel/timer.c b/kernel/timer.c index dbe966feff2f..ba41af2bb6cc 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1,7 +1,7 @@ /* * linux/kernel/timer.c * - * Kernel internal timers, kernel timekeeping, basic process system calls + * Kernel internal timers, basic process system calls * * Copyright (C) 1991, 1992 Linus Torvalds * @@ -794,455 +794,6 @@ unsigned long next_timer_interrupt(void) #endif -/******************************************************************/ - -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); - -EXPORT_SYMBOL(xtime); - - -/* XXX - all of this timekeeping code should be later moved to time.c */ -#include -static struct clocksource *clock; /* pointer to current clocksource */ - -#ifdef CONFIG_GENERIC_TIME -/** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook - * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) - */ -static inline s64 __get_nsec_offset(void) -{ - cycle_t cycle_now, cycle_delta; - s64 ns_offset; - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); - - return ns_offset; -} - -/** - * __get_realtime_clock_ts - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. Used by - * do_gettimeofday() and get_realtime_clock_ts(). - */ -static inline void __get_realtime_clock_ts(struct timespec *ts) -{ - unsigned long seq; - s64 nsecs; - - do { - seq = read_seqbegin(&xtime_lock); - - *ts = xtime; - nsecs = __get_nsec_offset(); - - } while (read_seqretry(&xtime_lock, seq)); - - timespec_add_ns(ts, nsecs); -} - -/** - * getnstimeofday - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. 
- */ -void getnstimeofday(struct timespec *ts) -{ - __get_realtime_clock_ts(ts); -} - -EXPORT_SYMBOL(getnstimeofday); - -/** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using get_realtime_clock_ts() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec now; - - __get_realtime_clock_ts(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} - -EXPORT_SYMBOL(do_gettimeofday); -/** - * do_settimeofday - Sets the time of day - * @tv: pointer to the timespec variable containing the new time - * - * Sets the time of day to the new time and update NTP and notify hrtimers - */ -int do_settimeofday(struct timespec *tv) -{ - unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - nsec -= __get_nsec_offset(); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - clock->error = 0; - ntp_clear(); - - update_vsyscall(&xtime, clock); - - write_sequnlock_irqrestore(&xtime_lock, flags); - - /* signal hrtimers about time change */ - clock_was_set(); - - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - -/** - * change_clocksource - Swaps clocksources if a new one is available - * - * Accumulates current time interval and initializes new clocksource - */ -static void change_clocksource(void) -{ - struct clocksource *new; - cycle_t now; - u64 nsec; - - new = clocksource_get_next(); - - if (clock == new) - return; - - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); - - clock = new; - clock->cycle_last = now; - - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - - tick_clock_notify(); - - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); -} -#else -static inline void change_clocksource(void) { } -#endif - -/** - * timekeeping_is_continuous - check to see if timekeeping is free running - */ -int timekeeping_is_continuous(void) -{ - unsigned long seq; - int ret; - - do { - seq = read_seqbegin(&xtime_lock); - - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - - } while (read_seqretry(&xtime_lock, seq)); - - return ret; -} - -/** - * read_persistent_clock - Return time in seconds from the persistent clock. - * - * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. 
- */ -unsigned long __attribute__((weak)) read_persistent_clock(void) -{ - return 0; -} - -/* - * timekeeping_init - Initializes the clocksource and common timekeeping values - */ -void __init timekeeping_init(void) -{ - unsigned long flags; - unsigned long sec = read_persistent_clock(); - - write_seqlock_irqsave(&xtime_lock, flags); - - ntp_clear(); - - clock = clocksource_get_next(); - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); - - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - - write_sequnlock_irqrestore(&xtime_lock, flags); -} - -/* flag for if timekeeping is suspended */ -static int timekeeping_suspended; -/* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; - -/** - * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused - * - * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/etc are - * still managed by arch specific suspend/resume code. - */ -static int timekeeping_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long now = read_persistent_clock(); - - write_seqlock_irqsave(&xtime_lock, flags); - - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - } - /* re-base the last cycle value */ - clock->cycle_last = clocksource_read(clock); - clock->error = 0; - timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); - - touch_softlockup_watchdog(); - - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); - - /* Resume hrtimers */ - hres_timers_resume(); - - return 0; -} - -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - timekeeping_suspended = 1; - timekeeping_suspend_time = read_persistent_clock(); - write_sequnlock_irqrestore(&xtime_lock, flags); - - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); - - return 0; -} - -/* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, - set_kset_name("timekeeping"), -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) -{ - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(timekeeping_init_device); - -/* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. - */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, - s64 *offset) -{ - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; - - /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 
- */ - error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; - - /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. - */ - tick_error = current_tick_length() >> - (TICK_LENGTH_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. */ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; - } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. - */ -static void clocksource_adjust(struct clocksource *clock, s64 offset) -{ - s64 error, interval = clock->cycle_interval; - int adj; - - error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); - if (error > interval) { - error >>= 2; - if (likely(error <= interval)) - adj = 1; - else - adj = clocksource_bigadjust(error, &interval, &offset); - } else if (error < -interval) { - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = clocksource_bigadjust(error, &interval, &offset); - } else - return; - - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (TICK_LENGTH_SHIFT - clock->shift); -} - -/** - * update_wall_time - Uses the current clocksource to increment the wall time - * - * Called from the timer interrupt, must hold a write on xtime_lock. - */ -static void update_wall_time(void) -{ - cycle_t offset; - - /* Make sure we're fully resumed: */ - if (unlikely(timekeeping_suspended)) - return; - -#ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; -#else - offset = clock->cycle_interval; -#endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; - - /* normally this loop will run just once, however in the - * case of lost or late ticks, it will accumulate correctly. - */ - while (offset >= clock->cycle_interval) { - /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; - offset -= clock->cycle_interval; - - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; - xtime.tv_sec++; - second_overflow(); - } - - /* interpolator bits */ - time_interpolator_update(clock->xtime_interval - >> clock->shift); - - /* accumulate error between NTP and clock interval */ - clock->error += current_tick_length(); - clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); - } - - /* correct the clock when NTP error is too big */ - clocksource_adjust(clock, offset); - - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - - /* check to see if there is a new clocksource to use */ - change_clocksource(); - update_vsyscall(&xtime, clock); -} - /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. 
@@ -1305,14 +856,6 @@ static inline void calc_load(unsigned long ticks) } } -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. - */ -__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - -EXPORT_SYMBOL(xtime_lock); - /* * This function runs timers and the timer-tq in bottom half context. */ -- cgit v1.2.3 From 966812dc98e6a7fcdf759cbfa0efab77500a8868 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 May 2007 00:28:02 -0700 Subject: Ignore stolen time in the softlockup watchdog The softlockup watchdog is currently a nuisance in a virtual machine, since the whole system could have the CPU stolen from it for a long period of time. While it would be unlikely for a guest domain to be denied timer interrupts for over 10s, it could happen and any softlockup message would be completely spurious. Earlier I proposed that sched_clock() return time in unstolen nanoseconds, which is how Xen and VMI currently implement it. If the softlockup watchdog uses sched_clock() to measure time, it would automatically ignore stolen time, and therefore only report when the guest itself locked up. When running native, sched_clock() returns real-time nanoseconds, so the behaviour would be unchanged. Note that sched_clock() used this way is inherently per-cpu, so this patch makes sure that the per-processor watchdog thread initialized its own timestamp. Signed-off-by: Jeremy Fitzhardinge Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Zachary Amsden Cc: James Morris Cc: Dan Hecht Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Prarit Bhargava Cc: Chris Lalancette Cc: Rick Lindsley Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softlockup.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 81d43caa2012..5ea631742dbc 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -34,9 +34,19 @@ static struct notifier_block panic_block = { .notifier_call = softlock_panic, }; +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. 
*/ +static unsigned long get_timestamp(void) +{ + return sched_clock() >> 30; /* 2^30 ~= 10^9 */ +} + void touch_softlockup_watchdog(void) { - __raw_get_cpu_var(touch_timestamp) = jiffies; + __raw_get_cpu_var(touch_timestamp) = get_timestamp(); } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -48,9 +58,17 @@ void softlockup_tick(void) { int this_cpu = smp_processor_id(); unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); + unsigned long print_timestamp; + unsigned long now; - /* prevent double reports: */ - if (per_cpu(print_timestamp, this_cpu) == touch_timestamp || + /* watchdog task hasn't updated timestamp yet */ + if (touch_timestamp == 0) + return; + + print_timestamp = per_cpu(print_timestamp, this_cpu); + + /* report at most once a second */ + if (print_timestamp < (touch_timestamp + 1) || did_panic || !per_cpu(watchdog_task, this_cpu)) return; @@ -61,12 +79,14 @@ void softlockup_tick(void) return; } + now = get_timestamp(); + /* Wake up the high-prio watchdog task every second: */ - if (time_after(jiffies, touch_timestamp + HZ)) + if (now > (touch_timestamp + 1)) wake_up_process(per_cpu(watchdog_task, this_cpu)); /* Warn about unreasonable 10+ seconds delays: */ - if (time_after(jiffies, touch_timestamp + 10*HZ)) { + if (now > (touch_timestamp + 10)) { per_cpu(print_timestamp, this_cpu) = touch_timestamp; spin_lock(&print_lock); @@ -87,6 +107,9 @@ static int watchdog(void * __bind_cpu) sched_setscheduler(current, SCHED_FIFO, &param); current->flags |= PF_NOFREEZE; + /* initialize timestamp */ + touch_softlockup_watchdog(); + /* * Run briefly once per second to reset the softlockup timestamp. * If this gets delayed for more than 10 seconds then the @@ -118,7 +141,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) printk("watchdog for %i failed\n", hotcpu); return NOTIFY_BAD; } - per_cpu(touch_timestamp, hotcpu) = jiffies; + per_cpu(touch_timestamp, hotcpu) = 0; per_cpu(watchdog_task, hotcpu) = p; kthread_bind(p, hotcpu); break; -- cgit v1.2.3 From 04c9167f91e309c9c4ea982992aa08e83b2eb42e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 May 2007 00:28:05 -0700 Subject: add touch_all_softlockup_watchdogs() Add touch_all_softlockup_watchdogs() to allow the softlockup watchdog timers on all cpus to be updated. This is used to prevent sysrq-t from generating a spurious watchdog message when generating lots of output. Softlockup watchdogs use sched_clock() as their timebase, which is inherently per-cpu (at least, when it is measuring unstolen time). Because of this, it isn't possible for one CPU to directly update another CPU's timers, but it is possible to tell the other CPUs to update themselves appropriately.
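To illustrate the intended use (hypothetical caller; sysrq-t via show_state_filter() in the hunk below is the real one): any path that can legitimately hold off CPUs for many seconds may now reset every CPU's timestamp, not just its own.

	#include <linux/kernel.h>
	#include <linux/sched.h>	/* assumed home of touch_all_softlockup_watchdogs() */

	/* Hypothetical debug dump that may keep the console busy for a while. */
	static void sample_dump_state(void)
	{
		int i;

		for (i = 0; i < 100000; i++) {
			printk(KERN_DEBUG "sample: entry %d\n", i);

			/* avoid spurious softlockup reports on all CPUs */
			if ((i % 1024) == 0)
				touch_all_softlockup_watchdogs();
		}
	}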
Signed-off-by: Jeremy Fitzhardinge Acked-by: Chris Lalancette Signed-off-by: Prarit Bhargava Cc: Rick Lindsley Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 ++ kernel/softlockup.c | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0227f1625a75..5530ed211f72 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4750,6 +4750,8 @@ void show_state_filter(unsigned long state_filter) show_task(p); } while_each_thread(g, p); + touch_all_softlockup_watchdogs(); + read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 5ea631742dbc..8fa7040247ad 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -50,6 +50,16 @@ void touch_softlockup_watchdog(void) } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_all_softlockup_watchdogs(void) +{ + int cpu; + + /* Cause each CPU to re-update its timestamp rather than complain */ + for_each_online_cpu(cpu) + per_cpu(touch_timestamp, cpu) = 0; +} +EXPORT_SYMBOL(touch_all_softlockup_watchdogs); + /* * This callback runs from the timer interrupt, and checks * whether the watchdog thread has hung or not: @@ -61,9 +71,10 @@ void softlockup_tick(void) unsigned long print_timestamp; unsigned long now; - /* watchdog task hasn't updated timestamp yet */ - if (touch_timestamp == 0) + if (touch_timestamp == 0) { + touch_softlockup_watchdog(); return; + } print_timestamp = per_cpu(print_timestamp, this_cpu); -- cgit v1.2.3 From e63340ae6b6205fef26b40a75673d1c9c0c8bb90 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 8 May 2007 00:28:08 -0700 Subject: header cleaning: don't include smp_lock.h when not used Remove includes of <linux/smp_lock.h> where it is not used/needed. Suggested by Al Viro. Builds cleanly on x86_64, i386, alpha, ia64, powerpc, sparc, sparc64, and arm (all 59 defconfigs).
Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 1 - kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/itimer.c | 1 - kernel/kmod.c | 1 - kernel/posix-timers.c | 1 - kernel/power/process.c | 1 - kernel/power/snapshot.c | 1 - kernel/power/swap.c | 1 - kernel/printk.c | 1 - kernel/signal.c | 1 - kernel/time.c | 1 - kernel/uid16.c | 1 - 13 files changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index bde714db2b26..e6d002fc7392 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index 92369240d91d..f5a7abb621f3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index fd211b9dddd4..a8dd75d4992b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/itimer.c b/kernel/itimer.c index 307c6a632ef6..4523f3396f23 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -7,7 +7,6 @@ /* These are all the functions necessary to implement itimers */ #include -#include #include #include #include diff --git a/kernel/kmod.c b/kernel/kmod.c index 796276141e51..11c584cd16b3 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 44318ca71978..588c99da0307 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -31,7 +31,6 @@ * POSIX clocks & timers */ #include -#include #include #include #include diff --git a/kernel/power/process.c b/kernel/power/process.c index 179529dc3819..088419387388 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -8,7 +8,6 @@ #undef DEBUG -#include #include #include #include diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 128da11f01c2..b7039772b05c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e83ed9945a80..b8b235cc19d1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,7 +12,6 @@ */ #include -#include #include #include #include diff --git a/kernel/printk.c b/kernel/printk.c index c4c5a29a7bed..0bbdeac2810c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 2b4087d545a3..1368e67c8482 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include diff --git a/kernel/time.c b/kernel/time.c index 6d98ab72f38b..f04791f69408 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/uid16.c b/kernel/uid16.c index 187e2a423878..dd308ba4e03b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 6672f76a5a1878d42264c1deba8f1ab52b4618d9 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 8 May 2007 00:28:22 -0700 Subject: kdump/kexec: calculate note size at compile time Currently the size of the per-cpu region reserved to save 
crash notes is set by the per-architecture value MAX_NOTE_BYTES, which in turn is currently set to 1024 on all supported architectures. While testing ia64 I recently discovered that this value is in fact too small. The particular setup I was using actually needs 1172 bytes. This led to a very tedious failure mode where the tail of one elf note would overwrite the head of another if they ended up being allocated sequentially by kmalloc, which was often the case. It seems to me that a far better approach is to calculate the size that the area needs to be. This patch does just that. If a simpler stop-gap patch for ia64, to be squeezed into 2.6.21(.X), is needed, then this should be as easy as making MAX_NOTE_BYTES larger in arch/asm-ia64/kexec.h. Perhaps 2048 would be a good choice. However, I think that the approach in this patch is a much more robust idea. Acked-by: Vivek Goyal Signed-off-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 2a59c8a01ae0..25db14b89e82 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1118,8 +1118,8 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, - sizeof(prstatus)); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); final_note(buf); } -- cgit v1.2.3 From b73a7e76c1eeaa770a41554698917c3c45686a07 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Tue, 8 May 2007 00:28:24 -0700 Subject: Fix kevent's childs priority greediness Fix kevent's children's priority greediness. Such tasks were always scheduled at nice level -5 and, at that time, udev stole CPU time from us at -5. Already posted at http://lkml.org/lkml/2005/1/10/85 [akpm@linux-foundation.org: add comment] Signed-off-by: Jan Engelhardt Cc: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 11c584cd16b3..49cc4b9c1a8d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -165,6 +165,12 @@ static int ____call_usermodehelper(void *data) /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed(current, CPU_MASK_ALL); + /* + * Our parent is keventd, which runs with elevated scheduling priority. + * Avoid propagating that into the userspace child. + */ + set_user_nice(current, 0); + retval = -EPERM; if (current->fs->root) retval = kernel_execve(sub_info->path, -- cgit v1.2.3 From ae84e324709d6320ed8c1fd7b1736fcbaf26df95 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 8 May 2007 00:28:38 -0700 Subject: Simplify module_get_kallsym() by dropping length arg module_get_kallsym() could in theory truncate a module symbol name to fit in the buffer, but nobody does this. Always use KSYM_NAME_LEN + 1 bytes for the name. Suggested by lg^WRusty.
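The resulting caller-side contract is just a fixed-size buffer. A sketch follows (hypothetical caller; the /proc/kallsyms iterator in kernel/kallsyms.c is the one real user):

	#include <linux/kallsyms.h>
	#include <linux/module.h>

	/* Hypothetical caller: print the symnum-th module symbol, if any. */
	static void sample_show_module_symbol(unsigned int symnum)
	{
		struct module *owner;
		unsigned long value;
		char type;
		char name[KSYM_NAME_LEN + 1];	/* always this size, per this patch */

		owner = module_get_kallsym(symnum, &value, &type, name);
		if (owner)
			printk(KERN_DEBUG "%lx %c %s [%s]\n",
			       value, type, name, module_name(owner));
	}

Note that holding on to the returned struct module pointer like this is precisely the race the next commit in this series closes.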
Signed-off-by: Alexey Dobriyan Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 2 +- kernel/module.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5a0de8409739..d086c91d44ed 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -312,7 +312,7 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) { iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, &iter->type, - iter->name, sizeof(iter->name)); + iter->name); if (iter->owner == NULL) return 0; diff --git a/kernel/module.c b/kernel/module.c index 9bdbd1217a6f..43a529a1fa48 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -2124,7 +2125,7 @@ const char *module_address_lookup(unsigned long addr, } struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, size_t namelen) + char *type, char *name) { struct module *mod; @@ -2134,7 +2135,7 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - namelen); + KSYM_NAME_LEN + 1); mutex_unlock(&module_mutex); return mod; } -- cgit v1.2.3 From ea07890a680273b25127129fb555aac0d9324bea Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 8 May 2007 00:28:39 -0700 Subject: Fix race between rmmod and cat /proc/kallsyms module_get_kallsym() leaks "struct module *" outside of module_mutex, which is a no-no, because the module can disappear right after mutex unlock. Copy all needed information from inside module_mutex into caller-supplied space. [bunk@stusta.de: is_exported() can now become static] Signed-off-by: Alexey Dobriyan Cc: Rusty Russell Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 30 +++++++++++++++--------------- kernel/module.c | 12 +++++++----- 2 files changed, 22 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index d086c91d44ed..f1ea6f66ac6c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -301,25 +301,20 @@ void __print_symbol(const char *fmt, unsigned long address) struct kallsym_iter { loff_t pos; - struct module *owner; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols */ char type; char name[KSYM_NAME_LEN+1]; + char module_name[MODULE_NAME_LEN + 1]; + int exported; }; static int get_ksymbol_mod(struct kallsym_iter *iter) { - iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, &iter->type, - iter->name); - if (iter->owner == NULL) + if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, + &iter->type, iter->name, iter->module_name, + &iter->exported) < 0) return 0; - - /* Label it "global" if it is exported, "local" if not exported. */ - iter->type = is_exported(iter->name, iter->owner) - ?
toupper(iter->type) : tolower(iter->type); - return 1; } @@ -328,7 +323,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { unsigned off = iter->nameoff; - iter->owner = NULL; + iter->module_name[0] = '\0'; iter->value = kallsyms_addresses[iter->pos]; iter->type = kallsyms_get_symbol_type(off); @@ -392,12 +387,17 @@ static int s_show(struct seq_file *m, void *p) if (!iter->name[0]) return 0; - if (iter->owner) + if (iter->module_name[0]) { + char type; + + /* Label it "global" if it is exported, + * "local" if not exported. */ + type = iter->exported ? toupper(iter->type) : + tolower(iter->type); seq_printf(m, "%0*lx %c %s\t[%s]\n", (int)(2*sizeof(void*)), - iter->value, iter->type, iter->name, - module_name(iter->owner)); - else + iter->value, type, iter->name, iter->module_name); + } else seq_printf(m, "%0*lx %c %s\n", (int)(2*sizeof(void*)), iter->value, iter->type, iter->name); diff --git a/kernel/module.c b/kernel/module.c index 43a529a1fa48..5ee65994a3bc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1472,7 +1472,7 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } #ifdef CONFIG_KALLSYMS -int is_exported(const char *name, const struct module *mod) +static int is_exported(const char *name, const struct module *mod) { if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) return 1; @@ -2124,8 +2124,8 @@ const char *module_address_lookup(unsigned long addr, return NULL; } -struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name) +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported) { struct module *mod; @@ -2136,13 +2136,15 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, KSYM_NAME_LEN + 1); + strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); + *exported = is_exported(name, mod); mutex_unlock(&module_mutex); - return mod; + return 0; } symnum -= mod->num_symtab; } mutex_unlock(&module_mutex); - return NULL; + return -ERANGE; } static unsigned long mod_find_symname(struct module *mod, const char *name) -- cgit v1.2.3 From ffb45122766db220d0bf3d01848d575fbbcb6430 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 8 May 2007 00:28:41 -0700 Subject: Simplify kallsyms_lookup() Several kallsyms_lookup() callers pass dummy arguments but only need, say, the module's name. Make kallsyms_lookup() accept NULLs where possible. It also makes the picture clearer about which interfaces are needed for all the symbol-resolving business.
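For illustration, a caller that wants only the name can now do the following (a fragment mirroring the lockdep hunk below; addr is assumed to hold the address being resolved):

	char namebuf[KSYM_NAME_LEN + 1];
	const char *sym;

	/* size, offset and modname are not needed, so pass NULL for them */
	sym = kallsyms_lookup(addr, NULL, NULL, NULL, namebuf);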
Signed-off-by: Alexey Dobriyan Cc: Rusty Russell Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 6 ++++-- kernel/kprobes.c | 4 ++-- kernel/lockdep.c | 5 +---- kernel/module.c | 6 ++++-- kernel/time/timer_list.c | 4 +--- kernel/time/timer_stats.c | 4 +--- 6 files changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f1ea6f66ac6c..f66da025cb7f 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -214,8 +214,10 @@ static unsigned long get_symbol_pos(unsigned long addr, symbol_end = (unsigned long)_etext; } - *symbolsize = symbol_end - symbol_start; - *offset = addr - symbol_start; + if (symbolsize) + *symbolsize = symbol_end - symbol_start; + if (offset) + *offset = addr - symbol_start; return low; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0207045b4f6f..bee29bde6adf 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -868,13 +868,13 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) struct kprobe *p, *kp; const char *sym = NULL; unsigned int i = *(loff_t *) v; - unsigned long size, offset = 0; + unsigned long offset = 0; char *modname, namebuf[128]; head = &kprobe_table[i]; preempt_disable(); hlist_for_each_entry_rcu(p, node, head, hlist) { - sym = kallsyms_lookup((unsigned long)p->addr, &size, + sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); if (p->pre_handler == aggr_pre_handler) { list_for_each_entry_rcu(kp, &p->list, list) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c1e308a080b8..7f573136a905 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -340,10 +340,7 @@ static const char *usage_str[] = const char * __get_key_name(struct lockdep_subclass_key *key, char *str) { - unsigned long offs, size; - char *modname; - - return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str); + return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } void diff --git a/kernel/module.c b/kernel/module.c index 5ee65994a3bc..bf4dccadf7b8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2098,8 +2098,10 @@ static const char *get_ksymbol(struct module *mod, if (!best) return NULL; - *size = nextval - mod->symtab[best].st_value; - *offset = addr - mod->symtab[best].st_value; + if (size) + *size = nextval - mod->symtab[best].st_value; + if (offset) + *offset = addr - mod->symtab[best].st_value; return mod->strtab + mod->symtab[best].st_name; } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 59df5e8555a8..fe9314a89f20 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -40,11 +40,9 @@ static void print_name_offset(struct seq_file *m, void *sym) { unsigned long addr = (unsigned long)sym; char namebuf[KSYM_NAME_LEN+1]; - unsigned long size, offset; const char *sym_name; - char *modname; - sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + sym_name = kallsyms_lookup(addr, NULL, NULL, NULL, namebuf); if (sym_name) SEQ_printf(m, "%s", sym_name); else diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1bc4882e28e0..946ed45f7d2f 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -258,11 +258,9 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, static void print_name_offset(struct seq_file *m, unsigned long addr) { char namebuf[KSYM_NAME_LEN+1]; - unsigned long size, offset; const char *sym_name; - char *modname; - sym_name = kallsyms_lookup(addr, &size, 
&offset, &modname, namebuf); + sym_name = kallsyms_lookup(addr, NULL, NULL, NULL, namebuf); if (sym_name) seq_printf(m, "%s", sym_name); else -- cgit v1.2.3 From 9d65cb4a1718a072898c7a57a3bc61b2dc4bcd4d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 8 May 2007 00:28:43 -0700 Subject: Fix race between cat /proc/*/wchan and rmmod et al kallsyms_lookup() can go iterating over modules list unprotected which is OK for emergency situations (oops), but not OK for regular stuff like /proc/*/wchan. Introduce lookup_symbol_name()/lookup_module_symbol_name() which copy symbol name into caller-supplied buffer or return -ERANGE. All copying is done with module_mutex held, so... Signed-off-by: Alexey Dobriyan Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 17 +++++++++++++++++ kernel/module.c | 23 +++++++++++++++++++++++ kernel/time/timer_list.c | 11 ++++------- kernel/time/timer_stats.c | 10 ++++------ 4 files changed, 48 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f66da025cb7f..4e2ec191a127 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -269,6 +269,23 @@ const char *kallsyms_lookup(unsigned long addr, return NULL; } +int lookup_symbol_name(unsigned long addr, char *symname) +{ + symname[0] = '\0'; + symname[KSYM_NAME_LEN] = '\0'; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, NULL, NULL); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), symname); + return 0; + } + /* see if it's in a module */ + return lookup_module_symbol_name(addr, symname); +} + /* Look up a kernel symbol and return it in a text buffer. */ int sprint_symbol(char *buffer, unsigned long address) { diff --git a/kernel/module.c b/kernel/module.c index bf4dccadf7b8..3da76ad32d78 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2126,6 +2126,29 @@ const char *module_address_lookup(unsigned long addr, return NULL; } +int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_init, mod->init_size) || + within(addr, mod->module_core, mod->core_size)) { + const char *sym; + + sym = get_ksymbol(mod, addr, NULL, NULL); + if (!sym) + goto out; + strlcpy(symname, sym, KSYM_NAME_LEN + 1); + mutex_unlock(&module_mutex); + return 0; + } + } +out: + mutex_unlock(&module_mutex); + return -ERANGE; +} + int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) { diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index fe9314a89f20..b734ca4bc75e 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -38,15 +38,12 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); static void print_name_offset(struct seq_file *m, void *sym) { - unsigned long addr = (unsigned long)sym; - char namebuf[KSYM_NAME_LEN+1]; - const char *sym_name; + char symname[KSYM_NAME_LEN+1]; - sym_name = kallsyms_lookup(addr, NULL, NULL, NULL, namebuf); - if (sym_name) - SEQ_printf(m, "%s", sym_name); - else + if (lookup_symbol_name((unsigned long)sym, symname) < 0) SEQ_printf(m, "<%p>", sym); + else + SEQ_printf(m, "%s", symname); } static void diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 946ed45f7d2f..868f1bceb07f 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -257,14 +257,12 @@ void 
timer_stats_update_stats(void *timer, pid_t pid, void *startf, static void print_name_offset(struct seq_file *m, unsigned long addr) { - char namebuf[KSYM_NAME_LEN+1]; - const char *sym_name; + char symname[KSYM_NAME_LEN+1]; - sym_name = kallsyms_lookup(addr, NULL, NULL, NULL, namebuf); - if (sym_name) - seq_printf(m, "%s", sym_name); - else + if (lookup_symbol_name(addr, symname) < 0) seq_printf(m, "<%p>", (void *)addr); + else + seq_printf(m, "%s", symname); } static int tstats_show(struct seq_file *m, void *v) -- cgit v1.2.3 From a5c43dae7ae38c2a6b3e9a819bcf45f010bf6a4a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 8 May 2007 00:28:47 -0700 Subject: Fix race between cat /proc/slab_allocators and rmmod Same story as with the cat /proc/*/wchan vs rmmod race, only /proc/slab_allocators wants more info than just the symbol name. Signed-off-by: Alexey Dobriyan Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 19 +++++++++++++++++++ kernel/module.c | 27 +++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 4e2ec191a127..c36581b40c04 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -286,6 +286,25 @@ int lookup_symbol_name(unsigned long addr, char *symname) return lookup_module_symbol_name(addr, symname); } +int lookup_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + name[0] = '\0'; + name[KSYM_NAME_LEN] = '\0'; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, size, offset); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), name); + modname[0] = '\0'; + return 0; + } + /* see if it's in a module */ + return lookup_module_symbol_attrs(addr, size, offset, modname, name); +} + /* Look up a kernel symbol and return it in a text buffer. */ int sprint_symbol(char *buffer, unsigned long address) { diff --git a/kernel/module.c b/kernel/module.c index 3da76ad32d78..d36e45477fac 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2149,6 +2149,33 @@ out: return -ERANGE; } +int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_init, mod->init_size) || + within(addr, mod->module_core, mod->core_size)) { + const char *sym; + + sym = get_ksymbol(mod, addr, size, offset); + if (!sym) + goto out; + if (modname) + strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); + if (name) + strlcpy(name, sym, KSYM_NAME_LEN + 1); + mutex_unlock(&module_mutex); + return 0; + } + } +out: + mutex_unlock(&module_mutex); + return -ERANGE; +} + int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) { -- cgit v1.2.3 From 9730b5b06fee7ffd1f0150855f03cf319c0e004f Mon Sep 17 00:00:00 2001 From: Bert Wesarg Date: Tue, 8 May 2007 00:28:50 -0700 Subject: kernel/params.c: fix lying comment for param_array() This fixes the comment for the function param_array(), which falsely claims that it only *temporarily* mangles the input string @val.
Signed-off-by: Bert Wesarg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 312172320b4c..e61c46c97ce7 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -269,7 +269,7 @@ int param_get_invbool(char *buffer, struct kernel_param *kp) return param_get_bool(buffer, &dummy); } -/* We cheat here and temporarily mangle the string. */ +/* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, unsigned int min, unsigned int max, -- cgit v1.2.3 From 428e6ce023c5890cfecc8ad10335da3f28dbf893 Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Tue, 8 May 2007 00:29:10 -0700 Subject: Lockdep treats down_write_trylock like regular down_write This causes constructions like down_write(&mm1->mmap_sem); if (down_write_trylock(&mm2->mmap_sem)) { ... up_write(&mm2->mmap_sem); } up_write(&mm1->mmap_sem); generate a lockdep warning about circular locking dependence. Call rwsem_acquire() with trylock set to 1. Cc: Ingo Molnar Cc: Arjan van de Ven Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rwsem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 291ded556aa0..9a87886b022e 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -60,7 +60,7 @@ int down_write_trylock(struct rw_semaphore *sem) int ret = __down_write_trylock(sem); if (ret == 1) - rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); return ret; } -- cgit v1.2.3 From e729aa16b168fb202d1a20f936028cb7c2a0278d Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Tue, 8 May 2007 00:29:13 -0700 Subject: Pad irq_desc to internode cacheline size We noticed a drop in n/w performance due to the irq_desc being cacheline aligned rather than internode aligned. We see 50% of expected performance when two e1000 nics local to two different nodes have consecutive irq descriptors allocated, due to false sharing. Note that this patch does away with cacheline padding for the UP case, as it does not seem useful for UP configurations. Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Cc: "Siddha, Suresh B" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index aff1f0fabb0d..515ad40bde15 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -48,7 +48,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) * * Controller mappings for all interrupt sources: */ -struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, .chip = &no_irq_chip, -- cgit v1.2.3 From 039b6b3ed84e45a6f8316358dd2bfdc83d59fc45 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Tue, 8 May 2007 00:29:20 -0700 Subject: audit: add spaces on either side of case "..." operator. Following the programming advice laid down in the gcc manual, make sure the case "..." operator has spaces on either side. According to: http://gcc.gnu.org/onlinedocs/gcc-4.1.2/gcc/Case-Ranges.html#Case-Ranges: "Be careful: Write spaces around the ..., for otherwise it may be parsed wrong when you use it with integer values." 
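As a toy illustration of that warning (hypothetical values, not from the patch):

	int classify(int n)
	{
		switch (n) {
		case 1 ... 5:	/* spaces keep "1", "...", "5" as separate tokens */
			return 0;
		/* "case 6...9:" could be mis-parsed, since "6..." can be
		 * scanned as one (malformed) preprocessing number */
		case 6 ... 9:
			return 1;
		default:
			return -1;
		}
	}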
Signed-off-by: Robert P. J. Day Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 4e9d20829681..d13276d41410 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -515,8 +515,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) err = -EPERM; break; case AUDIT_USER: - case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: + case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; @@ -614,8 +614,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid, sid); break; case AUDIT_USER: - case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: + case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; -- cgit v1.2.3 From 5a0c6a0d1ae97473291f479ef64573d6b2c0e2d5 Mon Sep 17 00:00:00 2001 From: Martin Peschke Date: Tue, 8 May 2007 00:29:25 -0700 Subject: kallsyms: cleanup: use seq_release_private() where appropriate We can save some lines of code by using seq_release_private(). Signed-off-by: Martin Peschke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index c36581b40c04..f1bda23140b2 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -470,18 +470,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return ret; } -static int kallsyms_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - kfree(m->private); - return seq_release(inode, file); -} - static const struct file_operations kallsyms_operations = { .open = kallsyms_open, .read = seq_read, .llseek = seq_lseek, - .release = kallsyms_release, + .release = seq_release_private, }; static int __init kallsyms_init(void) -- cgit v1.2.3 From d3ed782458f315c30ea679b919a2cc59f2b82565 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 May 2007 00:30:03 -0700 Subject: highres/dyntick: prevent xtime lock contention While the !highres/!dyntick code assigns the duty of the do_timer() call to one specific CPU, this was dropped in the highres/dyntick part during development. Steven Rostedt discovered the xtime lock contention on highres/dyntick due to several CPUs trying to update jiffies. Add the single CPU assignement back. In the dyntick case this needs to be handled carefully, as the CPU which has the do_timer() duty must drop the assignement and let it be grabbed by another CPU, which is active. Otherwise the do_timer() calls would not happen during the long sleep. 
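Condensed, the handoff this patch adds boils down to the following sketch (extracted from the hunks that follow, not new code):

	/* when stopping the tick: drop the duty so jiffies cannot go stale */
	if (cpu == tick_do_timer_cpu)
		tick_do_timer_cpu = -1;

	/* in the tick handler: the first active cpu to notice takes over */
	if (unlikely(tick_do_timer_cpu == -1))
		tick_do_timer_cpu = cpu;
	if (tick_do_timer_cpu == cpu)
		tick_do_update_jiffies64(now);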
Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Steven Rostedt Acked-by: Mark Lord Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/tick-common.c | 8 +++++++- kernel/time/tick-internal.h | 1 + kernel/time/tick-sched.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bfda3f7f0716..a96ec9ab3454 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -static int tick_do_timer_cpu = -1; +int tick_do_timer_cpu __read_mostly = -1; DEFINE_SPINLOCK(tick_device_lock); /* @@ -295,6 +295,12 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } + /* Transfer the do_timer job away from this cpu */ + if (*cpup == tick_do_timer_cpu) { + int cpu = first_cpu(cpu_online_map); + + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; + } spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c9d203bde518..bb13f2724905 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -5,6 +5,7 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; extern ktime_t tick_period; +extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 51556b95f60f..f4fc867f467d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -221,6 +221,18 @@ void tick_nohz_stop_sched_tick(void) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; } + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. + */ + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = -1; + /* * calculate the expiry time for the next timer wheel * timer @@ -338,12 +350,24 @@ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); + int cpu = smp_processor_id(); ktime_t now = ktime_get(); dev->next_event.tv64 = KTIME_MAX; + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == -1)) + tick_do_timer_cpu = cpu; + /* Check, if the jiffies need an update */ - tick_do_update_jiffies64(now); + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); /* * When we are idle and the tick is stopped, we have to touch @@ -431,9 +455,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct hrtimer_cpu_base *base = timer->base->cpu_base; struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ + /* + * Check if the do_timer duty was dropped. 
We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == -1)) + tick_do_timer_cpu = cpu; +#endif /* Check, if the jiffies need an update */ - tick_do_update_jiffies64(now); + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); /* * Do not call, when we are not in irq context and have -- cgit v1.2.3 From 9e860d000a90cfc9ca270ddb6e99b177e6aa91cd Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Tue, 8 May 2007 00:30:12 -0700 Subject: lockdep: lookup_chain_cache comment errata Signed-off-by: Jarek Poplawski Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7f573136a905..dc4ea4c2b4c8 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1309,8 +1309,9 @@ out_unlock_set: /* * Look up a dependency chain. If the key is not present yet then - * add it and return 0 - in this case the new dependency chain is - * validated. If the key is already hashed, return 1. + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) */ static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) { -- cgit v1.2.3 From b5e618181a927210f8be1d3d2249d31904ba358d Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Tue, 8 May 2007 00:30:19 -0700 Subject: Introduce a handy list_first_entry macro There are many places in the kernel where constructions like foo = list_entry(head->next, struct foo_struct, list); are used. The code might look more descriptive and neat using the macro list_first_entry(head, type, member) \ list_entry((head)->next, type, member) Here is the macro itself and examples of its usage in the generic code, as sketched below. If it turns out to be useful, I can prepare a set of patches to inject it into arch-specific code, drivers, networking, etc.
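For concreteness, a minimal before/after sketch (the struct and list names are hypothetical; the macro is the one quoted above):

	/* before */
	foo = list_entry(head->next, struct foo_struct, list);
	/* after */
	foo = list_first_entry(head, struct foo_struct, list);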
Signed-off-by: Pavel Emelianov Signed-off-by: Kirill Korotaev Cc: Randy Dunlap Cc: Andi Kleen Cc: Zach Brown Cc: Davide Libenzi Cc: John McCutchan Cc: Thomas Gleixner Cc: Ingo Molnar Cc: john stultz Cc: Ram Pai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 14 +++++++------- kernel/timer.c | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 657f77697415..1de710e18373 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -971,7 +971,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { @@ -986,7 +986,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { @@ -1001,7 +1001,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || tsk->sched_time < t->expires.sched) { @@ -1057,7 +1057,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { @@ -1072,7 +1072,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { @@ -1087,7 +1087,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || sched_time < t->expires.sched) { @@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, */ head = &tsk->signal->cpu_timers[clock_idx]; if (list_empty(head) || - cputime_ge(list_entry(head->next, + cputime_ge(list_first_entry(head, struct cpu_timer_list, entry)->expires.cpu, *newval)) { /* diff --git a/kernel/timer.c b/kernel/timer.c index ba41af2bb6cc..7a6448340f90 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -629,7 +629,7 @@ static inline void __run_timers(tvec_base_t *base) void (*fn)(unsigned long); unsigned long data; - timer = list_entry(head->next,struct timer_list,entry); + timer = list_first_entry(head, struct timer_list,entry); fn = timer->function; data = timer->data; @@ -1248,7 +1248,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) struct timer_list *timer; while (!list_empty(head)) { - timer = 
list_entry(head->next, struct timer_list, entry); + timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); timer_set_base(timer, new_base); internal_add_timer(new_base, timer); -- cgit v1.2.3 From 9926e4c74300c4b31dee007298c6475d33369df0 Mon Sep 17 00:00:00 2001 From: Tom Alsberg Date: Tue, 8 May 2007 00:30:31 -0700 Subject: CPU time limit patch / setrlimit(RLIMIT_CPU, 0) cheat fix As discovered here today, the change in Kernel 2.6.17 intended to inhibit users from setting RLIMIT_CPU to 0 (as that is equivalent to unlimited) by "cheating" and setting it to 1 in such a case, does not make a difference, as the check is done in the wrong place (too late), and only applies to the profiling code. On all systems I checked running kernels above 2.6.17, no matter what the hard and soft CPU time limits were before, a user could escape them by issuing in the shell (sh/bash/zsh) "ulimit -t 0", and then the user's process was not ever killed. Attached is a trivial patch to fix that. Simply moving the check to a slightly earlier location (specifically, before the line that actually assigns the limit - *old_rlim = new_rlim), does the trick. Do note that at least the zsh (but not ash, dash, or bash) shell has the problem of "caching" the limits set by the ulimit command, so when running zsh the fix will not immediately be evident - after entering "ulimit -t 0", "ulimit -a" will show "-t: cpu time (seconds) 0", even though the actual limit as returned by getrlimit(...) will be 1. It can be verified by opening a subshell (which will not have the values of the parent shell in cache) and checking in it, or just by running a CPU intensive command like "echo '65536^1048576' | bc" and verifying that it dumps core after one second. Regardless of whether that is a misfeature in the shell, perhaps it would be better to return -EINVAL from setrlimit in such a case instead of cheating and setting to 1, as that does not really reflect the actual state of the process anymore. I do not however know what the ground for that decision was in the original 2.6.17 change, and whether there would be any "backward" compatibility issues, so I preferred not to touch that right now. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index fe1f3ab20477..926bf9d7ac45 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1923,6 +1923,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if (retval) return retval; + if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim.rlim_cur = 1; + } + task_lock(current->group_leader); *old_rlim = new_rlim; task_unlock(current->group_leader); @@ -1944,15 +1954,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) unsigned long rlim_cur = new_rlim.rlim_cur; cputime_t cputime; - if (rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". 
So let's cheat and make it one second - * instead - */ - rlim_cur = 1; - } cputime = secs_to_cputime(rlim_cur); read_lock(&tasklist_lock); spin_lock_irq(¤t->sighand->siglock); -- cgit v1.2.3 From 35bab756b49bd148760a3bbd3c907cd74754dbb4 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 8 May 2007 00:30:49 -0700 Subject: The scheduled -EINVAL for invalid timevals in setitimer As scheduled, do_setitimer() now returns -EINVAL for invalid timeval. Signed-off-by: Adrian Bunk Acked-by: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/itimer.c | 59 +++------------------------------------------------------ 1 file changed, 3 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index 4523f3396f23..3205e8e114fa 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -137,60 +137,12 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -/* - * We do not care about correctness. We just sanitize the values so - * the ktime_t operations which expect normalized values do not - * break. This converts negative values to long timeouts similar to - * the code in kernel versions < 2.6.16 - * - * Print a limited number of warning messages when an invalid timeval - * is detected. - */ -static void fixup_timeval(struct timeval *tv, int interval) -{ - static int warnlimit = 10; - unsigned long tmp; - - if (warnlimit > 0) { - warnlimit--; - printk(KERN_WARNING - "setitimer: %s (pid = %d) provided " - "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n", - current->comm, current->pid, - interval ? "it_interval" : "it_value", - tv->tv_sec, (long) tv->tv_usec); - } - - tmp = tv->tv_usec; - if (tmp >= USEC_PER_SEC) { - tv->tv_usec = tmp % USEC_PER_SEC; - tv->tv_sec += tmp / USEC_PER_SEC; - } - - tmp = tv->tv_sec; - if (tmp > LONG_MAX) - tv->tv_sec = LONG_MAX; -} - /* * Returns true if the timeval is in canonical form */ #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -/* - * Check for invalid timevals, sanitize them and print a limited - * number of warnings. - */ -static void check_itimerval(struct itimerval *value) { - - if (unlikely(!timeval_valid(&value->it_value))) - fixup_timeval(&value->it_value, 0); - - if (unlikely(!timeval_valid(&value->it_interval))) - fixup_timeval(&value->it_interval, 1); -} - int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; @@ -200,15 +152,10 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) /* * Validate the timevals in value. - * - * Note: Although the spec requires that invalid values shall - * return -EINVAL, we just fixup the value and print a limited - * number of warnings in order not to break users of this - * historical misfeature. - * - * Scheduled for replacement in March 2007 */ - check_itimerval(value); + if (!timeval_valid(&value->it_value) || + !timeval_valid(&value->it_interval)) + return -EINVAL; switch (which) { case ITIMER_REAL: -- cgit v1.2.3 From 4ff773bbde87f7f7dddc0f579ad53e077a6587b9 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Tue, 8 May 2007 00:31:00 -0700 Subject: lockdep: removed unused ip argument in mark_lock & mark_held_locks It looks like a remainder from designing... 
Signed-off-by: Jarek Poplawski Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index dc4ea4c2b4c8..1a5ff2211d88 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1574,7 +1574,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, * Mark a lock with a usage bit, and validate the state transition: */ static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit, unsigned long ip) + enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -1597,14 +1597,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, this->class->usage_mask |= new_mask; -#ifdef CONFIG_TRACE_IRQFLAGS - if (new_bit == LOCK_ENABLED_HARDIRQS || - new_bit == LOCK_ENABLED_HARDIRQS_READ) - ip = curr->hardirq_enable_ip; - else if (new_bit == LOCK_ENABLED_SOFTIRQS || - new_bit == LOCK_ENABLED_SOFTIRQS_READ) - ip = curr->softirq_enable_ip; -#endif if (!save_trace(this->class->usage_traces + new_bit)) return 0; @@ -1803,7 +1795,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) +mark_held_locks(struct task_struct *curr, int hardirq) { enum lock_usage_bit usage_bit; struct held_lock *hlock; @@ -1823,7 +1815,7 @@ mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) else usage_bit = LOCK_ENABLED_SOFTIRQS; } - if (!mark_lock(curr, hlock, usage_bit, ip)) + if (!mark_lock(curr, hlock, usage_bit)) return 0; } @@ -1876,7 +1868,7 @@ void trace_hardirqs_on(void) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, 1, ip)) + if (!mark_held_locks(curr, 1)) return; /* * If we have softirqs enabled, then set the usage @@ -1884,7 +1876,7 @@ void trace_hardirqs_on(void) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, 0, ip)) + if (!mark_held_locks(curr, 0)) return; curr->hardirq_enable_ip = ip; @@ -1952,7 +1944,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, 0, ip); + mark_held_locks(curr, 0); } /* @@ -2090,43 +2082,43 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (read) { if (curr->hardirq_context) if (!mark_lock(curr, hlock, - LOCK_USED_IN_HARDIRQ_READ, ip)) + LOCK_USED_IN_HARDIRQ_READ)) return 0; if (curr->softirq_context) if (!mark_lock(curr, hlock, - LOCK_USED_IN_SOFTIRQ_READ, ip)) + LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { if (curr->hardirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip)) + if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip)) + if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) return 0; } } if (!hardirqs_off) { if (read) { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS_READ, ip)) + LOCK_ENABLED_HARDIRQS_READ)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS_READ, ip)) + LOCK_ENABLED_SOFTIRQS_READ)) return 0; } else { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS, ip)) + LOCK_ENABLED_HARDIRQS)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - 
LOCK_ENABLED_SOFTIRQS, ip)) + LOCK_ENABLED_SOFTIRQS)) return 0; } } #endif /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED, ip)) + if (!mark_lock(curr, hlock, LOCK_USED)) return 0; out_calc_hash: /* -- cgit v1.2.3 From 6f7f02e78a75a09195d963e0392b195bc2d55c5c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 8 May 2007 00:31:43 -0700 Subject: cpusets: allow empty {cpus,mems}_allowed to be set for unpopulated cpuset You currently cannot remove all cpus or mems from cpus_allowed or mems_allowed of a cpuset. We now allow both if there are no attached tasks. Acked-by: Paul Jackson Cc: Christoph Lameter Signed-off-by: Paul Menage Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e6d002fc7392..88b416dfbc72 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -821,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf) return -EACCES; trialcs = *cs; - retval = cpulist_parse(buf, trialcs.cpus_allowed); - if (retval < 0) - return retval; + + /* + * We allow a cpuset's cpus_allowed to be empty; if it has attached + * tasks, we'll catch it later when we validate the change and return + * -ENOSPC. + */ + if (!buf[0] || (buf[0] == '\n' && !buf[1])) { + cpus_clear(trialcs.cpus_allowed); + } else { + retval = cpulist_parse(buf, trialcs.cpus_allowed); + if (retval < 0) + return retval; + } cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); - if (cpus_empty(trialcs.cpus_allowed)) + /* cpus_allowed cannot be empty for a cpuset with attached tasks. */ + if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed)) return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval < 0) @@ -918,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf) return -EACCES; trialcs = *cs; - retval = nodelist_parse(buf, trialcs.mems_allowed); - if (retval < 0) - goto done; + + /* + * We allow a cpuset's mems_allowed to be empty; if it has attached + * tasks, we'll catch it later when we validate the change and return + * -ENOSPC. + */ + if (!buf[0] || (buf[0] == '\n' && !buf[1])) { + nodes_clear(trialcs.mems_allowed); + } else { + retval = nodelist_parse(buf, trialcs.mems_allowed); + if (retval < 0) + goto done; + } nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); oldmem = cs->mems_allowed; if (nodes_equal(oldmem, trialcs.mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } - if (nodes_empty(trialcs.mems_allowed)) { + /* mems_allowed cannot be empty for a cpuset with attached tasks. */ + if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) { retval = -ENOSPC; goto done; } -- cgit v1.2.3 From 6bdb6b620e335d38ced8d5981e8f44778225c1d8 Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Tue, 8 May 2007 00:31:58 -0700 Subject: export hrtimer_forward Other symbols of the hrtimers API are already exported. 
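With the symbol exported, a module can now re-arm a periodic timer from its own callback. A hypothetical sketch (the callback name, the 1 ms interval, and the use of ktime_get() are illustrative assumptions; only hrtimer_forward() itself comes from the patch below):

	static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
	{
		/* push the expiry forward by whole 1 ms intervals past now;
		 * the return value is the number of missed periods */
		hrtimer_forward(timer, ktime_get(), ktime_set(0, 1000000));
		return HRTIMER_RESTART;
	}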
Signed-off-by: Stas Sergeev Acked-by: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1b3033105b40..c9f4f044a8a8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -669,6 +669,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) return orun; } +EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer -- cgit v1.2.3 From bdecea3a9282d529b54954f3f1e59877629baba1 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Tue, 8 May 2007 00:32:48 -0700 Subject: sched: fix idle load balancing in softirqd context Periodic load balancing in recent kernels happens in the softirq. In certain -rt configurations, these softirqs are handled in softirqd context, and hence the check for an idle processor always returned busy (as nr_running > 1). This patch captures the idle information at the tick and passes this info to softirq context through an element 'idle_at_tick' in rq. [kernel@kolivas.org: Fix reverse idle at tick logic] Signed-off-by: Suresh Siddha Acked-by: Ingo Molnar Cc: Thomas Gleixner Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5530ed211f72..ba053d88c8c6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -223,6 +223,7 @@ struct rq { unsigned long raw_weighted_load; #ifdef CONFIG_SMP unsigned long cpu_load[3]; + unsigned char idle_at_tick; #endif unsigned long long nr_switches; @@ -2943,12 +2944,7 @@ static void run_rebalance_domains(struct softirq_action *h) struct rq *this_rq = cpu_rq(this_cpu); unsigned long interval; struct sched_domain *sd; - /* - * We are idle if there are no processes running. This - * is valid even if we are the idle process (SMT). - */ - enum idle_type idle = !this_rq->nr_running ? - SCHED_IDLE : NOT_IDLE; + enum idle_type idle = this_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; /* Earliest time when we have to call run_rebalance_domains again */ unsigned long next_balance = jiffies + 60*HZ; @@ -3218,14 +3214,16 @@ void scheduler_tick(void) unsigned long long now = sched_clock(); struct task_struct *p = current; int cpu = smp_processor_id(); + int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); update_cpu_clock(p, rq, now); - if (p != rq->idle) + if (!idle_at_tick) task_running_tick(rq, p); #ifdef CONFIG_SMP update_load(rq); + rq->idle_at_tick = idle_at_tick; if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #endif -- cgit v1.2.3 From 46cb4b7c88fa5517f64b5bee42939ea3614cddcb Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Tue, 8 May 2007 00:32:51 -0700 Subject: sched: dynticks idle load balancing Fix the process idle load balancing in the presence of dynticks. cpus for which ticks are stopped will sleep till the next event wakes them up. These sleeps can potentially last for long durations, and today no periodic idle load balancing is done during them. This patch nominates an owner among the idle cpus, which does the idle load balancing on behalf of the other idle cpus. And once all the cpus are completely idle, this idle load balancing can be stopped too. Checks added in the fast path are minimized.
Whenever there are busy cpus in the system, there will be an owner(idle cpu) doing the system wide idle load balancing. Open items: 1. Intelligent owner selection (like an idle core in a busy package). 2. Merge with rcu's nohz_cpu_mask? Signed-off-by: Suresh Siddha Acked-by: Ingo Molnar Cc: Thomas Gleixner Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 223 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/time/tick-sched.c | 9 ++ 2 files changed, 219 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ba053d88c8c6..74599286230c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -224,6 +224,9 @@ struct rq { #ifdef CONFIG_SMP unsigned long cpu_load[3]; unsigned char idle_at_tick; +#ifdef CONFIG_NO_HZ + unsigned char in_nohz_recently; +#endif #endif unsigned long long nr_switches; @@ -1050,6 +1053,17 @@ static void resched_task(struct task_struct *p) if (!tsk_is_polling(p)) smp_send_reschedule(cpu); } + +static void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); +} #else static inline void resched_task(struct task_struct *p) { @@ -2658,6 +2672,12 @@ redo: double_rq_unlock(this_rq, busiest); local_irq_restore(flags); + /* + * some other cpu did the load balance for us. + */ + if (nr_moved && this_cpu != smp_processor_id()) + resched_cpu(this_cpu); + /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); @@ -2928,27 +2948,98 @@ static void update_load(struct rq *this_rq) } } +#ifdef CONFIG_NO_HZ +static struct { + atomic_t load_balancer; + cpumask_t cpu_mask; +} nohz ____cacheline_aligned = { + .load_balancer = ATOMIC_INIT(-1), + .cpu_mask = CPU_MASK_NONE, +}; + /* - * run_rebalance_domains is triggered when needed from the scheduler tick. + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle + * load balancing on behalf of all those cpus. If all the cpus in the system + * go into this tickless mode, then there will be no ilb owner (as there is + * no need for one) and all the cpus will sleep till the next wakeup event + * arrives... + * + * For the ilb owner, tick is not stopped. And this tick will be used + * for idle load balancing. ilb owner will still be part of + * nohz.cpu_mask.. + * + * While stopping the tick, this cpu will become the ilb owner if there + * is no other owner. And will be the owner till that cpu becomes busy + * or if all cpus in the system stop their ticks at which point + * there is no need for ilb owner. * + * When the ilb owner becomes busy, it nominates another owner, during the + * next busy scheduler_tick() + */ +int select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); + + if (stop_tick) { + cpu_set(cpu, nohz.cpu_mask); + cpu_rq(cpu)->in_nohz_recently = 1; + + /* + * If we are going offline and still the leader, give up! 
+ */ + if (cpu_is_offline(cpu) && + atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + return 0; + } + + /* time for ilb owner also to sleep */ + if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (atomic_read(&nohz.load_balancer) == cpu) + atomic_set(&nohz.load_balancer, -1); + return 0; + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) + return 1; + } else if (atomic_read(&nohz.load_balancer) == cpu) + return 1; + } else { + if (!cpu_isset(cpu, nohz.cpu_mask)) + return 0; + + cpu_clear(cpu, nohz.cpu_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + } + return 0; +} +#endif + +static DEFINE_SPINLOCK(balancing); + +/* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in arch_init_sched_domains. */ -static DEFINE_SPINLOCK(balancing); - -static void run_rebalance_domains(struct softirq_action *h) +static inline void rebalance_domains(int cpu, enum idle_type idle) { - int this_cpu = smp_processor_id(), balance = 1; - struct rq *this_rq = cpu_rq(this_cpu); + int balance = 1; + struct rq *rq = cpu_rq(cpu); unsigned long interval; struct sched_domain *sd; - enum idle_type idle = this_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; - /* Earliest time when we have to call run_rebalance_domains again */ + /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; - for_each_domain(this_cpu, sd) { + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -2967,7 +3058,7 @@ static void run_rebalance_domains(struct softirq_action *h) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -2991,7 +3082,114 @@ out: if (!balance) break; } - this_rq->next_balance = next_balance; + rq->next_balance = next_balance; +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * In CONFIG_NO_HZ case, the idle load balance owner will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static void run_rebalance_domains(struct softirq_action *h) +{ + int local_cpu = smp_processor_id(); + struct rq *local_rq = cpu_rq(local_cpu); + enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; + + rebalance_domains(local_cpu, idle); + +#ifdef CONFIG_NO_HZ + /* + * If this cpu is the owner for idle load balancing, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ + if (local_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == local_cpu) { + cpumask_t cpus = nohz.cpu_mask; + struct rq *rq; + int balance_cpu; + + cpu_clear(local_cpu, cpus); + for_each_cpu_mask(balance_cpu, cpus) { + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. 
+ */ + if (need_resched()) + break; + + rebalance_domains(balance_cpu, SCHED_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(local_rq->next_balance, rq->next_balance)) + local_rq->next_balance = rq->next_balance; + } + } +#endif +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. + * + * In case of CONFIG_NO_HZ, this is the place where we nominate a new + * idle load balancing owner or decide to stop the periodic load balancing, + * if the whole system is idle. + */ +static inline void trigger_load_balance(int cpu) +{ + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_NO_HZ + /* + * If we were in the nohz mode recently and busy at the current + * scheduler tick, then check if we need to nominate new idle + * load balancer. + */ + if (rq->in_nohz_recently && !rq->idle_at_tick) { + rq->in_nohz_recently = 0; + + if (atomic_read(&nohz.load_balancer) == cpu) { + cpu_clear(cpu, nohz.cpu_mask); + atomic_set(&nohz.load_balancer, -1); + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* + * simple selection for now: Nominate the + * first cpu in the nohz list to be the next + * ilb owner. + * + * TBD: Traverse the sched domains and nominate + * the nearest cpu in the nohz.cpu_mask. + */ + int ilb = first_cpu(nohz.cpu_mask); + + if (ilb != NR_CPUS) + resched_cpu(ilb); + } + } + + /* + * If this cpu is idle and doing idle load balancing for all the + * cpus with ticks stopped, is it time for that to stop? + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && + cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + resched_cpu(cpu); + return; + } + + /* + * If this cpu is idle and the idle load balancing is done by + * someone else, then no need raise the SCHED_SOFTIRQ + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && + cpu_isset(cpu, nohz.cpu_mask)) + return; +#endif + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); } #else /* @@ -3224,8 +3422,7 @@ void scheduler_tick(void) #ifdef CONFIG_SMP update_load(rq); rq->idle_at_tick = idle_at_tick; - if (time_after_eq(jiffies, rq->next_balance)) - raise_softirq(SCHED_SOFTIRQ); + trigger_load_balance(cpu); #endif } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4fc867f467d..3483e6cb9549 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -217,6 +217,14 @@ void tick_nohz_stop_sched_tick(void) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { + if (select_nohz_load_balancer(1)) { + /* + * sched tick not stopped! + */ + cpu_clear(cpu, nohz_cpu_mask); + goto out; + } + ts->idle_tick = ts->sched_timer.expires; ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; @@ -285,6 +293,7 @@ void tick_nohz_restart_sched_tick(void) now = ktime_get(); local_irq_disable(); + select_nohz_load_balancer(0); tick_do_update_jiffies64(now); cpu_clear(cpu, nohz_cpu_mask); -- cgit v1.2.3 From 5517d86bea237c1d7078840182d9ebc0fe4c1afc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 May 2007 00:32:57 -0700 Subject: Speed up divides by cpu_power in scheduler I noticed expensive divides done in try_to_wake_up() and find_busiest_group() on a two-socket dual-core Opteron machine (4 cores total), moderately loaded (15,000 context switches per second). oprofile numbers: CPU: AMD64 processors, speed 2600.05 MHz (estimated) Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 50000 samples % symbol name ...
613914 1.0498 try_to_wake_up 834 0.0013 :ffffffff80227ae1: div %rcx 77513 0.1191 :ffffffff80227ae4: mov %rax,%r11 608893 1.0413 find_busiest_group 1841 0.0031 :ffffffff802260bf: div %rdi 140109 0.2394 :ffffffff802260c2: test %sil,%sil Some of these divides can use the reciprocal divides we introduced some time ago (currently used in slab, AFAIK). We can assume a load will fit in a 32-bit number, because with a SCHED_LOAD_SCALE=128 value there is still a theoretical limit of 33554432. When/if we reach this limit one day, cpus will probably have a fast hardware divide and we can zap the reciprocal divide trick. Ingo suggested renaming cpu_power to __cpu_power to make it clear that it should not be modified without changing its reciprocal value too. I did not convert the divide in cpu_avg_load_per_task(), because tracking nr_running changes may not be worth it. We could use a static table of 32 reciprocal values, but it would add a conditional branch and a table lookup. [akpm@linux-foundation.org: !SMP build fix] Signed-off-by: Eric Dumazet Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 83 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 54 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 74599286230c..e4a5888549a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -52,8 +52,9 @@ #include #include #include -#include +#include +#include #include /* @@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } +#ifdef CONFIG_SMP +/* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) + * Since cpu_power is a 'constant', we can use a reciprocal divide. + */ +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +{ + return reciprocal_divide(load, sg->reciprocal_cpu_power); +} + +/* + * Each time a sched group cpu_power is changed, + * we must compute its reciprocal value + */ +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +{ + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +} +#endif + /* * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] * to time slice values: [800ms ... 100ms ...
5ms] @@ -1256,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); if (local_group) { this_load = avg_load; @@ -2367,12 +2390,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, } total_load += avg_load; - total_pwr += group->cpu_power; + total_pwr += group->__cpu_power; /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); - group_capacity = group->cpu_power / SCHED_LOAD_SCALE; + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { this_load = avg_load; @@ -2483,8 +2507,8 @@ group_next: max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * busiest->cpu_power, - (avg_load - this_load) * this->cpu_power) + *imbalance = min(max_pull * busiest->__cpu_power, + (avg_load - this_load) * this->__cpu_power) / SCHED_LOAD_SCALE; /* @@ -2518,28 +2542,29 @@ small_imbalance: * moving them. */ - pwr_now += busiest->cpu_power * - min(busiest_load_per_task, max_load); - pwr_now += this->cpu_power * - min(this_load_per_task, this_load); + pwr_now += busiest->__cpu_power * + min(busiest_load_per_task, max_load); + pwr_now += this->__cpu_power * + min(this_load_per_task, this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task * SCHED_LOAD_SCALE / - busiest->cpu_power; + tmp = sg_div_cpu_power(busiest, + busiest_load_per_task * SCHED_LOAD_SCALE); if (max_load > tmp) - pwr_move += busiest->cpu_power * + pwr_move += busiest->__cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load * busiest->cpu_power < + if (max_load * busiest->__cpu_power < busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = max_load * busiest->cpu_power / this->cpu_power; + tmp = sg_div_cpu_power(this, + max_load * busiest->__cpu_power); else - tmp = busiest_load_per_task * SCHED_LOAD_SCALE / - this->cpu_power; - pwr_move += this->cpu_power * - min(this_load_per_task, this_load + tmp); + tmp = sg_div_cpu_power(this, + busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += this->__cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -5501,7 +5526,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } - if (!group->cpu_power) { + if (!group->__cpu_power) { printk("\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -5678,7 +5703,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, continue; sg->cpumask = CPU_MASK_NONE; - sg->cpu_power = 0; + sg->__cpu_power = 0; for_each_cpu_mask(j, span) { if (group_fn(j, cpu_map, NULL) != group) @@ -6367,7 +6392,7 @@ next_sg: continue; } - sg->cpu_power += sd->groups->cpu_power; + sg_inc_cpu_power(sg, sd->groups->__cpu_power); } sg = sg->next; if (sg != group_head) @@ -6442,6 +6467,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) child = sd->child; + sd->groups->__cpu_power = 0; + /* * For perf policy, if the groups in child domain share resources * (for example cores sharing some portions of the cache hierarchy @@ -6452,18 +6479,16 @@ static void init_sched_groups_power(int cpu, 
struct sched_domain *sd) if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && (child->flags & (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sd->groups->cpu_power = SCHED_LOAD_SCALE; + sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); return; } - sd->groups->cpu_power = 0; - /* * add cpu_power of each child group to this groups cpu_power */ group = child->groups; do { - sd->groups->cpu_power += group->cpu_power; + sg_inc_cpu_power(sd->groups, group->__cpu_power); group = group->next; } while (group != child->groups); } @@ -6623,7 +6648,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd = &per_cpu(node_domains, j); sd->groups = sg; } - sg->cpu_power = 0; + sg->__cpu_power = 0; sg->cpumask = nodemask; sg->next = sg; cpus_or(covered, covered, nodemask); @@ -6651,7 +6676,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) "Can not alloc domain group for node %d\n", j); goto error; } - sg->cpu_power = 0; + sg->__cpu_power = 0; sg->cpumask = tmp; sg->next = prev->next; cpus_or(covered, covered, tmp); -- cgit v1.2.3 From 4953198b6ce07b008b0f1c2edd41c9d027a118b4 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Tue, 8 May 2007 00:33:01 -0700 Subject: sched: optimize siblings status check logic in wake_idle() When a logical cpu 'x' already has more than one process running, then most likely the siblings of that cpu 'x' must be busy. Otherwise the idle siblings would likely (in most scenarios) have picked up the extra load, making the load on 'x' at most one. Use this logic to eliminate the siblings status check and minimize the cache misses encountered on a heavily loaded system. Signed-off-by: Suresh Siddha Cc: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e4a5888549a5..e60786eb731c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1406,7 +1406,16 @@ static int wake_idle(int cpu, struct task_struct *p) struct sched_domain *sd; int i; - if (idle_cpu(cpu)) + /* + * If it is idle, then it is the best cpu to run this task. + * + * This cpu is also the best, if it has more than one task already. + * Siblings must be also busy(in most cases) as they didn't already + * pickup the extra load from this cpu and hence we need not check + * sibling runqueue info. This will avoid the checks and cache miss + * penalities associated with that. + */ + if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) return cpu; for_each_domain(cpu, sd) { -- cgit v1.2.3 From bd53f96ca54a21c07e7a0ae1886fa623d370b85f Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Tue, 8 May 2007 00:33:06 -0700 Subject: sched: redundant reschedule when set_user_nice() boosts a prio of a task from the "expired" array - Make TASK_PREEMPTS_CURR(task, rq) return "true" only if the task's prio is higher than the current's and the task is in the "active" array.
This ensures we don't make redundant resched_task() calls when the task is in the "expired" array (as may happen now in set_user_nice(), rt_mutex_setprio() and pull_task()); - generalise the conditions for a call to resched_task() in set_user_nice(), rt_mutex_setprio() and sched_setscheduler(). Signed-off-by: Dmitry Adamushko Cc: Con Kolivas Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e60786eb731c..be2706c885a7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -169,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) #define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) + (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active)) #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) @@ -4076,13 +4076,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio) struct prio_array *array; unsigned long flags; struct rq *rq; - int oldprio; + int delta; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - oldprio = p->prio; + delta = prio - p->prio; array = p->array; if (array) dequeue_task(p, array); @@ -4098,13 +4098,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(p, array); /* * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's + * our priority decreased, or if our priority became higher + * than the current's. */ - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) + if (TASK_PREEMPTS_CURR(p, rq) || + (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } task_rq_unlock(rq, &flags); @@ -4152,10 +4150,12 @@ void set_user_nice(struct task_struct *p, long nice) enqueue_task(p, array); inc_raw_weighted_load(rq, p); /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if our priority became higher + * than the current's. */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) + if (TASK_PREEMPTS_CURR(p, rq) || + (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } out_unlock: @@ -4382,13 +4382,11 @@ recheck: __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's + * our priority decreased, or our priority became higher + * than the current's. */ - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) + if (TASK_PREEMPTS_CURR(p, rq) || + (task_running(rq, p) && p->prio > oldprio)) resched_task(rq->curr); } __task_rq_unlock(rq); -- cgit v1.2.3 From c3396620cace20639bdf380f893f4dccad090d91 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Tue, 8 May 2007 00:33:09 -0700 Subject: sched: align rq to cacheline boundary Align the per cpu runqueue to the cacheline boundary. This will minimize the number of cachelines touched during remote wakeup. A minimal sketch of the false sharing this avoids follows below, before the sign-offs.
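For illustration only (this sketch is not part of any patch in this series, and every name in it is invented), here is the false-sharing problem that cacheline alignment of per-CPU data avoids:

/* Sketch: two CPUs updating adjacent per-CPU objects that share one
 * cache line will bounce that line between them on every write.
 * Aligning each object to a cache line, as the runqueue patch above
 * does with ____cacheline_aligned_in_smp, keeps each CPU's data on
 * its own line.
 */
#include <linux/cache.h>
#include <linux/percpu.h>

struct wakeup_stats {
	unsigned long remote_wakeups;
	unsigned long local_wakeups;
} ____cacheline_aligned_in_smp;	/* pad and align to the L1 line size */

static DEFINE_PER_CPU(struct wakeup_stats, wakeup_stats);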
Signed-off-by: Suresh Siddha Acked-by: Ingo Molnar Cc: Ravikiran G Thirumalai Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index be2706c885a7..a3a04085e794 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -304,7 +304,7 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues); +static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; static inline int cpu_of(struct rq *rq) { -- cgit v1.2.3 From 788e770eb2d17212eafaeedde0be56e3444c6b9a Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 8 May 2007 00:33:14 -0700 Subject: rcutorture: Use ARRAY_SIZE macro when appropriate Use ARRAY_SIZE macro already defined in kernel.h Signed-off-by: Ahmed S. Darwish Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 75ca2a740471..7d20d764b847 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -899,7 +899,7 @@ rcu_torture_init(void) /* Set up the freelist. */ INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { rcu_tortures[i].rtort_mbtest = 0; list_add_tail(&rcu_tortures[i].rtort_free, &rcu_torture_freelist); -- cgit v1.2.3 From c8e5b163101c5d716f08e90b686da620baa022f1 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 8 May 2007 00:33:20 -0700 Subject: rcutorture: style cleanup: avoid != NULL in boolean tests Signed-off-by: Josh Triplett Cc: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 7d20d764b847..17cba7a6b224 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -534,7 +534,7 @@ rcu_torture_writer(void *arg) rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); smp_wmb(); - if (old_rp != NULL) { + if (old_rp) { i = old_rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; @@ -685,7 +685,7 @@ rcu_torture_printk(char *page) atomic_read(&rcu_torture_wcount[i])); } cnt += sprintf(&page[cnt], "\n"); - if (cur_ops->stats != NULL) + if (cur_ops->stats) cnt += cur_ops->stats(&page[cnt]); return cnt; } @@ -749,13 +749,13 @@ static void rcu_torture_shuffle_tasks(void) set_cpus_allowed(current, tmp_mask); - if (reader_tasks != NULL) { + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) set_cpus_allowed(reader_tasks[i], tmp_mask); } - if (fakewriter_tasks != NULL) { + if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed(fakewriter_tasks[i], tmp_mask); @@ -808,21 +808,21 @@ rcu_torture_cleanup(void) int i; fullstop = 1; - if (shuffler_task != NULL) { + if (shuffler_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); kthread_stop(shuffler_task); } shuffler_task = NULL; - if (writer_task != NULL) { + if (writer_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); kthread_stop(writer_task); } writer_task = NULL; - if (reader_tasks != NULL) { + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i] != NULL) { + if (reader_tasks[i]) { VERBOSE_PRINTK_STRING( "Stopping rcu_torture_reader task"); kthread_stop(reader_tasks[i]); @@ -834,9 +834,9 @@ rcu_torture_cleanup(void) } rcu_torture_current = NULL; - if (fakewriter_tasks != NULL) { + if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i] != NULL) { + if (fakewriter_tasks[i]) { VERBOSE_PRINTK_STRING( "Stopping rcu_torture_fakewriter task"); kthread_stop(fakewriter_tasks[i]); @@ -847,7 +847,7 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } - if (stats_task != NULL) { + if (stats_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); kthread_stop(stats_task); } @@ -858,7 +858,7 @@ rcu_torture_cleanup(void) rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - if (cur_ops->cleanup != NULL) + if (cur_ops->cleanup) cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) rcu_torture_print_module_parms("End of test: FAILURE"); @@ -875,7 +875,7 @@ rcu_torture_init(void) /* Process args and tell the world that the torturer is on the job. */ - for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { + for (i = 0; cur_ops = torture_ops[i], cur_ops; i++) { cur_ops = torture_ops[i]; if (strcmp(torture_type, cur_ops->name) == 0) { break; @@ -886,7 +886,7 @@ rcu_torture_init(void) torture_type); return (-EINVAL); } - if (cur_ops->init != NULL) + if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! 
*/ if (nreaders >= 0) -- cgit v1.2.3 From ade5fb818fb1861fd5f84619c761920ade762b5d Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 8 May 2007 00:33:22 -0700 Subject: rcutorture: Remove redundant assignment to cur_ops in for loop The for loop in rcutorture_init uses the condition cur_ops = torture_ops[i], cur_ops but then makes the same assignment to cur_ops inside the loop. Remove the redundant assignment inside the loop, and remove now-unnecessary braces. Signed-off-by: Josh Triplett Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 17cba7a6b224..55ba82a85a66 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -502,10 +502,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, - &sched_ops, NULL }; - /* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure @@ -872,16 +868,17 @@ rcu_torture_init(void) int i; int cpu; int firsterr = 0; + static struct rcu_torture_ops *torture_ops[] = + { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, + &srcu_ops, &sched_ops, }; /* Process args and tell the world that the torturer is on the job. */ - - for (i = 0; cur_ops = torture_ops[i], cur_ops; i++) { + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) { + if (strcmp(torture_type, cur_ops->name) == 0) break; - } } - if (cur_ops == NULL) { + if (i == ARRAY_SIZE(torture_ops)) { printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", torture_type); return (-EINVAL); -- cgit v1.2.3 From b0bb501651b467096723dcfcf4565d910a2aadf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 May 2007 00:34:11 -0700 Subject: kprobes: use hlist_for_each_entry Signed-off-by: Christoph Hellwig Cc: Prasanna S Panchamukhi Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index bee29bde6adf..993452324a1f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -134,8 +134,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) struct hlist_node *pos; retry: - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->nused < INSNS_PER_PAGE) { int i; for (i = 0; i < INSNS_PER_PAGE; i++) { @@ -214,9 +213,8 @@ static int __kprobes collect_garbage_slots(void) if (check_safety() != 0) return -EAGAIN; - hlist_for_each_safe(pos, next, &kprobe_insn_pages) { + hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { int i; - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ @@ -235,8 +233,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) struct kprobe_insn_page *kip; struct hlist_node *pos; - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if 
(kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; -- cgit v1.2.3 From 6f716acd5fa20ae6a35ab29ae37fa9189e839ed5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 May 2007 00:34:13 -0700 Subject: kprobes: codingstyle cleanups Remove superfluous braces and fix indentation as well as comments. Signed-off-by: Christoph Hellwig Cc: Prasanna S Panchamukhi Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 55 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 993452324a1f..22857003a65b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -133,7 +133,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) struct kprobe_insn_page *kip; struct hlist_node *pos; - retry: + retry: hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->nused < INSNS_PER_PAGE) { int i; @@ -155,9 +155,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) } /* All out of space. Need to allocate a new page. Use slot 0. */ kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); - if (!kip) { + if (!kip) return NULL; - } /* * Use module_alloc so this page is within +/- 2GB of where the @@ -246,9 +245,9 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) break; } } - if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { + + if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) collect_garbage_slots(); - } } #endif @@ -314,7 +313,6 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, reset_kprobe_instance(); } } - return; } static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, @@ -533,8 +531,8 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, static int __kprobes in_kprobes_functions(unsigned long addr) { - if (addr >= (unsigned long)__kprobes_text_start - && addr < (unsigned long)__kprobes_text_end) + if (addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end) return -EINVAL; return 0; } @@ -561,19 +559,24 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); - if ((!kernel_text_address((unsigned long) p->addr)) || - in_kprobes_functions((unsigned long) p->addr)) + if (!kernel_text_address((unsigned long) p->addr) || + in_kprobes_functions((unsigned long) p->addr)) return -EINVAL; p->mod_refcounted = 0; + + /* + * Check if are we probing a module. + */ + probed_mod = module_text_address((unsigned long) p->addr); + if (probed_mod) { struct module *calling_mod = module_text_address(called_from); - /* We must allow modules to probe themself and - * in this case avoid incrementing the module refcount, - * so as to allow unloading of self probing modules. + /* + * We must allow modules to probe themself and in this case + * avoid incrementing the module refcount, so as to allow + * unloading of self probing modules.
*/ + if (calling_mod && calling_mod != probed_mod) { if (unlikely(!try_module_get(probed_mod))) return -EINVAL; p->mod_refcounted = 1; @@ -591,7 +594,8 @@ static int __kprobes __register_kprobe(struct kprobe *p, goto out; } - if ((ret = arch_prepare_kprobe(p)) != 0) + ret = arch_prepare_kprobe(p); + if (ret) goto out; INIT_HLIST_NODE(&p->hlist); @@ -614,8 +618,7 @@ out: int __kprobes register_kprobe(struct kprobe *p) { - return __register_kprobe(p, - (unsigned long)__builtin_return_address(0)); + return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); } void __kprobes unregister_kprobe(struct kprobe *p) @@ -639,9 +642,9 @@ void __kprobes unregister_kprobe(struct kprobe *p) return; } valid_p: - if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && - (p->list.next == &old_p->list) && - (p->list.prev == &old_p->list))) { + if (old_p == p || + (old_p->pre_handler == aggr_pre_handler && + p->list.next == &old_p->list && p->list.prev == &old_p->list)) { /* Only probe on the hash list */ arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); @@ -654,9 +657,11 @@ valid_p: mutex_unlock(&kprobe_mutex); synchronize_sched(); - if (p->mod_refcounted && - (mod = module_text_address((unsigned long)p->addr))) - module_put(mod); + if (p->mod_refcounted) { + mod = module_text_address((unsigned long)p->addr); + if (mod) + module_put(mod); + } if (cleanup_p) { if (p != old_p) { -- cgit v1.2.3 From 4c4308cb93450989846ac49faeb6dab943e7657e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 May 2007 00:34:14 -0700 Subject: kprobes: kretprobes simplifications - consolidate duplicate code in all arch_prepare_kretprobe instances into common code - replace various odd helpers that use hlist_for_each_entry to get the first element of a list with either a hlist_for_each_entry_safe or an open-coded access to the first element in the caller - inline add_rp_inst into its only remaining caller - use kretprobe_inst_table_head instead of open-coding it Signed-off-by: Christoph Hellwig Cc: Prasanna S Panchamukhi Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 64 +++++++++++++++++++------------------------------------- 1 file changed, 21 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 22857003a65b..f58f171bd65f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -357,46 +357,6 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) return; } -/* Called with kretprobe_lock held */ -struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) -{ - struct hlist_node *node; - struct kretprobe_instance *ri; - hlist_for_each_entry(ri, node, &rp->free_instances, uflist) - return ri; - return NULL; -} - -/* Called with kretprobe_lock held */ -static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe - *rp) -{ - struct hlist_node *node; - struct kretprobe_instance *ri; - hlist_for_each_entry(ri, node, &rp->used_instances, uflist) - return ri; - return NULL; -} - -/* Called with kretprobe_lock held */ -void __kprobes add_rp_inst(struct kretprobe_instance *ri) -{ - /* - * Remove rp inst off the free list - - * Add it back when probed function returns - */ - hlist_del(&ri->uflist); - - /* Add rp inst onto table */ - INIT_HLIST_NODE(&ri->hlist); - hlist_add_head(&ri->hlist, - &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]); - - /* Also add
this rp inst to the used list. */ - INIT_HLIST_NODE(&ri->uflist); - hlist_add_head(&ri->uflist, &ri->rp->used_instances); -} - /* Called with kretprobe_lock held */ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head) @@ -450,7 +410,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; - while ((ri = get_free_rp_inst(rp)) != NULL) { + struct hlist_node *pos, *next; + + hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { hlist_del(&ri->uflist); kfree(ri); } @@ -732,7 +694,21 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, /*TODO: consider to only swap the RA after the last pre_handler fired */ spin_lock_irqsave(&kretprobe_lock, flags); - arch_prepare_kretprobe(rp, regs); + if (!hlist_empty(&rp->free_instances)) { + struct kretprobe_instance *ri; + + ri = hlist_entry(rp->free_instances.first, + struct kretprobe_instance, uflist); + ri->rp = rp; + ri->task = current; + arch_prepare_kretprobe(ri, regs); + + /* XXX(hch): why is there no hlist_move_head? */ + hlist_del(&ri->uflist); + hlist_add_head(&ri->uflist, &ri->rp->used_instances); + hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); + } else + rp->nmissed++; spin_unlock_irqrestore(&kretprobe_lock, flags); return 0; } @@ -795,11 +771,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) { unsigned long flags; struct kretprobe_instance *ri; + struct hlist_node *pos, *next; unregister_kprobe(&rp->kp); + /* No race here */ spin_lock_irqsave(&kretprobe_lock, flags); - while ((ri = get_used_rp_inst(rp)) != NULL) { + hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { ri->rp = NULL; hlist_del(&ri->uflist); } -- cgit v1.2.3 From bf8f6e5b3e51ee0c64c2d1350c70198ddc8ad3f7 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Tue, 8 May 2007 00:34:16 -0700 Subject: Kprobes: The ON/OFF knob thru debugfs This patch provides a debugfs knob to turn kprobes on/off o A new file /debug/kprobes/enabled indicates if kprobes is enabled or not (default enabled) o Echoing 0 to this file will disarm all installed probes o Any new probe registration when disabled will register the probe but not arm it. A message will be printed out in such a case. o When a value 1 is echoed to the file, all probes (including ones registered in the intervening period) will be enabled o Unregistration will happen irrespective of whether probes are globally enabled or not. o Update Documentation/kprobes.txt to reflect these changes. While there also update the doc to make it current. We are also looking at providing sysrq key support to tie to the disabling feature provided by this patch. [akpm@linux-foundation.org: Use bool like a bool!] 
[akpm@linux-foundation.org: add printk facility levels] [cornelia.huck@de.ibm.com: Add the missing arch_trampoline_kprobe() for s390] Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Srinivasa DS Signed-off-by: Cornelia Huck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f58f171bd65f..9e47d8c493f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -43,9 +43,11 @@ #include #include #include + #include #include #include +#include #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) @@ -64,6 +66,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static atomic_t kprobe_count; +/* NOTE: change this value only with kprobe_mutex held */ +static bool kprobe_enabled; + DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; @@ -564,12 +569,13 @@ static int __kprobes __register_kprobe(struct kprobe *p, hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (atomic_add_return(1, &kprobe_count) == \ + if (kprobe_enabled) { + if (atomic_add_return(1, &kprobe_count) == \ (ARCH_INACTIVE_KPROBE_COUNT + 1)) - register_page_fault_notifier(&kprobe_page_fault_nb); - - arch_arm_kprobe(p); + register_page_fault_notifier(&kprobe_page_fault_nb); + arch_arm_kprobe(p); + } out: mutex_unlock(&kprobe_mutex); @@ -607,8 +613,13 @@ valid_p: if (old_p == p || (old_p->pre_handler == aggr_pre_handler && p->list.next == &old_p->list && p->list.prev == &old_p->list)) { - /* Only probe on the hash list */ - arch_disarm_kprobe(p); + /* + * Only probe on the hash list. Disarm only if kprobes are + * enabled - otherwise, the breakpoint would already have + * been removed. We save on flushing icache. 
+ */ + if (kprobe_enabled) + arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); cleanup_p = 1; } else { @@ -797,6 +808,9 @@ static int __init init_kprobes(void) } atomic_set(&kprobe_count, 0); + /* By default, kprobes are enabled */ + kprobe_enabled = true; + err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); @@ -806,7 +820,7 @@ static int __init init_kprobes(void) #ifdef CONFIG_DEBUG_FS static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, - const char *sym, int offset,char *modname) + const char *sym, int offset,char *modname) { char *kprobe_type; @@ -885,9 +899,130 @@ static struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; +static void __kprobes enable_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + mutex_lock(&kprobe_mutex); + + /* If kprobes are already enabled, just return */ + if (kprobe_enabled) + goto already_enabled; + + /* + * Re-register the page fault notifier only if there are any + * active probes at the time of enabling kprobes globally + */ + if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT) + register_page_fault_notifier(&kprobe_page_fault_nb); + + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + arch_arm_kprobe(p); + } + + kprobe_enabled = true; + printk(KERN_INFO "Kprobes globally enabled\n"); + +already_enabled: + mutex_unlock(&kprobe_mutex); + return; +} + +static void __kprobes disable_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + mutex_lock(&kprobe_mutex); + + /* If kprobes are already disabled, just return */ + if (!kprobe_enabled) + goto already_disabled; + + kprobe_enabled = false; + printk(KERN_INFO "Kprobes globally disabled\n"); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) { + if (!arch_trampoline_kprobe(p)) + arch_disarm_kprobe(p); + } + } + + mutex_unlock(&kprobe_mutex); + /* Allow all currently running kprobes to complete */ + synchronize_sched(); + + mutex_lock(&kprobe_mutex); + /* Unconditionally unregister the page_fault notifier */ + unregister_page_fault_notifier(&kprobe_page_fault_nb); + +already_disabled: + mutex_unlock(&kprobe_mutex); + return; +} + +/* + * XXX: The debugfs bool file interface doesn't allow for callbacks + * when the bool state is switched. 
We can reuse that facility when + * available + */ +static ssize_t read_enabled_file_bool(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[3]; + + if (kprobe_enabled) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = 0x00; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t write_enabled_file_bool(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + int buf_size; + + buf_size = min(count, (sizeof(buf)-1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + switch (buf[0]) { + case 'y': + case 'Y': + case '1': + enable_all_kprobes(); + break; + case 'n': + case 'N': + case '0': + disable_all_kprobes(); + break; + } + + return count; +} + +static struct file_operations fops_kp = { + .read = read_enabled_file_bool, + .write = write_enabled_file_bool, +}; + static int __kprobes debugfs_kprobe_init(void) { struct dentry *dir, *file; + unsigned int value = 1; dir = debugfs_create_dir("kprobes", NULL); if (!dir) @@ -900,6 +1035,13 @@ static int __kprobes debugfs_kprobe_init(void) return -ENOMEM; } + file = debugfs_create_file("enabled", 0600, dir, + &value, &fops_kp); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + return 0; } -- cgit v1.2.3 From d85a60d85ea5b7c597508c1510c88e657773d378 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Tue, 8 May 2007 00:35:24 -0700 Subject: Add IRQF_IRQPOLL flag (common code) irqpoll is broken on some architectures that, like IA64, don't use IRQ 0 for the timer interrupt. This patch adds an IRQF_IRQPOLL flag. Each architecture is handled in a separate patch. As I left irq == 0 as a condition, this should not break existing architectures that use timer_irq == 0 and that I didn't address with that patch (because I don't know). This patch: This patch adds an IRQF_IRQPOLL flag that the interrupt registration code could use for the interrupt it wants to use for IRQ polling. Because this must not be the timer interrupt, an additional flag was added instead of re-using the IRQF_TIMER constant. Until all architectures have an IRQF_IRQPOLL interrupt, irq == 0 will stay as an alternative, as it should not break anything. Also, note_interrupt() is called on CPU-specific interrupts to be used as interrupt source for IRQ polling.
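As a hypothetical illustration (not taken from this patch or from the per-architecture follow-ups; the handler, irqaction, and TIMER_IRQ names are all invented), an architecture's timer setup might tag its tick interrupt for irqpoll like this:

/* Hypothetical sketch: marking a timer interrupt as usable by irqpoll.
 * With IRQF_IRQPOLL set on the irqaction, note_interrupt() can treat
 * this handler as the polling source instead of assuming the timer
 * always lives on IRQ 0.
 */
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
	/* ... periodic tick processing ... */
	return IRQ_HANDLED;
}

static struct irqaction timer_irqaction = {
	.handler = timer_interrupt,
	.flags   = IRQF_DISABLED | IRQF_IRQPOLL,
	.name    = "timer",
};

/* registered early during boot, e.g. setup_irq(TIMER_IRQ, &timer_irqaction);
 * where TIMER_IRQ is whatever vector the architecture assigns its tick. */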
Signed-off-by: Bernhard Walle Cc: Russell King Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Paul Mundt Cc: "Luck, Tony" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 2 ++ kernel/irq/spurious.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 515ad40bde15..32e1ab1477d1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -180,6 +180,8 @@ fastcall unsigned int __do_IRQ(unsigned int irq) if (desc->chip->ack) desc->chip->ack(irq); action_ret = handle_IRQ_event(irq, desc->action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); desc->chip->end(irq); return 1; } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 9d8c79b48823..b0d81aae472f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -146,7 +146,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (unlikely(irqfixup)) { /* Don't punish working computers */ - if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { + if ((irqfixup == 2 && ((irq == 0) || + (desc->action->flags & IRQF_IRQPOLL))) || + action_ret == IRQ_NONE) { int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) desc->irqs_unhandled -= ok; -- cgit v1.2.3 From d5f9f942c601fdebe57f7805e4b4fbad9c28ada8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 8 May 2007 20:27:06 -0700 Subject: revert 'sched: redundant reschedule when set_user_nice() boosts a prio of a task from the "expired" array' Revert commit bd53f96ca54a21c07e7a0ae1886fa623d370b85f. Con says: This is no good, sorry. The one I saw originally was with the staircase deadline cpu scheduler in situ and was different. #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) vs. (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active)) This will fail to wake up a runqueue for a task that has been migrated to the expired array of a runqueue which is otherwise idle, which can happen with SMP balancing. Cc: Dmitry Adamushko Cc: Con Kolivas Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a3a04085e794..66bd7ff23f18 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -169,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) #define TASK_PREEMPTS_CURR(p, rq) \ - (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active)) + ((p)->prio < (rq)->curr->prio) #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) @@ -4076,13 +4076,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio) struct prio_array *array; unsigned long flags; struct rq *rq; - int delta; + int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - delta = prio - p->prio; + oldprio = p->prio; array = p->array; if (array) dequeue_task(p, array); @@ -4098,11 +4098,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(p, array); /* * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if our priority became higher - * than the current's.
+ * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's */ - if (TASK_PREEMPTS_CURR(p, rq) || - (delta > 0 && task_running(rq, p))) + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } task_rq_unlock(rq, &flags); @@ -4150,12 +4152,10 @@ void set_user_nice(struct task_struct *p, long nice) enqueue_task(p, array); inc_raw_weighted_load(rq, p); /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if our priority became higher - * than the current's. + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: */ - if (TASK_PREEMPTS_CURR(p, rq) || - (delta > 0 && task_running(rq, p))) + if (delta < 0 || (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } out_unlock: @@ -4382,11 +4382,13 @@ recheck: __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and - * our priority decreased, or our priority became higher - * than the current's. + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's */ - if (TASK_PREEMPTS_CURR(p, rq) || - (task_running(rq, p) && p->prio > oldprio)) + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } __task_rq_unlock(rq); -- cgit v1.2.3 From 3dde6ad8fc3939d345a3768464ecff43c91d511a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 May 2007 07:12:20 +0200 Subject: Fix trivial typos in Kconfig* files Fix several typos in help text in Kconfig* files. Signed-off-by: David Sterba Signed-off-by: Adrian Bunk --- kernel/Kconfig.preempt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0b46a5dff4c0..c64ce9c14207 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -23,7 +23,7 @@ config PREEMPT_VOLUNTARY "explicit preemption points" to the kernel code. These new preemption points have been selected to reduce the maximum latency of rescheduling, providing faster application reactions, - at the cost of slighly lower throughput. + at the cost of slightly lower throughput. This allows reaction to interactive events by allowing a low priority process to voluntarily preempt itself even if it @@ -43,7 +43,7 @@ config PREEMPT even if it is in kernel mode executing a system call and would otherwise not be about to reach a natural preemption point. This allows applications to run more 'smoothly' even when the - system is under load, at the cost of slighly lower throughput + system is under load, at the cost of slightly lower throughput and a slight runtime overhead to kernel code. Select this if you are building a kernel for a desktop or -- cgit v1.2.3 From 02a3e59a088749c08b0293ee1535f5bf48f5926c Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Wed, 9 May 2007 07:26:28 +0200 Subject: Fix minor typos in kernel/module.c Fix minor (comment) typos in kernel/module.c. Signed-off-by: Robert P. J.
Day Signed-off-by: Adrian Bunk --- kernel/module.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d36e45477fac..9bd93de01f4a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -96,9 +96,9 @@ static inline void add_taint_module(struct module *mod, unsigned flag) mod->taints |= flag; } -/* A thread that wants to hold a reference to a module only while it - * is running can call ths to safely exit. - * nfsd and lockd use this. +/* + * A thread that wants to hold a reference to a module only while it + * is running can call this to safely exit. nfsd and lockd use this. */ void __module_put_and_exit(struct module *mod, long code) { @@ -1199,7 +1199,7 @@ static int __unlink_module(void *_mod) return 0; } -/* Free a module, remove from lists, etc (must hold module mutex). */ +/* Free a module, remove from lists, etc (must hold module_mutex). */ static void free_module(struct module *mod) { /* Delete from various lists */ @@ -1246,7 +1246,7 @@ EXPORT_SYMBOL_GPL(__symbol_get); /* * Ensure that an exported symbol [global namespace] does not already exist - * in the Kernel or in some other modules exported symbol table. + * in the kernel or in some other module's exported symbol table. */ static int verify_export_symbols(struct module *mod) { -- cgit v1.2.3 From 940d67f6b95166475ff6e600ef7658e1cd441278 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 8 May 2007 19:23:49 +1000 Subject: [POWERPC] swsusp: Introduce register_nosave_region_late This patch introduces a new register_nosave_region_late function that can be called from initcalls when register_nosave_region can no longer be used because it uses bootmem. Signed-off-by: Johannes Berg Acked-by: Rafael J. Wysocki Signed-off-by: Paul Mackerras --- kernel/power/snapshot.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b7039772b05c..59fb89ba9a4d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -607,7 +607,8 @@ static LIST_HEAD(nosave_regions); */ void __init -register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) +__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, + int use_kmalloc) { struct nosave_region *region; @@ -623,8 +624,13 @@ register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) goto Report; } } - /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); + if (use_kmalloc) { + /* during init, this shouldn't fail */ + region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); + BUG_ON(!region); + } else + /* This allocation cannot fail */ + region = alloc_bootmem_low(sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); -- cgit v1.2.3 From 59c51591a0ac7568824f541f57de967e88adaa07 Mon Sep 17 00:00:00 2001 From: Michael Opdenacker Date: Wed, 9 May 2007 08:57:56 +0200 Subject: Fix occurrences of "the the " Signed-off-by: Michael Opdenacker Signed-off-by: Adrian Bunk --- kernel/relay.c | 2 +- kernel/wait.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 577f251c7e28..d24395e8b6e5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -310,7 +310,7 @@ static struct rchan_callbacks default_channel_callbacks = { /** * wakeup_readers - wake up readers waiting on a channel - * 
@work: work struct that contains the the channel buffer + * @work: work struct that contains the channel buffer * * This is the work function used to defer reader waking. The * reason waking is deferred is that calling directly from write diff --git a/kernel/wait.c b/kernel/wait.c index 59a82f63275d..444ddbfaefc4 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -61,7 +61,7 @@ EXPORT_SYMBOL(remove_wait_queue); * The spin_unlock() itself is semi-permeable and only protects * one way (it only protects stuff inside the critical region and * stops them from bleeding out - it would still allow subsequent - * loads to move into the the critical region). + * loads to move into the critical region). */ void fastcall prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) -- cgit v1.2.3 From f42df9e658be10ca10d0d9b19a0e9d484694f04f Mon Sep 17 00:00:00 2001 From: John Anthony Kazos Jr Date: Wed, 9 May 2007 08:23:08 +0200 Subject: general: convert "kernel" subdirectory to UTF-8 Convert the "kernel" subdirectory of the tree to UTF-8. The only file modified is kernel/sys.c. Signed-off-by: John Anthony Kazos Jr. Signed-off-by: Adrian Bunk --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 926bf9d7ac45..0742c938dfa7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1292,7 +1292,7 @@ asmlinkage long sys_setfsuid(uid_t uid) } /* - * Samma pÃ¥ svenska.. + * Samma på svenska.. */ asmlinkage long sys_setfsgid(gid_t gid) { -- cgit v1.2.3 From d60846c4d16f9518b098b905af2b87cb6bf6dc42 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 9 May 2007 02:33:17 -0700 Subject: swsusp: clean up printk Remove an inexplicable '/'. Cc: "Rafael J. Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b7039772b05c..48383ea72290 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1227,7 +1227,7 @@ asmlinkage int swsusp_save(void) nr_copy_pages = nr_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); + printk("swsusp: critical section: done (%d pages copied)\n", nr_pages); return 0; } -- cgit v1.2.3 From a3d25c275d383975504dc53c25b691df59bd3c48 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 9 May 2007 02:33:18 -0700 Subject: PM: Separate hibernation code from suspend code [ With Johannes Berg ] Separate the hibernation (aka suspend-to-disk) code from the other suspend code. In particular: * Remove the definitions related to hibernation from include/linux/pm.h * Introduce struct hibernation_ops and a new hibernate() function to hibernate the system, defined in include/linux/suspend.h * Separate suspend code in kernel/power/main.c from hibernation-related code in kernel/power/disk.c and kernel/power/user.c (with the help of hibernation_ops) * Switch ACPI (the only user of pm_ops.pm_disk_mode) to hibernation_ops Signed-off-by: Rafael J.
Wysocki Cc: Greg KH Cc: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 195 ++++++++++++++++++++++++++++----------------------- kernel/power/main.c | 42 +++++------ kernel/power/power.h | 7 +- kernel/power/user.c | 13 ++-- kernel/sys.c | 2 +- 5 files changed, 132 insertions(+), 127 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 06331374d862..b5f0543ed84d 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; +enum { + HIBERNATION_INVALID, + HIBERNATION_PLATFORM, + HIBERNATION_TEST, + HIBERNATION_TESTPROC, + HIBERNATION_SHUTDOWN, + HIBERNATION_REBOOT, + /* keep last */ + __HIBERNATION_AFTER_LAST +}; +#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) +#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) + +static int hibernation_mode = HIBERNATION_SHUTDOWN; + +struct hibernation_ops *hibernation_ops; + +/** + * hibernation_set_ops - set the global hibernate operations + * @ops: the hibernation operations to use in subsequent hibernation transitions + */ + +void hibernation_set_ops(struct hibernation_ops *ops) +{ + if (ops && !(ops->prepare && ops->enter && ops->finish)) { + WARN_ON(1); + return; + } + mutex_lock(&pm_mutex); + hibernation_ops = ops; + if (ops) + hibernation_mode = HIBERNATION_PLATFORM; + else if (hibernation_mode == HIBERNATION_PLATFORM) + hibernation_mode = HIBERNATION_SHUTDOWN; + + mutex_unlock(&pm_mutex); +} + + /** * platform_prepare - prepare the machine for hibernation using the * platform driver if so configured and return an error code if it fails */ -static inline int platform_prepare(void) +static int platform_prepare(void) { - int error = 0; + return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? + hibernation_ops->prepare() : 0; +} - switch (pm_disk_mode) { - case PM_DISK_TEST: - case PM_DISK_TESTPROC: - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - break; - default: - if (pm_ops && pm_ops->prepare) - error = pm_ops->prepare(PM_SUSPEND_DISK); - } - return error; +/** + * platform_finish - switch the machine to the normal mode of operation + * using the platform driver (must be called after platform_prepare()) + */ + +static void platform_finish(void) +{ + if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) + hibernation_ops->finish(); } /** - * power_down - Shut machine down for hibernate. + * power_down - Shut the machine down for hibernation. * * Use the platform driver, if configured so; otherwise try * to power off or reboot. 
@@ -61,20 +100,20 @@ static inline int platform_prepare(void) static void power_down(void) { - switch (pm_disk_mode) { - case PM_DISK_TEST: - case PM_DISK_TESTPROC: + switch (hibernation_mode) { + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: break; - case PM_DISK_SHUTDOWN: + case HIBERNATION_SHUTDOWN: kernel_power_off(); break; - case PM_DISK_REBOOT: + case HIBERNATION_REBOOT: kernel_restart(NULL); break; - default: - if (pm_ops && pm_ops->enter) { + case HIBERNATION_PLATFORM: + if (hibernation_ops) { kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - pm_ops->enter(PM_SUSPEND_DISK); + hibernation_ops->enter(); break; } } @@ -87,20 +126,6 @@ static void power_down(void) while(1); } -static inline void platform_finish(void) -{ - switch (pm_disk_mode) { - case PM_DISK_TEST: - case PM_DISK_TESTPROC: - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - break; - default: - if (pm_ops && pm_ops->finish) - pm_ops->finish(PM_SUSPEND_DISK); - } -} - static void unprepare_processes(void) { thaw_processes(); @@ -120,13 +145,10 @@ static int prepare_processes(void) } /** - * pm_suspend_disk - The granpappy of hibernation power management. - * - * If not, then call swsusp to do its thing, then figure out how - * to power down the system. + * hibernate - The granpappy of the built-in hibernation management */ -int pm_suspend_disk(void) +int hibernate(void) { int error; @@ -143,7 +165,8 @@ int pm_suspend_disk(void) if (error) goto Finish; - if (pm_disk_mode == PM_DISK_TESTPROC) { + mutex_lock(&pm_mutex); + if (hibernation_mode == HIBERNATION_TESTPROC) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); goto Thaw; @@ -168,7 +191,7 @@ int pm_suspend_disk(void) if (error) goto Enable_cpus; - if (pm_disk_mode == PM_DISK_TEST) { + if (hibernation_mode == HIBERNATION_TEST) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); goto Enable_cpus; @@ -205,6 +228,7 @@ int pm_suspend_disk(void) device_resume(); resume_console(); Thaw: + mutex_unlock(&pm_mutex); unprepare_processes(); Finish: free_basic_memory_bitmaps(); @@ -220,7 +244,7 @@ int pm_suspend_disk(void) * Called as a late_initcall (so all devices are discovered and * initialized), we call swsusp to see if we have a saved image or not. * If so, we quiesce devices, the restore the saved image. We will - * return above (in pm_suspend_disk() ) if everything goes well. + * return above (in hibernate() ) if everything goes well. * Otherwise, we fail gracefully and return to the normally * scheduled program. * @@ -315,25 +339,26 @@ static int software_resume(void) late_initcall(software_resume); -static const char * const pm_disk_modes[] = { - [PM_DISK_PLATFORM] = "platform", - [PM_DISK_SHUTDOWN] = "shutdown", - [PM_DISK_REBOOT] = "reboot", - [PM_DISK_TEST] = "test", - [PM_DISK_TESTPROC] = "testproc", +static const char * const hibernation_modes[] = { + [HIBERNATION_PLATFORM] = "platform", + [HIBERNATION_SHUTDOWN] = "shutdown", + [HIBERNATION_REBOOT] = "reboot", + [HIBERNATION_TEST] = "test", + [HIBERNATION_TESTPROC] = "testproc", }; /** - * disk - Control suspend-to-disk mode + * disk - Control hibernation mode * * Suspend-to-disk can be handled in several ways. We have a few options * for putting the system to sleep - using the platform driver (e.g. ACPI - * or other pm_ops), powering off the system or rebooting the system - * (for testing) as well as the two test modes. + * or other hibernation_ops), powering off the system or rebooting the + * system (for testing) as well as the two test modes. 
* * The system can support 'platform', and that is known a priori (and - * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot' - * as alternatives, as well as the test modes 'test' and 'testproc'. + * encoded by the presence of hibernation_ops). However, the user may + * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the + * test modes, 'test' or 'testproc'. * * show() will display what the mode is currently set to. * store() will accept one of @@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = { * 'testproc' * * It will only change to 'platform' if the system - * supports it (as determined from pm_ops->pm_disk_mode). + * supports it (as determined by having hibernation_ops). */ static ssize_t disk_show(struct kset *kset, char *buf) @@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf) int i; char *start = buf; - for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { - if (!pm_disk_modes[i]) + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (!hibernation_modes[i]) continue; switch (i) { - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - case PM_DISK_TEST: - case PM_DISK_TESTPROC: + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: break; - default: - if (pm_ops && pm_ops->enter && - (i == pm_ops->pm_disk_mode)) + case HIBERNATION_PLATFORM: + if (hibernation_ops) break; /* not a valid mode, continue with loop */ continue; } - if (i == pm_disk_mode) - buf += sprintf(buf, "[%s]", pm_disk_modes[i]); + if (i == hibernation_mode) + buf += sprintf(buf, "[%s] ", hibernation_modes[i]); else - buf += sprintf(buf, "%s", pm_disk_modes[i]); - if (i+1 != PM_DISK_MAX) - buf += sprintf(buf, " "); + buf += sprintf(buf, "%s ", hibernation_modes[i]); } buf += sprintf(buf, "\n"); return buf-start; @@ -387,39 +409,38 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) int i; int len; char *p; - suspend_disk_method_t mode = 0; + int mode = HIBERNATION_INVALID; p = memchr(buf, '\n', n); len = p ? p - buf : n; mutex_lock(&pm_mutex); - for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { - if (!strncmp(buf, pm_disk_modes[i], len)) { + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (!strncmp(buf, hibernation_modes[i], len)) { mode = i; break; } } - if (mode) { + if (mode != HIBERNATION_INVALID) { switch (mode) { - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - case PM_DISK_TEST: - case PM_DISK_TESTPROC: - pm_disk_mode = mode; + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: + hibernation_mode = mode; break; - default: - if (pm_ops && pm_ops->enter && - (mode == pm_ops->pm_disk_mode)) - pm_disk_mode = mode; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + hibernation_mode = mode; else error = -EINVAL; } - } else { + } else error = -EINVAL; - } - pr_debug("PM: suspend-to-disk mode set to '%s'\n", - pm_disk_modes[mode]); + if (!error) + pr_debug("PM: suspend-to-disk mode set to '%s'\n", + hibernation_modes[mode]); mutex_unlock(&pm_mutex); return error ? error : n; } diff --git a/kernel/power/main.c b/kernel/power/main.c index f6dda685e7e2..40d56a31245e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -30,7 +30,6 @@ DEFINE_MUTEX(pm_mutex); struct pm_ops *pm_ops; -suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; /** * pm_set_ops - Set the global power method table. 
@@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops) { mutex_lock(&pm_mutex); pm_ops = ops; - if (ops && ops->pm_disk_mode != PM_DISK_INVALID) { - pm_disk_mode = ops->pm_disk_mode; - } else - pm_disk_mode = PM_DISK_SHUTDOWN; mutex_unlock(&pm_mutex); } @@ -184,24 +179,12 @@ static void suspend_finish(suspend_state_t state) static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", - [PM_SUSPEND_DISK] = "disk", }; static inline int valid_state(suspend_state_t state) { - /* Suspend-to-disk does not really need low-level support. - * It can work with shutdown/reboot if needed. If it isn't - * configured, then it cannot be supported. - */ - if (state == PM_SUSPEND_DISK) -#ifdef CONFIG_SOFTWARE_SUSPEND - return 1; -#else - return 0; -#endif - - /* all other states need lowlevel support and need to be - * valid to the lowlevel implementation, no valid callback + /* All states need lowlevel support and need to be valid + * to the lowlevel implementation, no valid callback * implies that none are valid. */ if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state)) return 0; @@ -229,11 +212,6 @@ static int enter_state(suspend_state_t state) if (!mutex_trylock(&pm_mutex)) return -EBUSY; - if (state == PM_SUSPEND_DISK) { - error = pm_suspend_disk(); - goto Unlock; - } - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); if ((error = suspend_prepare(state))) goto Unlock; @@ -251,7 +229,7 @@ static int enter_state(suspend_state_t state) /** * pm_suspend - Externally visible function for suspending system. - * @state: Enumarted value of state to enter. + * @state: Enumerated value of state to enter. * * Determine whether or not value is within range, get state * structure, and enter (above). @@ -289,7 +267,13 @@ static ssize_t state_show(struct kset *kset, char *buf) if (pm_states[i] && valid_state(i)) s += sprintf(s,"%s ", pm_states[i]); } - s += sprintf(s,"\n"); +#ifdef CONFIG_SOFTWARE_SUSPEND + s += sprintf(s, "%s\n", "disk"); +#else + if (s != buf) + /* convert the last space to a newline */ + *(s-1) = '\n'; +#endif return (s - buf); } @@ -304,6 +288,12 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) p = memchr(buf, '\n', n); len = p ? p - buf : n; + /* First, check if we are requested to hibernate */ + if (!strncmp(buf, "disk", len)) { + error = hibernate(); + return error ? 
error : n; + } + for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { if (*s && !strncmp(buf, *s, len)) break; diff --git a/kernel/power/power.h b/kernel/power/power.h index 34b43542785a..51381487103f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -25,12 +25,7 @@ struct swsusp_info { */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) -extern int pm_suspend_disk(void); -#else -static inline int pm_suspend_disk(void) -{ - return -EPERM; -} +extern struct hibernation_ops *hibernation_ops; #endif extern int pfn_is_nosave(unsigned long); diff --git a/kernel/power/user.c b/kernel/power/user.c index 040560d9c312..24d7d78e6f42 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -130,16 +130,16 @@ static inline int platform_prepare(void) { int error = 0; - if (pm_ops && pm_ops->prepare) - error = pm_ops->prepare(PM_SUSPEND_DISK); + if (hibernation_ops) + error = hibernation_ops->prepare(); return error; } static inline void platform_finish(void) { - if (pm_ops && pm_ops->finish) - pm_ops->finish(PM_SUSPEND_DISK); + if (hibernation_ops) + hibernation_ops->finish(); } static inline int snapshot_suspend(int platform_suspend) @@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, switch (arg) { case PMOPS_PREPARE: - if (pm_ops && pm_ops->enter) { + if (hibernation_ops) { data->platform_suspend = 1; error = 0; } else { @@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case PMOPS_ENTER: if (data->platform_suspend) { kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = pm_ops->enter(PM_SUSPEND_DISK); - error = 0; + error = hibernation_ops->enter(); } break; diff --git a/kernel/sys.c b/kernel/sys.c index 926bf9d7ac45..8ecfe3473779 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -881,7 +881,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { - int ret = pm_suspend(PM_SUSPEND_DISK); + int ret = hibernate(); unlock_kernel(); return ret; } -- cgit v1.2.3 From cb0c78cc94a63871cad051674516b38e4a8aef95 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Wed, 9 May 2007 02:33:22 -0700 Subject: Fix Linuxdoc comment A linuxdoc comment had fallen out of date - it refers to an argument which no longer exists. Signed-off-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 32e1ab1477d1..e391cbb1f566 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -22,7 +22,6 @@ * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number * @desc: description of the interrupt - * @regs: pointer to a register structure * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -- cgit v1.2.3 From 85badbdf5120d246ce2bb3f1a7689a805f9c9006 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 9 May 2007 02:33:33 -0700 Subject: use simple_read_from_buffer in kernel/ Cleanup using simple_read_from_buffer() for /dev/cpuset/tasks and /proc/config.gz. 
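For context, the helper folds the usual open-coded sequence (bounds-check *ppos against the buffer size, clamp the count, copy_to_user(), advance *ppos) into a single call. A minimal sketch of a read method built on it follows; the file and buffer names are hypothetical and only the helper and its signature come from the kernel:

#include <linux/fs.h>

static const char example_data[] = "hello\n";   /* hypothetical buffer */

static ssize_t example_read(struct file *file, char __user *buf,
                            size_t len, loff_t *ppos)
{
        /* Returns 0 at EOF, -EFAULT on a bad user pointer, else the
         * number of bytes copied, with *ppos advanced accordingly. */
        return simple_read_from_buffer(buf, len, ppos, example_data,
                                       sizeof(example_data) - 1);
}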
Cc: Paul Jackson Cc: Randy Dunlap Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs.c | 15 +++------------ kernel/cpuset.c | 7 +------ 2 files changed, 4 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index 8fa1fb28f8a7..e84d3f9c6c7b 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -61,18 +61,9 @@ static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) { - loff_t pos = *offset; - ssize_t count; - - if (pos >= kernel_config_data_size) - return 0; - - count = min(len, (size_t)(kernel_config_data_size - pos)); - if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count)) - return -EFAULT; - - *offset += count; - return count; + return simple_read_from_buffer(buf, len, offset, + kernel_config_data + MAGIC_SIZE, + kernel_config_data_size); } static const struct file_operations ikconfig_file_ops = { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 88b416dfbc72..f57854b08922 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1772,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf, { struct ctr_struct *ctr = file->private_data; - if (*ppos + nbytes > ctr->bufsz) - nbytes = ctr->bufsz - *ppos; - if (copy_to_user(buf, ctr->buf + *ppos, nbytes)) - return -EFAULT; - *ppos += nbytes; - return nbytes; + return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); } static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) -- cgit v1.2.3 From 55c0d1f83e481dd6c77f52f7dcfeb043b8b740fa Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 9 May 2007 02:33:37 -0700 Subject: Move sig_kernel_* et al macros to linux/signal.h This patch moves the sig_kernel_* and related macros from kernel/signal.c to linux/signal.h, and cleans them up slightly. I need the sig_kernel_* macros for default signal behavior in the utrace code, and want to avoid duplication or overhead to share the knowledge. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 119 -------------------------------------------------------- 1 file changed, 119 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1368e67c8482..4c8f49eadf7d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -38,125 +38,6 @@ static struct kmem_cache *sigqueue_cachep; -/* - * In POSIX a signal is sent either to a specific thread (Linux task) - * or to the process as a whole (Linux thread group). How the signal - * is sent determines whether it's to one thread or the whole group, - * which determines which signal mask(s) are involved in blocking it - * from being delivered until later. When the signal is delivered, - * either it's caught or ignored by a user handler or it has a default - * effect that applies to the whole thread group (POSIX process). - * - * The possible effects an unblocked signal set to SIG_DFL can have are: - * ignore - Nothing Happens - * terminate - kill the process, i.e. all threads in the group, - * similar to exit_group. The group leader (only) reports - * WIFSIGNALED status to its parent. - * coredump - write a core dump file describing all threads using - * the same mm and then kill all those threads - * stop - stop all the threads in the group, i.e. TASK_STOPPED state - * - * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. - * Other signals when not blocked and set to SIG_DFL behaves as follows. 
- * The job control signals also have other special effects. - * - * +--------------------+------------------+ - * | POSIX signal | default action | - * +--------------------+------------------+ - * | SIGHUP | terminate | - * | SIGINT | terminate | - * | SIGQUIT | coredump | - * | SIGILL | coredump | - * | SIGTRAP | coredump | - * | SIGABRT/SIGIOT | coredump | - * | SIGBUS | coredump | - * | SIGFPE | coredump | - * | SIGKILL | terminate(+) | - * | SIGUSR1 | terminate | - * | SIGSEGV | coredump | - * | SIGUSR2 | terminate | - * | SIGPIPE | terminate | - * | SIGALRM | terminate | - * | SIGTERM | terminate | - * | SIGCHLD | ignore | - * | SIGCONT | ignore(*) | - * | SIGSTOP | stop(*)(+) | - * | SIGTSTP | stop(*) | - * | SIGTTIN | stop(*) | - * | SIGTTOU | stop(*) | - * | SIGURG | ignore | - * | SIGXCPU | coredump | - * | SIGXFSZ | coredump | - * | SIGVTALRM | terminate | - * | SIGPROF | terminate | - * | SIGPOLL/SIGIO | terminate | - * | SIGSYS/SIGUNUSED | coredump | - * | SIGSTKFLT | terminate | - * | SIGWINCH | ignore | - * | SIGPWR | terminate | - * | SIGRTMIN-SIGRTMAX | terminate | - * +--------------------+------------------+ - * | non-POSIX signal | default action | - * +--------------------+------------------+ - * | SIGEMT | coredump | - * +--------------------+------------------+ - * - * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". - * (*) Special job control effects: - * When SIGCONT is sent, it resumes the process (all threads in the group) - * from TASK_STOPPED state and also clears any pending/queued stop signals - * (any of those marked with "stop(*)"). This happens regardless of blocking, - * catching, or ignoring SIGCONT. When any stop signal is sent, it clears - * any pending/queued SIGCONT signals; this happens regardless of blocking, - * catching, or ignored the stop signal, though (except for SIGSTOP) the - * default action of stopping the process may happen later or never. 
- */ - -#ifdef SIGEMT -#define M_SIGEMT M(SIGEMT) -#else -#define M_SIGEMT 0 -#endif - -#if SIGRTMIN > BITS_PER_LONG -#define M(sig) (1ULL << ((sig)-1)) -#else -#define M(sig) (1UL << ((sig)-1)) -#endif -#define T(sig, mask) (M(sig) & (mask)) - -#define SIG_KERNEL_ONLY_MASK (\ - M(SIGKILL) | M(SIGSTOP) ) - -#define SIG_KERNEL_STOP_MASK (\ - M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) ) - -#define SIG_KERNEL_COREDUMP_MASK (\ - M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \ - M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \ - M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT ) - -#define SIG_KERNEL_IGNORE_MASK (\ - M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) ) - -#define sig_kernel_only(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK)) -#define sig_kernel_coredump(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK)) -#define sig_kernel_ignore(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK)) -#define sig_kernel_stop(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) - -#define sig_needs_tasklist(sig) ((sig) == SIGCONT) - -#define sig_user_defined(t, signr) \ - (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ - ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) - -#define sig_fatal(t, signr) \ - (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ - (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) static int sig_ignored(struct task_struct *t, int sig) { -- cgit v1.2.3 From 35c35d1afa8f9afe01d922ff9668a14c5d879b02 Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Wed, 9 May 2007 02:33:40 -0700 Subject: clocksource: spelling error in watchdog code There's more that need fixing, and fix my own subject spelling error too. Signed-off-by: Daniel Walker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index fe5c7db24247..db0c725de5ea 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -75,14 +75,14 @@ static struct timer_list watchdog_timer; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; /* - * Interval: 0.5sec Treshold: 0.0625s + * Interval: 0.5sec Threshold: 0.0625s */ #define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) +#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) static void clocksource_ratewd(struct clocksource *cs, int64_t delta) { - if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) + if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) return; printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", -- cgit v1.2.3 From 9b04bd27564cfd7224e0135ba37df778f1d490bf Mon Sep 17 00:00:00 2001 From: David Miller Date: Wed, 9 May 2007 02:33:43 -0700 Subject: Fix printk format warnings in timer_list.c u64 and s64 are not necessarily 'long long' on some 64-bit platforms, so explicit the type to kill the compiler warnings. Also consistently use '%Lu' which is unsigned. Signed-off-by: David S. 
Miller Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timer_list.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index b734ca4bc75e..8bbcfb77f7d2 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -65,7 +65,7 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); #endif SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", + SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n", (unsigned long long)ktime_to_ns(timer->expires), (unsigned long long)(ktime_to_ns(timer->expires) - now)); } @@ -111,14 +111,14 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { SEQ_printf(m, " .index: %d\n", base->index); - SEQ_printf(m, " .resolution: %Ld nsecs\n", + SEQ_printf(m, " .resolution: %Lu nsecs\n", (unsigned long long)ktime_to_ns(base->resolution)); SEQ_printf(m, " .get_time: "); print_name_offset(m, base->get_time); SEQ_printf(m, "\n"); #ifdef CONFIG_HIGH_RES_TIMERS - SEQ_printf(m, " .offset: %Ld nsecs\n", - ktime_to_ns(base->offset)); + SEQ_printf(m, " .offset: %Lu nsecs\n", + (unsigned long long) ktime_to_ns(base->offset)); #endif SEQ_printf(m, "active timers:\n"); print_active_timers(m, base, now); @@ -135,10 +135,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) print_base(m, cpu_base->clock_base + i, now); } #define P(x) \ - SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(cpu_base->x)) #define P_ns(x) \ - SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ - (u64)(ktime_to_ns(cpu_base->x))) + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(cpu_base->x))) #ifdef CONFIG_HIGH_RES_TIMERS P_ns(expires_next); @@ -150,10 +151,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #ifdef CONFIG_TICK_ONESHOT # define P(x) \ - SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(ts->x)) # define P_ns(x) \ - SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ - (u64)(ktime_to_ns(ts->x))) + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(ts->x))) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); @@ -167,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(last_jiffies); P(next_jiffies); P_ns(idle_expires); - SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); + SEQ_printf(m, "jiffies: %Lu\n", + (unsigned long long)jiffies); } #endif -- cgit v1.2.3 From e18f3ffb9c3ddfc1b4ad8f38f5f2acae8c16f0c9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 9 May 2007 02:33:50 -0700 Subject: schedule_on_each_cpu(): use preempt_disable() We take workqueue_mutex in there to keep CPU hotplug away. But preempt_disable() will suffice for that. 
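The reasoning: CPU removal in this era goes through stop_machine(), which cannot run while any CPU sits in a preempt-disabled section, so disabling preemption is enough to pin cpu_online_map for the duration of the walk. A sketch of the resulting pattern (abbreviated from the schedule_on_each_cpu() hunk in the diff below; the set_bit/INIT_WORK steps are omitted):

        preempt_disable();      /* stop_machine() cannot run, so no CPU
                                 * in this walk can go away under us */
        for_each_online_cpu(cpu)
                __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
                             per_cpu_ptr(works, cpu));
        preempt_enable();       /* hotplug may proceed again */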
Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b6fa5e63085d..1ea4bcb86974 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -638,7 +638,7 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - mutex_lock(&workqueue_mutex); + preempt_disable(); /* CPU hotplug */ for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); @@ -646,7 +646,7 @@ int schedule_on_each_cpu(work_func_t func) set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); } - mutex_unlock(&workqueue_mutex); + preempt_enable(); flush_workqueue(keventd_wq); free_percpu(works); return 0; -- cgit v1.2.3 From fc2e4d70410546307344821eed6fd23803a45286 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:33:51 -0700 Subject: reimplement flush_workqueue() Remove ->remove_sequence, ->insert_sequence, and ->work_done from struct cpu_workqueue_struct. To implement flush_workqueue() we can queue a barrier work on each CPU and wait for its completition. The barrier is queued under workqueue_mutex to ensure that per cpu wq->cpu_wq is alive, we drop this mutex before going to sleep. If CPU goes down while we are waiting for completition, take_over_work() will move the barrier on another CPU, and the handler will wake up us eventually. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 70 ++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1ea4bcb86974..b7bb37ab03bc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -36,23 +36,13 @@ /* * The per-CPU workqueue (if single thread, we always use the first * possible cpu). - * - * The sequence counters are for flush_scheduled_work(). It wants to wait - * until all currently-scheduled works are completed, but it doesn't - * want to be livelocked by new, incoming ones. So it waits until - * remove_sequence is >= the insert_sequence which pertained when - * flush_scheduled_work() was called. 
*/ struct cpu_workqueue_struct { spinlock_t lock; - long remove_sequence; /* Least-recently added (next to run) */ - long insert_sequence; /* Next to add */ - struct list_head worklist; wait_queue_head_t more_work; - wait_queue_head_t work_done; struct workqueue_struct *wq; struct task_struct *thread; @@ -138,8 +128,6 @@ static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work f(work); spin_lock_irqsave(&cwq->lock, flags); - cwq->remove_sequence++; - wake_up(&cwq->work_done); ret = 1; } spin_unlock_irqrestore(&cwq->lock, flags); @@ -187,7 +175,6 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, spin_lock_irqsave(&cwq->lock, flags); set_wq_data(work, cwq); list_add_tail(&work->entry, &cwq->worklist); - cwq->insert_sequence++; wake_up(&cwq->more_work); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -338,8 +325,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) } spin_lock_irqsave(&cwq->lock, flags); - cwq->remove_sequence++; - wake_up(&cwq->work_done); } cwq->run_depth--; spin_unlock_irqrestore(&cwq->lock, flags); @@ -394,6 +379,25 @@ static int worker_thread(void *__cwq) return 0; } +struct wq_barrier { + struct work_struct work; + struct completion done; +}; + +static void wq_barrier_func(struct work_struct *work) +{ + struct wq_barrier *barr = container_of(work, struct wq_barrier, work); + complete(&barr->done); +} + +static inline void init_wq_barrier(struct wq_barrier *barr) +{ + INIT_WORK(&barr->work, wq_barrier_func); + __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); + + init_completion(&barr->done); +} + static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) { if (cwq->thread == current) { @@ -401,23 +405,18 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * Probably keventd trying to flush its own queue. So simply run * it by hand rather than deadlocking. */ + mutex_unlock(&workqueue_mutex); run_workqueue(cwq); + mutex_lock(&workqueue_mutex); } else { - DEFINE_WAIT(wait); - long sequence_needed; + struct wq_barrier barr; - spin_lock_irq(&cwq->lock); - sequence_needed = cwq->insert_sequence; + init_wq_barrier(&barr); + __queue_work(cwq, &barr.work); - while (sequence_needed - cwq->remove_sequence > 0) { - prepare_to_wait(&cwq->work_done, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&cwq->lock); - schedule(); - spin_lock_irq(&cwq->lock); - } - finish_wait(&cwq->work_done, &wait); - spin_unlock_irq(&cwq->lock); + mutex_unlock(&workqueue_mutex); + wait_for_completion(&barr.done); + mutex_lock(&workqueue_mutex); } } @@ -428,29 +427,25 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * Forces execution of the workqueue and blocks until its completion. * This is typically used in driver shutdown handlers. * - * This function will sample each workqueue's current insert_sequence number and - * will sleep until the head sequence is greater than or equal to that. This - * means that we sleep until all works which were queued on entry have been - * handled, but we are not livelocked by new incoming ones. + * We sleep until all works which were queued on entry have been handled, + * but we are not livelocked by new incoming ones. * * This function used to run the workqueues itself. Now we just wait for the * helper threads to do it. */ void fastcall flush_workqueue(struct workqueue_struct *wq) { - might_sleep(); - + mutex_lock(&workqueue_mutex); if (is_single_threaded(wq)) { /* Always use first cpu's area. 
*/ flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); } else { int cpu; - mutex_lock(&workqueue_mutex); for_each_online_cpu(cpu) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); - mutex_unlock(&workqueue_mutex); } + mutex_unlock(&workqueue_mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -463,12 +458,9 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, spin_lock_init(&cwq->lock); cwq->wq = wq; cwq->thread = NULL; - cwq->insert_sequence = 0; - cwq->remove_sequence = 0; cwq->freezeable = freezeable; INIT_LIST_HEAD(&cwq->worklist); init_waitqueue_head(&cwq->more_work); - init_waitqueue_head(&cwq->work_done); if (is_single_threaded(wq)) p = kthread_create(worker_thread, cwq, "%s", wq->name); -- cgit v1.2.3 From b89deed32ccc96098bd6bc953c64bba6b847774f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:33:52 -0700 Subject: implement flush_work() A basic problem with flush_scheduled_work() is that it blocks behind _all_ presently-queued works, rather than just the work whcih the caller wants to flush. If the caller holds some lock, and if one of the queued work happens to want that lock as well then accidental deadlocks can occur. One example of this is the phy layer: it wants to flush work while holding rtnl_lock(). But if a linkwatch event happens to be queued, the phy code will deadlock because the linkwatch callback function takes rtnl_lock. So we implement a new function which will flush a *single* work - just the one which the caller wants to free up. Thus we avoid the accidental deadlocks which can arise from unrelated subsystems' callbacks taking shared locks. flush_work() non-blockingly dequeues the work_struct which we want to kill, then it waits for its handler to complete on all CPUs. Add ->current_work to the "struct cpu_workqueue_struct", it points to currently running "struct work_struct". When flush_work(work) detects ->current_work == work, it inserts a barrier at the _head_ of ->worklist (and thus right _after_ that work) and waits for completition. This means that the next work fired on that CPU will be this barrier, or another barrier queued by concurrent flush_work(), so the caller of flush_work() will be woken before any "regular" work has a chance to run. When wait_on_work() unlocks workqueue_mutex (or whatever we choose to protect against CPU hotplug), CPU may go away. But in that case take_over_work() will move a barrier we queued to another CPU, it will be fired sometime, and wait_on_work() will be woken. Actually, we are doing cleanup_workqueue_thread()->kthread_stop() before take_over_work(), so cwq->thread should complete its ->worklist (and thus the barrier), because currently we don't check kthread_should_stop() in run_workqueue(). But even if we did, everything should be ok. 
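A caller-side sketch of the new primitive (the two-argument signature is the one introduced by this patch; the driver names are hypothetical):

static struct workqueue_struct *my_wq;  /* hypothetical */
static struct work_struct my_work;      /* hypothetical */

static void my_teardown(void)
{
        /* Wait for my_work alone: it is dequeued if still pending,
         * and if its handler is running we block until it finishes.
         * Unlike flush_scheduled_work(), no unrelated queued work is
         * waited on, so a lock held by the caller can only deadlock
         * if my_work's own handler takes that same lock. */
        flush_work(my_wq, &my_work);
}

The caller must still guarantee that the work is not requeued concurrently, as the kernel-doc in the diff below spells out.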
[akpm@osdl.org: cleanup] [akpm@osdl.org: add flush_work_keventd() wrapper] Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b7bb37ab03bc..918d55267a12 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -46,6 +46,7 @@ struct cpu_workqueue_struct { struct workqueue_struct *wq; struct task_struct *thread; + struct work_struct *current_work; int run_depth; /* Detect run_workqueue() recursion depth */ @@ -120,6 +121,7 @@ static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work && work_pending(work) && !list_empty(&work->entry)) { work_func_t f = work->func; + cwq->current_work = work; list_del_init(&work->entry); spin_unlock_irqrestore(&cwq->lock, flags); @@ -128,6 +130,7 @@ static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work f(work); spin_lock_irqsave(&cwq->lock, flags); + cwq->current_work = NULL; ret = 1; } spin_unlock_irqrestore(&cwq->lock, flags); @@ -166,6 +169,17 @@ int fastcall run_scheduled_work(struct work_struct *work) } EXPORT_SYMBOL(run_scheduled_work); +static void insert_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work, int tail) +{ + set_wq_data(work, cwq); + if (tail) + list_add_tail(&work->entry, &cwq->worklist); + else + list_add(&work->entry, &cwq->worklist); + wake_up(&cwq->more_work); +} + /* Preempt must be disabled. */ static void __queue_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) @@ -173,9 +187,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, unsigned long flags; spin_lock_irqsave(&cwq->lock, flags); - set_wq_data(work, cwq); - list_add_tail(&work->entry, &cwq->worklist); - wake_up(&cwq->more_work); + insert_work(cwq, work, 1); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -305,6 +317,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) struct work_struct, entry); work_func_t f = work->func; + cwq->current_work = work; list_del_init(cwq->worklist.next); spin_unlock_irqrestore(&cwq->lock, flags); @@ -325,6 +338,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) } spin_lock_irqsave(&cwq->lock, flags); + cwq->current_work = NULL; } cwq->run_depth--; spin_unlock_irqrestore(&cwq->lock, flags); @@ -449,6 +463,75 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(flush_workqueue); +static void wait_on_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work) +{ + struct wq_barrier barr; + int running = 0; + + spin_lock_irq(&cwq->lock); + if (unlikely(cwq->current_work == work)) { + init_wq_barrier(&barr); + insert_work(cwq, &barr.work, 0); + running = 1; + } + spin_unlock_irq(&cwq->lock); + + if (unlikely(running)) { + mutex_unlock(&workqueue_mutex); + wait_for_completion(&barr.done); + mutex_lock(&workqueue_mutex); + } +} + +/** + * flush_work - block until a work_struct's callback has terminated + * @wq: the workqueue on which the work is queued + * @work: the work which is to be flushed + * + * flush_work() will attempt to cancel the work if it is queued. If the work's + * callback appears to be running, flush_work() will block until it has + * completed. + * + * flush_work() is designed to be used when the caller is tearing down data + * structures which the callback function operates upon. 
It is expected that, + * prior to calling flush_work(), the caller has arranged for the work to not + * be requeued. + */ +void flush_work(struct workqueue_struct *wq, struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + + mutex_lock(&workqueue_mutex); + cwq = get_wq_data(work); + /* Was it ever queued ? */ + if (!cwq) + goto out; + + /* + * This work can't be re-queued, and the lock above protects us + * from take_over_work(), no need to re-check that get_wq_data() + * is still the same when we take cwq->lock. + */ + spin_lock_irq(&cwq->lock); + list_del_init(&work->entry); + work_release(work); + spin_unlock_irq(&cwq->lock); + + if (is_single_threaded(wq)) { + /* Always use first cpu's area. */ + wait_on_work(per_cpu_ptr(wq->cpu_wq, singlethread_cpu), work); + } else { + int cpu; + + for_each_online_cpu(cpu) + wait_on_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + } +out: + mutex_unlock(&workqueue_mutex); +} +EXPORT_SYMBOL_GPL(flush_work); + static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, int cpu, int freezeable) { @@ -650,6 +733,12 @@ void flush_scheduled_work(void) } EXPORT_SYMBOL(flush_scheduled_work); +void flush_work_keventd(struct work_struct *work) +{ + flush_work(keventd_wq, work); +} +EXPORT_SYMBOL(flush_work_keventd); + /** * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work. * @wq: the controlling workqueue structure -- cgit v1.2.3 From edab2516a6c1752e8e5e3d55727cabf12346e5df Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 9 May 2007 02:33:53 -0700 Subject: flush_workqueue(): use preempt_disable to hold off cpu hotplug Cc: Oleg Nesterov Cc: Srivatsa Vaddagiri Cc: Gautham Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 918d55267a12..5176d51bcc2a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -419,18 +419,22 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * Probably keventd trying to flush its own queue. So simply run * it by hand rather than deadlocking. */ - mutex_unlock(&workqueue_mutex); + preempt_enable(); + /* + * We can still touch *cwq here because we are keventd, and + * hot-unplug will be waiting us to exit. + */ run_workqueue(cwq); - mutex_lock(&workqueue_mutex); + preempt_disable(); } else { struct wq_barrier barr; init_wq_barrier(&barr); __queue_work(cwq, &barr.work); - mutex_unlock(&workqueue_mutex); + preempt_enable(); /* Can no longer touch *cwq */ wait_for_completion(&barr.done); - mutex_lock(&workqueue_mutex); + preempt_disable(); } } @@ -449,7 +453,7 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ void fastcall flush_workqueue(struct workqueue_struct *wq) { - mutex_lock(&workqueue_mutex); + preempt_disable(); /* CPU hotplug */ if (is_single_threaded(wq)) { /* Always use first cpu's area. 
*/ flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); @@ -459,7 +463,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) for_each_online_cpu(cpu) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } - mutex_unlock(&workqueue_mutex); + preempt_enable(); } EXPORT_SYMBOL_GPL(flush_workqueue); -- cgit v1.2.3 From 83c22520c51bf67529367e8237f95c03fe44e2da Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:33:54 -0700 Subject: flush_cpu_workqueue: don't flush an empty ->worklist Now when we have ->current_work we can avoid adding a barrier and waiting for its completition when cwq's queue is empty. Note: this change is also useful if we change flush_workqueue() to also check the dead CPUs. Signed-off-by: Oleg Nesterov Cc: Srivatsa Vaddagiri Cc: Gautham Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5176d51bcc2a..5ecf4984e382 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -404,12 +404,15 @@ static void wq_barrier_func(struct work_struct *work) complete(&barr->done); } -static inline void init_wq_barrier(struct wq_barrier *barr) +static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, + struct wq_barrier *barr, int tail) { INIT_WORK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); init_completion(&barr->done); + + insert_work(cwq, &barr->work, tail); } static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) @@ -428,13 +431,20 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) preempt_disable(); } else { struct wq_barrier barr; + int active = 0; - init_wq_barrier(&barr); - __queue_work(cwq, &barr.work); + spin_lock_irq(&cwq->lock); + if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { + insert_wq_barrier(cwq, &barr, 1); + active = 1; + } + spin_unlock_irq(&cwq->lock); - preempt_enable(); /* Can no longer touch *cwq */ - wait_for_completion(&barr.done); - preempt_disable(); + if (active) { + preempt_enable(); + wait_for_completion(&barr.done); + preempt_disable(); + } } } @@ -475,8 +485,7 @@ static void wait_on_work(struct cpu_workqueue_struct *cwq, spin_lock_irq(&cwq->lock); if (unlikely(cwq->current_work == work)) { - init_wq_barrier(&barr); - insert_work(cwq, &barr.work, 0); + insert_wq_barrier(cwq, &barr, 0); running = 1; } spin_unlock_irq(&cwq->lock); -- cgit v1.2.3 From 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 9 May 2007 02:34:01 -0700 Subject: relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Signed-off-by: Tom Zanussi Cc: Satyam Sharma Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 577f251c7e28..e804589c863c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = { /** * wakeup_readers - wake up readers waiting on a channel - * @work: work struct that contains the the channel buffer + * @data: contains the the channel buffer * - * This is the work function used to defer reader waking. 
The - * reason waking is deferred is that calling directly from write - * causes problems if you're writing from say the scheduler. + * This is the timer function used to defer reader waking. */ -static void wakeup_readers(struct work_struct *work) +static void wakeup_readers(unsigned long data) { - struct rchan_buf *buf = - container_of(work, struct rchan_buf, wake_readers.work); + struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); } @@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) if (init) { init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); - INIT_DELAYED_WORK(&buf->wake_readers, NULL); - } else { - cancel_delayed_work(&buf->wake_readers); - flush_scheduled_work(); - } + setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + } else + del_timer_sync(&buf->timer); buf->subbufs_produced = 0; buf->subbufs_consumed = 0; @@ -447,8 +442,7 @@ end: static void relay_close_buf(struct rchan_buf *buf) { buf->finalized = 1; - cancel_delayed_work(&buf->wake_readers); - flush_scheduled_work(); + del_timer_sync(&buf->timer); kref_put(&buf->kref, relay_remove_buf); } @@ -608,11 +602,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) buf->dentry->d_inode->i_size += buf->chan->subbuf_size - buf->padding[old_subbuf]; smp_mb(); - if (waitqueue_active(&buf->read_wait)) { - PREPARE_DELAYED_WORK(&buf->wake_readers, - wakeup_readers); - schedule_delayed_work(&buf->wake_readers, 1); - } + if (waitqueue_active(&buf->read_wait)) + /* + * Calling wake_up_interruptible() from here + * will deadlock if we happen to be logging + * from the scheduler (trying to re-grab + * rq->lock), so defer it. + */ + __mod_timer(&buf->timer, jiffies + 1); } old = buf->data; -- cgit v1.2.3 From 6f7cc11aa6c7d5002e16096c7590944daece70ed Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 9 May 2007 02:34:02 -0700 Subject: Extend notifier_call_chain to count nr_calls made Since 2.6.18-something, the community has been bugged by the problem to provide a clean and a stable mechanism to postpone a cpu-hotplug event as lock_cpu_hotplug was badly broken. This is another proposal towards solving that problem. This one is along the lines of the solution provided in kernel/workqueue.c Instead of having a global mechanism like lock_cpu_hotplug, we allow the subsytems to define their own per-subsystem hot cpu mutexes. These would be taken(released) where ever we are currently calling lock_cpu_hotplug(unlock_cpu_hotplug). Also, in the per-subsystem hotcpu callback function,we take this mutex before we handle any pre-cpu-hotplug events and release it once we finish handling the post-cpu-hotplug events. A standard means for doing this has been provided in [PATCH 2/4] and demonstrated in [PATCH 3/4]. The ordering of these per-subsystem mutexes might still prove to be a problem, but hopefully lockdep should help us get out of that muddle. The patch set to be applied against linux-2.6.19-rc5 is as follows: [PATCH 1/4] : Extend notifier_call_chain with an option to specify the number of notifications to be sent and also count the number of notifications actually sent. [PATCH 2/4] : Define events CPU_LOCK_ACQUIRE and CPU_LOCK_RELEASE and send out notifications for these in _cpu_up and _cpu_down. This would help us standardise the acquire and release of the subsystem locks in the hotcpu callback functions of these subsystems. [PATCH 3/4] : Eliminate lock_cpu_hotplug from kernel/sched.c. 
[PATCH 4/4] : In workqueue_cpu_callback function, acquire(release) the workqueue_mutex while handling CPU_LOCK_ACQUIRE(CPU_LOCK_RELEASE). If the per-subsystem-locking approach survives the test of time, we can expect a slow phasing out of lock_cpu_hotplug, which has not yet been eliminated in these patches :) This patch: Provide notifier_call_chain with an option to call only a specified number of notifiers and also record the number of call to notifiers made. The need for this enhancement was identified in the post entitled "Slab - Eliminate lock_cpu_hotplug from slab" (http://lkml.org/lkml/2006/10/28/92) by Ravikiran G Thirumalai and Andrew Morton. This patch adds two additional parameters to notifier_call_chain API namely - int nr_to_calls : Number of notifier_functions to be called. The don't care value is -1. - unsigned int *nr_calls : Records the total number of notifier_funtions called by notifier_call_chain. The don't care value is NULL. [michal.k.k.piotrowski@gmail.com: build fix] Credit: Andrew Morton Signed-off-by: Gautham R Shenoy Signed-off-by: Michal Piotrowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 8ecfe3473779..d4985df21b60 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -134,19 +134,39 @@ static int notifier_chain_unregister(struct notifier_block **nl, return -ENOENT; } +/** + * notifier_call_chain - Informs the registered notifiers about an event. + * @nl: Pointer to head of the blocking notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: Number of notifier functions to be called. Don't care + * value of this parameter is -1. + * @nr_calls: Records the number of notifications sent. Don't care + * value of this field is NULL. + * @returns: notifier_call_chain returns the value returned by the + * last notifier function called. + */ + static int __kprobes notifier_call_chain(struct notifier_block **nl, - unsigned long val, void *v) + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; nb = rcu_dereference(*nl); - while (nb) { + + while (nb && nr_to_call) { next_nb = rcu_dereference(nb->next); ret = nb->notifier_call(nb, val, v); + + if (nr_calls) + (*nr_calls)++; + if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) break; nb = next_nb; + nr_to_call--; } return ret; } @@ -205,10 +225,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); /** - * atomic_notifier_call_chain - Call functions in an atomic notifier chain + * __atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See the comment for notifier_call_chain. + * @nr_calls: See the comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. @@ -222,19 +244,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * of the last notifier function called. 
*/ -int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) +int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret; rcu_read_lock(); - ret = notifier_call_chain(&nh->head, val, v); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); +EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); + +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) +{ + return __atomic_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. @@ -304,10 +334,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); /** - * blocking_notifier_call_chain - Call functions in a blocking notifier chain + * __blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -320,8 +352,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); * of the last notifier function called. */ -int blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v) +int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; @@ -332,12 +365,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh, */ if (rcu_dereference(nh->head)) { down_read(&nh->rwsem); - ret = notifier_call_chain(&nh->head, val, v); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, + nr_calls); up_read(&nh->rwsem); } return ret; } +EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v) +{ + return __blocking_notifier_call_chain(nh, val, v, -1, NULL); +} EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* @@ -383,10 +423,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); /** - * raw_notifier_call_chain - Call functions in a raw notifier chain + * __raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. @@ -400,10 +442,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); * of the last notifier function called. 
*/ +int __raw_notifier_call_chain(struct raw_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); +} + +EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); + int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { - return notifier_call_chain(&nh->head, val, v); + return __raw_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); @@ -478,10 +529,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** - * srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -494,18 +547,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); * of the last notifier function called. */ -int srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v) +int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); - ret = notifier_call_chain(&nh->head, val, v); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); srcu_read_unlock(&nh->srcu, idx); return ret; } +EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) +{ + return __srcu_notifier_call_chain(nh, val, v, -1, NULL); +} EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** -- cgit v1.2.3 From baaca49f415b25fdbe2a8f3c22b39929e450fbfd Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 9 May 2007 02:34:03 -0700 Subject: Define and use new events,CPU_LOCK_ACQUIRE and CPU_LOCK_RELEASE This is an attempt to provide an alternate mechanism for postponing a hotplug event instead of using a global mechanism like lock_cpu_hotplug. The proposal is to add two new events namely CPU_LOCK_ACQUIRE and CPU_LOCK_RELEASE. The notification for these two events would be sent out before and after a cpu_hotplug event respectively. During the CPU_LOCK_ACQUIRE event, a cpu-hotplug-aware subsystem is supposed to acquire any per-subsystem hotcpu mutex ( Eg. workqueue_mutex in kernel/workqueue.c ). During the CPU_LOCK_RELEASE release event the cpu-hotplug-aware subsystem is supposed to release the per-subsystem hotcpu mutex. The reasons for defining new events as opposed to reusing the existing events like CPU_UP_PREPARE/CPU_UP_FAILED/CPU_ONLINE for locking/unlocking of per-subsystem hotcpu mutexes are as follow: - CPU_LOCK_ACQUIRE: All hotcpu mutexes are taken before subsystems start handling pre-hotplug events like CPU_UP_PREPARE/CPU_DOWN_PREPARE etc, thus ensuring a clean handling of these events. - CPU_LOCK_RELEASE: The hotcpu mutexes will be released only after all subsystems have handled post-hotplug events like CPU_DOWN_FAILED, CPU_DEAD,CPU_ONLINE etc thereby ensuring that there are no subsequent clashes amongst the interdependent subsystems after a cpu hotplugs. 
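For concreteness, the emission side then has the following shape (a simplified sketch of the _cpu_down() hunk in the diff further below, not new code):

        raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
        err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
                                        hcpu, -1, &nr_calls);
        /* ... take the CPU down, then notify CPU_DEAD or
         * CPU_DOWN_FAILED, all while every per-subsystem hotcpu
         * mutex is still held ... */
        raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);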
This patch also uses __raw_notifier_call chain in _cpu_up to take care of the dependency between the two consequetive calls to raw_notifier_call_chain. [akpm@linux-foundation.org: fix a bug] Signed-off-by: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 36e70845cfc3..48810498b355 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -132,12 +132,15 @@ static int _cpu_down(unsigned int cpu) if (!cpu_online(cpu)) return -EINVAL; + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, + (void *)(long)cpu); err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, (void *)(long)cpu); if (err == NOTIFY_BAD) { printk("%s: attempt to take down CPU %u failed\n", __FUNCTION__, cpu); - return -EINVAL; + err = -EINVAL; + goto out_release; } /* Ensure that we are not runnable on dying cpu */ @@ -185,6 +188,9 @@ out_thread: err = kthread_stop(p); out_allowed: set_cpus_allowed(current, old_allowed); +out_release: + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, + (void *)(long)cpu); return err; } @@ -206,13 +212,15 @@ int cpu_down(unsigned int cpu) /* Requires cpu_add_remove_lock to be held */ static int __cpuinit _cpu_up(unsigned int cpu) { - int ret; + int ret, nr_calls = 0; void *hcpu = (void *)(long)cpu; if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; - ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); + ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu, + -1, &nr_calls); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); @@ -233,8 +241,9 @@ static int __cpuinit _cpu_up(unsigned int cpu) out_notify: if (ret != 0) - raw_notifier_call_chain(&cpu_chain, - CPU_UP_CANCELED, hcpu); + __raw_notifier_call_chain(&cpu_chain, + CPU_UP_CANCELED, hcpu, nr_calls, NULL); + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); return ret; } -- cgit v1.2.3 From 5be9361cdff17fc76fa0c3e262ead94158555f16 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 9 May 2007 02:34:04 -0700 Subject: Eliminate lock_cpu_hotplug in kernel/schedc Eliminate lock_cpu_hotplug from kernel/sched.c and use sched_hotcpu_mutex instead to postpone a hotplug event. In the migration_call hotcpu callback function, take sched_hotcpu_mutex while handling the event CPU_LOCK_ACQUIRE and release it while handling CPU_LOCK_RELEASE event. 
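On the subscriber side, the conversion gives the hotcpu callback the following shape (a sketch condensed from the migration_call() hunks in the diff below):

static DEFINE_MUTEX(sched_hotcpu_mutex);

static int migration_call(struct notifier_block *nfb,
                          unsigned long action, void *hcpu)
{
        switch (action) {
        case CPU_LOCK_ACQUIRE:
                mutex_lock(&sched_hotcpu_mutex);
                break;
        /* ... CPU_UP_PREPARE, CPU_ONLINE, CPU_DEAD etc. are all
         * handled here with the mutex already held ... */
        case CPU_LOCK_RELEASE:
                mutex_unlock(&sched_hotcpu_mutex);
                break;
        }
        return NOTIFY_OK;
}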
[akpm@linux-foundation.org: fix deadlock] Signed-off-by: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 66bd7ff23f18..fe1a9c2b855a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -305,6 +305,7 @@ struct rq { }; static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_MUTEX(sched_hotcpu_mutex); static inline int cpu_of(struct rq *rq) { @@ -4520,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) struct task_struct *p; int retval; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) { read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return -ESRCH; } @@ -4553,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) out_unlock: put_task_struct(p); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return retval; } @@ -4610,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) struct task_struct *p; int retval; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); read_lock(&tasklist_lock); retval = -ESRCH; @@ -4626,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) out_unlock: read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); if (retval) return retval; @@ -5388,6 +5389,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) struct rq *rq; switch (action) { + case CPU_LOCK_ACQUIRE: + mutex_lock(&sched_hotcpu_mutex); + break; + case CPU_UP_PREPARE: p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) @@ -5433,7 +5438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(rq->nr_running != 0); /* No need to migrate the tasks: it was best-effort if - * they didn't do lock_cpu_hotplug(). Just wake up + * they didn't take sched_hotcpu_mutex. Just wake up * the requestors. 
*/ spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { @@ -5447,6 +5452,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_unlock_irq(&rq->lock); break; #endif + case CPU_LOCK_RELEASE: + mutex_unlock(&sched_hotcpu_mutex); + break; } return NOTIFY_OK; } @@ -6822,10 +6830,10 @@ int arch_reinit_sched_domains(void) { int err; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); detach_destroy_domains(&cpu_online_map); err = arch_init_sched_domains(&cpu_online_map); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return err; } @@ -6930,12 +6938,12 @@ void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); -- cgit v1.2.3 From e7407dcc69e077ac34a527842db916abfbc458df Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 9 May 2007 02:34:04 -0700 Subject: call cpu_chain with CPU_DOWN_FAILED if CPU_DOWN_PREPARE failed This makes cpu hotplug symmetrical: if CPU_UP_PREPARE fails we get CPU_UP_CANCELED, so we can undo what ever happened on PREPARE. The same should happen for CPU_DOWN_PREPARE. [akpm@linux-foundation.org: fix for reduce-size-of-task_struct-on-64-bit-machines] Cc: Srivatsa Vaddagiri Cc: Gautham Shenoy Signed-off-by: Heiko Carstens Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 48810498b355..1a823944e972 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -97,7 +97,7 @@ static inline void check_for_tasks(int cpu) (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ - (state = %ld, flags = %lx) \n", + (state = %ld, flags = %x) \n", p->comm, p->pid, cpu, p->state, p->flags); } write_unlock_irq(&tasklist_lock); @@ -122,9 +122,10 @@ static int take_cpu_down(void *unused) /* Requires cpu_add_remove_lock to be held */ static int _cpu_down(unsigned int cpu) { - int err; + int err, nr_calls = 0; struct task_struct *p; cpumask_t old_allowed, tmp; + void *hcpu = (void *)(long)cpu; if (num_online_cpus() == 1) return -EBUSY; @@ -132,11 +133,12 @@ static int _cpu_down(unsigned int cpu) if (!cpu_online(cpu)) return -EINVAL; - raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, - (void *)(long)cpu); - err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, - (void *)(long)cpu); + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); + err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { + __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, hcpu, + nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", __FUNCTION__, cpu); err = -EINVAL; @@ -156,7 +158,7 @@ static int _cpu_down(unsigned int cpu) if (IS_ERR(p) || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. 
*/ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, - (void *)(long)cpu) == NOTIFY_BAD) + hcpu) == NOTIFY_BAD) BUG(); if (IS_ERR(p)) { @@ -178,8 +180,7 @@ static int _cpu_down(unsigned int cpu) put_cpu(); /* CPU is completely dead: tell everyone. Too late to complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, - (void *)(long)cpu) == NOTIFY_BAD) + if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, hcpu) == NOTIFY_BAD) BUG(); check_for_tasks(cpu); -- cgit v1.2.3 From 319c2a986eb45989690c955d9667b814ef0ed56f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:06 -0700 Subject: workqueue: fix freezeable workqueues implementation Currently ->freezeable is per-cpu, this is wrong. CPU_UP_PREPARE creates cwq->thread which is not freezeable. Move ->freezeable to workqueue_struct. Signed-off-by: Oleg Nesterov Cc: Srivatsa Vaddagiri Cc: "Pallipadi, Venkatesh" Cc: Gautham shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ecf4984e382..d80dbdceadb8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -49,8 +49,6 @@ struct cpu_workqueue_struct { struct work_struct *current_work; int run_depth; /* Detect run_workqueue() recursion depth */ - - int freezeable; /* Freeze the thread during suspend */ } ____cacheline_aligned; /* @@ -61,6 +59,7 @@ struct workqueue_struct { struct cpu_workqueue_struct *cpu_wq; const char *name; struct list_head list; /* Empty if single thread */ + int freezeable; /* Freeze threads during suspend */ }; /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove @@ -351,7 +350,7 @@ static int worker_thread(void *__cwq) struct k_sigaction sa; sigset_t blocked; - if (!cwq->freezeable) + if (!cwq->wq->freezeable) current->flags |= PF_NOFREEZE; set_user_nice(current, -5); @@ -375,7 +374,7 @@ static int worker_thread(void *__cwq) set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { - if (cwq->freezeable) + if (cwq->wq->freezeable) try_to_freeze(); add_wait_queue(&cwq->more_work, &wait); @@ -546,7 +545,7 @@ out: EXPORT_SYMBOL_GPL(flush_work); static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, - int cpu, int freezeable) + int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); struct task_struct *p; @@ -554,7 +553,6 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, spin_lock_init(&cwq->lock); cwq->wq = wq; cwq->thread = NULL; - cwq->freezeable = freezeable; INIT_LIST_HEAD(&cwq->worklist); init_waitqueue_head(&cwq->more_work); @@ -586,10 +584,12 @@ struct workqueue_struct *__create_workqueue(const char *name, } wq->name = name; + wq->freezeable = freezeable; + mutex_lock(&workqueue_mutex); if (singlethread) { INIT_LIST_HEAD(&wq->list); - p = create_workqueue_thread(wq, singlethread_cpu, freezeable); + p = create_workqueue_thread(wq, singlethread_cpu); if (!p) destroy = 1; else @@ -597,7 +597,7 @@ struct workqueue_struct *__create_workqueue(const char *name, } else { list_add(&wq->list, &workqueues); for_each_online_cpu(cpu) { - p = create_workqueue_thread(wq, cpu, freezeable); + p = create_workqueue_thread(wq, cpu); if (p) { kthread_bind(p, cpu); wake_up_process(p); @@ -854,7 +854,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, mutex_lock(&workqueue_mutex); /* Create a new workqueue thread for it. 
*/ list_for_each_entry(wq, &workqueues, list) { - if (!create_workqueue_thread(wq, hotcpu, 0)) { + if (!create_workqueue_thread(wq, hotcpu)) { printk("workqueue for %i failed\n", hotcpu); return NOTIFY_BAD; } -- cgit v1.2.3 From d721304dce0ced0b3b0366996cc02929669708a8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:07 -0700 Subject: workqueue: fix flush_workqueue() vs CPU_DEAD race Many thanks to Srivatsa Vaddagiri for the helpful discussion and for spotting the bug in my previous attempt. work->func() (and thus flush_workqueue()) must not use workqueue_mutex, this leads to deadlock when CPU_DEAD does kthread_stop(). However without this mutex held we can't detect CPU_DEAD in progress, which can move pending works to another CPU while the dead one is not on cpu_online_map. Change flush_workqueue() to use for_each_possible_cpu(). This means that flush_cpu_workqueue() may hit CPU which is already dead. However in that case !list_empty(&cwq->worklist) || cwq->current_work != NULL means that CPU_DEAD in progress, it will do kthread_stop() + take_over_work() so we can proceed and insert a barrier. We hold cwq->lock, so we are safe. Also, add migrate_sequence incremented by take_over_work() under cwq->lock. If take_over_work() happened before we checked this CPU, we should see the new value after spin_unlock(). Further possible changes: remove CPU_DEAD handling (along with take_over_work, migrate_sequence) from workqueue.c. CPU_DEAD just sets cwq->please_exit_after_flush flag. CPU_UP_PREPARE->create_workqueue_thread() clears this flag, and creates the new thread if cwq->thread == NULL. This way the workqueue/cpu-hotplug interaction is almost zero, workqueue_mutex just protects "workqueues" list, CPU_LOCK_ACQUIRE/CPU_LOCK_RELEASE go away. Signed-off-by: Oleg Nesterov Cc: Srivatsa Vaddagiri Cc: "Pallipadi, Venkatesh" Cc: Gautham shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d80dbdceadb8..1d1933cf3778 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -64,6 +64,7 @@ struct workqueue_struct { /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove threads to each one as cpus come/go. */ +static long migrate_sequence __read_mostly; static DEFINE_MUTEX(workqueue_mutex); static LIST_HEAD(workqueues); @@ -421,13 +422,7 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * Probably keventd trying to flush its own queue. So simply run * it by hand rather than deadlocking. */ - preempt_enable(); - /* - * We can still touch *cwq here because we are keventd, and - * hot-unplug will be waiting us to exit. - */ run_workqueue(cwq); - preempt_disable(); } else { struct wq_barrier barr; int active = 0; @@ -439,11 +434,8 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } spin_unlock_irq(&cwq->lock); - if (active) { - preempt_enable(); + if (active) wait_for_completion(&barr.done); - preempt_disable(); - } } } @@ -462,17 +454,21 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ void fastcall flush_workqueue(struct workqueue_struct *wq) { - preempt_disable(); /* CPU hotplug */ if (is_single_threaded(wq)) { /* Always use first cpu's area. 
*/ flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); } else { + long sequence; int cpu; +again: + sequence = migrate_sequence; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); + + if (unlikely(sequence != migrate_sequence)) + goto again; } - preempt_enable(); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -544,17 +540,21 @@ out: } EXPORT_SYMBOL_GPL(flush_work); -static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, - int cpu) +static void init_cpu_workqueue(struct workqueue_struct *wq, int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - struct task_struct *p; - spin_lock_init(&cwq->lock); cwq->wq = wq; - cwq->thread = NULL; + spin_lock_init(&cwq->lock); INIT_LIST_HEAD(&cwq->worklist); init_waitqueue_head(&cwq->more_work); +} + +static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, + int cpu) +{ + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); + struct task_struct *p; if (is_single_threaded(wq)) p = kthread_create(worker_thread, cwq, "%s", wq->name); @@ -589,6 +589,7 @@ struct workqueue_struct *__create_workqueue(const char *name, mutex_lock(&workqueue_mutex); if (singlethread) { INIT_LIST_HEAD(&wq->list); + init_cpu_workqueue(wq, singlethread_cpu); p = create_workqueue_thread(wq, singlethread_cpu); if (!p) destroy = 1; @@ -596,7 +597,11 @@ struct workqueue_struct *__create_workqueue(const char *name, wake_up_process(p); } else { list_add(&wq->list, &workqueues); - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { + init_cpu_workqueue(wq, cpu); + if (!cpu_online(cpu)) + continue; + p = create_workqueue_thread(wq, cpu); if (p) { kthread_bind(p, cpu); @@ -831,6 +836,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) spin_lock_irq(&cwq->lock); list_replace_init(&cwq->worklist, &list); + migrate_sequence++; while (!list_empty(&list)) { printk("Taking work for %s\n", wq->name); -- cgit v1.2.3 From 36aa9dfc39bf473780439f5629c30f59d677e793 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:08 -0700 Subject: workqueue: don't clear cwq->thread until it exits Pointed out by Srivatsa Vaddagiri. cleanup_workqueue_thread() sets cwq->thread = NULL and does kthread_stop(). This breaks the "if (cwq->thread == current)" logic in flush_cpu_workqueue() and leads to deadlock. Kill the thread first, then clear cwq->thread. workqueue_mutex protects us from create_workqueue_thread() so we don't need cwq->lock.
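To make the deadlock concrete, here is a simplified sketch of flush_cpu_workqueue(); the names come from the surrounding patches, but locking and the barrier bookkeeping of the real function are omitted:

	/*
	 * Simplified sketch, not the exact kernel code: a work handler
	 * that flushes its own workqueue relies on the identity test
	 * below.  If CPU_DEAD clears cwq->thread before kthread_stop(),
	 * the test fails and the handler queues a barrier behind itself
	 * and waits for it forever.
	 */
	static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
	{
		if (cwq->thread == current) {
			/* keventd flushing its own queue: run it by hand */
			run_workqueue(cwq);
		} else {
			struct wq_barrier barr;

			/* in the real code this runs under cwq->lock */
			insert_wq_barrier(cwq, &barr, 1);
			wait_for_completion(&barr.done);
		}
	}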
Signed-off-by: Oleg Nesterov Cc: Srivatsa Vaddagiri Cc: "Pallipadi, Venkatesh" Cc: Gautham shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1d1933cf3778..398c34ff6a54 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -625,17 +625,12 @@ EXPORT_SYMBOL_GPL(__create_workqueue); static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) { - struct cpu_workqueue_struct *cwq; - unsigned long flags; - struct task_struct *p; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - spin_lock_irqsave(&cwq->lock, flags); - p = cwq->thread; - cwq->thread = NULL; - spin_unlock_irqrestore(&cwq->lock, flags); - if (p) - kthread_stop(p); + if (cwq->thread) { + kthread_stop(cwq->thread); + cwq->thread = NULL; + } } /** -- cgit v1.2.3 From 3af24433efac62f451bfdb1cf1edb7181fb73645 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:09 -0700 Subject: workqueue: don't migrate pending works from the dead CPU Currently CPU_DEAD uses kthread_stop() to stop cwq->thread and then transfers cwq->worklist to another CPU. However, it is very unlikely that worker_thread() will notice kthread_should_stop() before flushing cwq->worklist. It is only possible if worker_thread() was preempted after run_workqueue(cwq), a new work_struct was added, and CPU_DEAD happened before cwq->thread has a chance to run. This means that take_over_work() mostly adds unneeded complications. Note also that kthread_stop() is not good per se, wake_up_process() may confuse work->func() if it sleeps waiting for some event. Remove take_over_work() and migrate_sequence complications. CPU_DEAD sets the cwq->should_stop flag (introduced by this patch) and waits for cwq->thread to flush cwq->worklist and exit. Because the dead CPU is not on cpu_online_map, no more works can be added to that cwq. cpu_populated_map was introduced to optimize for_each_possible_cpu(), it is not strictly needed, and it is more a documentation in fact. Saves 418 bytes. Signed-off-by: Oleg Nesterov Cc: Srivatsa Vaddagiri Cc: "Pallipadi, Venkatesh" Cc: Gautham shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 430 ++++++++++++++++++++++++++--------------------------- 1 file changed, 211 insertions(+), 219 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 398c34ff6a54..a981add58fb9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -43,10 +43,11 @@ struct cpu_workqueue_struct { struct list_head worklist; wait_queue_head_t more_work; + struct work_struct *current_work; struct workqueue_struct *wq; struct task_struct *thread; - struct work_struct *current_work; + int should_stop; int run_depth; /* Detect run_workqueue() recursion depth */ } ____cacheline_aligned; @@ -64,11 +65,12 @@ struct workqueue_struct { /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove threads to each one as cpus come/go. */ -static long migrate_sequence __read_mostly; static DEFINE_MUTEX(workqueue_mutex); static LIST_HEAD(workqueues); -static int singlethread_cpu; +static int singlethread_cpu __read_mostly; +/* optimization, we could use cpu_possible_map */ +static cpumask_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. 
*/ static inline int is_single_threaded(struct workqueue_struct *wq) @@ -344,10 +346,28 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) spin_unlock_irqrestore(&cwq->lock, flags); } +/* + * NOTE: the caller must not touch *cwq if this func returns true + */ +static int cwq_should_stop(struct cpu_workqueue_struct *cwq) +{ + int should_stop = cwq->should_stop; + + if (unlikely(should_stop)) { + spin_lock_irq(&cwq->lock); + should_stop = cwq->should_stop && list_empty(&cwq->worklist); + if (should_stop) + cwq->thread = NULL; + spin_unlock_irq(&cwq->lock); + } + + return should_stop; +} + static int worker_thread(void *__cwq) { struct cpu_workqueue_struct *cwq = __cwq; - DECLARE_WAITQUEUE(wait, current); + DEFINE_WAIT(wait); struct k_sigaction sa; sigset_t blocked; @@ -373,23 +393,21 @@ static int worker_thread(void *__cwq) siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { + for (;;) { if (cwq->wq->freezeable) try_to_freeze(); - add_wait_queue(&cwq->more_work, &wait); - if (list_empty(&cwq->worklist)) + prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); + if (!cwq->should_stop && list_empty(&cwq->worklist)) schedule(); - else - __set_current_state(TASK_RUNNING); - remove_wait_queue(&cwq->more_work, &wait); + finish_wait(&cwq->more_work, &wait); + + if (cwq_should_stop(cwq)) + break; - if (!list_empty(&cwq->worklist)) - run_workqueue(cwq); - set_current_state(TASK_INTERRUPTIBLE); + run_workqueue(cwq); } - __set_current_state(TASK_RUNNING); + return 0; } @@ -454,20 +472,13 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ void fastcall flush_workqueue(struct workqueue_struct *wq) { - if (is_single_threaded(wq)) { - /* Always use first cpu's area. */ + if (is_single_threaded(wq)) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); - } else { - long sequence; + else { int cpu; -again: - sequence = migrate_sequence; - for_each_possible_cpu(cpu) + for_each_cpu_mask(cpu, cpu_populated_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); - - if (unlikely(sequence != migrate_sequence)) - goto again; } } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -485,11 +496,8 @@ static void wait_on_work(struct cpu_workqueue_struct *cwq, } spin_unlock_irq(&cwq->lock); - if (unlikely(running)) { - mutex_unlock(&workqueue_mutex); + if (unlikely(running)) wait_for_completion(&barr.done); - mutex_lock(&workqueue_mutex); - } } /** @@ -510,155 +518,31 @@ void flush_work(struct workqueue_struct *wq, struct work_struct *work) { struct cpu_workqueue_struct *cwq; - mutex_lock(&workqueue_mutex); cwq = get_wq_data(work); /* Was it ever queued ? */ if (!cwq) - goto out; + return; /* - * This work can't be re-queued, and the lock above protects us - * from take_over_work(), no need to re-check that get_wq_data() - * is still the same when we take cwq->lock. + * This work can't be re-queued, no need to re-check that + * get_wq_data() is still the same when we take cwq->lock. */ spin_lock_irq(&cwq->lock); list_del_init(&work->entry); work_release(work); spin_unlock_irq(&cwq->lock); - if (is_single_threaded(wq)) { - /* Always use first cpu's area. 
*/ + if (is_single_threaded(wq)) wait_on_work(per_cpu_ptr(wq->cpu_wq, singlethread_cpu), work); - } else { + else { int cpu; - for_each_online_cpu(cpu) + for_each_cpu_mask(cpu, cpu_populated_map) wait_on_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } -out: - mutex_unlock(&workqueue_mutex); } EXPORT_SYMBOL_GPL(flush_work); -static void init_cpu_workqueue(struct workqueue_struct *wq, int cpu) -{ - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - cwq->wq = wq; - spin_lock_init(&cwq->lock); - INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); -} - -static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, - int cpu) -{ - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - struct task_struct *p; - - if (is_single_threaded(wq)) - p = kthread_create(worker_thread, cwq, "%s", wq->name); - else - p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu); - if (IS_ERR(p)) - return NULL; - cwq->thread = p; - return p; -} - -struct workqueue_struct *__create_workqueue(const char *name, - int singlethread, int freezeable) -{ - int cpu, destroy = 0; - struct workqueue_struct *wq; - struct task_struct *p; - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - return NULL; - - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } - - wq->name = name; - wq->freezeable = freezeable; - - mutex_lock(&workqueue_mutex); - if (singlethread) { - INIT_LIST_HEAD(&wq->list); - init_cpu_workqueue(wq, singlethread_cpu); - p = create_workqueue_thread(wq, singlethread_cpu); - if (!p) - destroy = 1; - else - wake_up_process(p); - } else { - list_add(&wq->list, &workqueues); - for_each_possible_cpu(cpu) { - init_cpu_workqueue(wq, cpu); - if (!cpu_online(cpu)) - continue; - - p = create_workqueue_thread(wq, cpu); - if (p) { - kthread_bind(p, cpu); - wake_up_process(p); - } else - destroy = 1; - } - } - mutex_unlock(&workqueue_mutex); - - /* - * Was there any error during startup? If yes then clean up: - */ - if (destroy) { - destroy_workqueue(wq); - wq = NULL; - } - return wq; -} -EXPORT_SYMBOL_GPL(__create_workqueue); - -static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) -{ - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - if (cwq->thread) { - kthread_stop(cwq->thread); - cwq->thread = NULL; - } -} - -/** - * destroy_workqueue - safely terminate a workqueue - * @wq: target workqueue - * - * Safely destroy a workqueue. All work currently pending will be done first. - */ -void destroy_workqueue(struct workqueue_struct *wq) -{ - int cpu; - - flush_workqueue(wq); - - /* We don't need the distraction of CPUs appearing and vanishing. */ - mutex_lock(&workqueue_mutex); - if (is_single_threaded(wq)) - cleanup_workqueue_thread(wq, singlethread_cpu); - else { - for_each_online_cpu(cpu) - cleanup_workqueue_thread(wq, cpu); - list_del(&wq->list); - } - mutex_unlock(&workqueue_mutex); - free_percpu(wq->cpu_wq); - kfree(wq); -} -EXPORT_SYMBOL_GPL(destroy_workqueue); static struct workqueue_struct *keventd_wq; @@ -822,85 +706,193 @@ int current_is_keventd(void) } -/* Take the work from this (downed) CPU. 
*/ -static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) +static struct cpu_workqueue_struct * +init_cpu_workqueue(struct workqueue_struct *wq, int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - struct list_head list; - struct work_struct *work; - spin_lock_irq(&cwq->lock); - list_replace_init(&cwq->worklist, &list); - migrate_sequence++; - - while (!list_empty(&list)) { - printk("Taking work for %s\n", wq->name); - work = list_entry(list.next,struct work_struct,entry); - list_del(&work->entry); - __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); - } - spin_unlock_irq(&cwq->lock); + cwq->wq = wq; + spin_lock_init(&cwq->lock); + INIT_LIST_HEAD(&cwq->worklist); + init_waitqueue_head(&cwq->more_work); + + return cwq; } -/* We're holding the cpucontrol mutex here */ -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct workqueue_struct *wq = cwq->wq; + const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; + struct task_struct *p; + + p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); + /* + * Nobody can add the work_struct to this cwq, + * if (caller is __create_workqueue) + * nobody should see this wq + * else // caller is CPU_UP_PREPARE + * cpu is not on cpu_online_map + * so we can abort safely. + */ + if (IS_ERR(p)) + return PTR_ERR(p); + + cwq->thread = p; + cwq->should_stop = 0; + if (!is_single_threaded(wq)) + kthread_bind(p, cpu); + + if (is_single_threaded(wq) || cpu_online(cpu)) + wake_up_process(p); + + return 0; +} + +struct workqueue_struct *__create_workqueue(const char *name, + int singlethread, int freezeable) { - unsigned int hotcpu = (unsigned long)hcpu; struct workqueue_struct *wq; + struct cpu_workqueue_struct *cwq; + int err = 0, cpu; - switch (action) { - case CPU_UP_PREPARE: + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return NULL; + + wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); + if (!wq->cpu_wq) { + kfree(wq); + return NULL; + } + + wq->name = name; + wq->freezeable = freezeable; + + if (singlethread) { + INIT_LIST_HEAD(&wq->list); + cwq = init_cpu_workqueue(wq, singlethread_cpu); + err = create_workqueue_thread(cwq, singlethread_cpu); + } else { mutex_lock(&workqueue_mutex); - /* Create a new workqueue thread for it. */ - list_for_each_entry(wq, &workqueues, list) { - if (!create_workqueue_thread(wq, hotcpu)) { - printk("workqueue for %i failed\n", hotcpu); - return NOTIFY_BAD; - } + list_add(&wq->list, &workqueues); + + for_each_possible_cpu(cpu) { + cwq = init_cpu_workqueue(wq, cpu); + if (err || !cpu_online(cpu)) + continue; + err = create_workqueue_thread(cwq, cpu); } - break; + mutex_unlock(&workqueue_mutex); + } + + if (err) { + destroy_workqueue(wq); + wq = NULL; + } + return wq; +} +EXPORT_SYMBOL_GPL(__create_workqueue); - case CPU_ONLINE: - /* Kick off worker threads. 
*/ - list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq; +static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct wq_barrier barr; + int alive = 0; - cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); - kthread_bind(cwq->thread, hotcpu); - wake_up_process(cwq->thread); - } + spin_lock_irq(&cwq->lock); + if (cwq->thread != NULL) { + insert_wq_barrier(cwq, &barr, 1); + cwq->should_stop = 1; + alive = 1; + } + spin_unlock_irq(&cwq->lock); + + if (alive) { + wait_for_completion(&barr.done); + + while (unlikely(cwq->thread != NULL)) + cpu_relax(); + /* + * Wait until cwq->thread unlocks cwq->lock, + * it won't touch *cwq after that. + */ + smp_rmb(); + spin_unlock_wait(&cwq->lock); + } +} + +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. + */ +void destroy_workqueue(struct workqueue_struct *wq) +{ + struct cpu_workqueue_struct *cwq; + + if (is_single_threaded(wq)) { + cwq = per_cpu_ptr(wq->cpu_wq, singlethread_cpu); + cleanup_workqueue_thread(cwq, singlethread_cpu); + } else { + int cpu; + + mutex_lock(&workqueue_mutex); + list_del(&wq->list); mutex_unlock(&workqueue_mutex); - break; - case CPU_UP_CANCELED: - list_for_each_entry(wq, &workqueues, list) { - if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) - continue; - /* Unbind so it can run. */ - kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, - any_online_cpu(cpu_online_map)); - cleanup_workqueue_thread(wq, hotcpu); + for_each_cpu_mask(cpu, cpu_populated_map) { + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + cleanup_workqueue_thread(cwq, cpu); } - mutex_unlock(&workqueue_mutex); - break; + } - case CPU_DOWN_PREPARE: + free_percpu(wq->cpu_wq); + kfree(wq); +} +EXPORT_SYMBOL_GPL(destroy_workqueue); + +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct cpu_workqueue_struct *cwq; + struct workqueue_struct *wq; + + switch (action) { + case CPU_LOCK_ACQUIRE: mutex_lock(&workqueue_mutex); - break; + return NOTIFY_OK; - case CPU_DOWN_FAILED: + case CPU_LOCK_RELEASE: mutex_unlock(&workqueue_mutex); - break; + return NOTIFY_OK; - case CPU_DEAD: - list_for_each_entry(wq, &workqueues, list) - cleanup_workqueue_thread(wq, hotcpu); - list_for_each_entry(wq, &workqueues, list) - take_over_work(wq, hotcpu); - mutex_unlock(&workqueue_mutex); - break; + case CPU_UP_PREPARE: + cpu_set(cpu, cpu_populated_map); + } + + list_for_each_entry(wq, &workqueues, list) { + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + + switch (action) { + case CPU_UP_PREPARE: + if (!create_workqueue_thread(cwq, cpu)) + break; + printk(KERN_ERR "workqueue for %i failed\n", cpu); + return NOTIFY_BAD; + + case CPU_ONLINE: + wake_up_process(cwq->thread); + break; + + case CPU_UP_CANCELED: + if (cwq->thread) + wake_up_process(cwq->thread); + case CPU_DEAD: + cleanup_workqueue_thread(cwq, cpu); + break; + } } return NOTIFY_OK; @@ -908,9 +900,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, void init_workqueues(void) { + cpu_populated_map = cpu_online_map; singlethread_cpu = first_cpu(cpu_possible_map); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); } - -- cgit v1.2.3 From 7097a87afe937a5879528d52880c2d95f089e96c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:10 -0700 Subject: workqueue: kill run_scheduled_work() 
Because it has no callers. Actually, I think the whole idea of run_scheduled_work() was not right, not good to mix "unqueue this work and execute its ->func()" in one function. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 73 ------------------------------------------------------ 1 file changed, 73 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a981add58fb9..ea422254f8bf 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -98,79 +98,6 @@ static inline void *get_wq_data(struct work_struct *work) return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } -static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) -{ - int ret = 0; - unsigned long flags; - - spin_lock_irqsave(&cwq->lock, flags); - /* - * We need to re-validate the work info after we've gotten - * the cpu_workqueue lock. We can run the work now iff: - * - * - the wq_data still matches the cpu_workqueue_struct - * - AND the work is still marked pending - * - AND the work is still on a list (which will be this - * workqueue_struct list) - * - * All these conditions are important, because we - * need to protect against the work being run right - * now on another CPU (all but the last one might be - * true if it's currently running and has not been - * released yet, for example). - */ - if (get_wq_data(work) == cwq - && work_pending(work) - && !list_empty(&work->entry)) { - work_func_t f = work->func; - cwq->current_work = work; - list_del_init(&work->entry); - spin_unlock_irqrestore(&cwq->lock, flags); - - if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) - work_release(work); - f(work); - - spin_lock_irqsave(&cwq->lock, flags); - cwq->current_work = NULL; - ret = 1; - } - spin_unlock_irqrestore(&cwq->lock, flags); - return ret; -} - -/** - * run_scheduled_work - run scheduled work synchronously - * @work: work to run - * - * This checks if the work was pending, and runs it - * synchronously if so. It returns a boolean to indicate - * whether it had any scheduled work to run or not. - * - * NOTE! This _only_ works for normal work_structs. You - * CANNOT use this for delayed work, because the wq data - * for delayed work will not point properly to the per- - * CPU workqueue struct, but will change! - */ -int fastcall run_scheduled_work(struct work_struct *work) -{ - for (;;) { - struct cpu_workqueue_struct *cwq; - - if (!work_pending(work)) - return 0; - if (list_empty(&work->entry)) - return 0; - /* NOTE! This depends intimately on __queue_work! */ - cwq = get_wq_data(work); - if (!cwq) - return 0; - if (__run_work(cwq, work)) - return 1; - } -} -EXPORT_SYMBOL(run_scheduled_work); - static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, int tail) { -- cgit v1.2.3 From f293ea92007419e4f9c52db0cf57af17f45b9f94 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:10 -0700 Subject: workqueue: don't save interrupts in run_workqueue() work->func() may sleep, it's a bug to call run_workqueue() with irqs disabled. 
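To make the constraint concrete, consider a perfectly legal work handler; my_work_fn and my_mutex are hypothetical names, used only to illustrate why the callback must be invoked with irqs enabled:

	/* Work handlers execute in process context and may sleep. */
	static DEFINE_MUTEX(my_mutex);

	static void my_work_fn(struct work_struct *work)
	{
		mutex_lock(&my_mutex);	/* may block */
		msleep(10);		/* sleeping is legal here */
		mutex_unlock(&my_mutex);
	}

Running such a handler under spin_lock_irqsave() would mean scheduling in atomic context, which is exactly what the patch below stops doing.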
Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ea422254f8bf..74f3f7825229 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -227,13 +227,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); static void run_workqueue(struct cpu_workqueue_struct *cwq) { - unsigned long flags; - - /* - * Keep taking off work from the queue until - * done. - */ - spin_lock_irqsave(&cwq->lock, flags); + spin_lock_irq(&cwq->lock); cwq->run_depth++; if (cwq->run_depth > 3) { /* morton gets to eat his hat */ @@ -248,7 +242,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) cwq->current_work = work; list_del_init(cwq->worklist.next); - spin_unlock_irqrestore(&cwq->lock, flags); + spin_unlock_irq(&cwq->lock); BUG_ON(get_wq_data(work) != cwq); if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) @@ -266,11 +260,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) dump_stack(); } - spin_lock_irqsave(&cwq->lock, flags); + spin_lock_irq(&cwq->lock); cwq->current_work = NULL; } cwq->run_depth--; - spin_unlock_irqrestore(&cwq->lock, flags); + spin_unlock_irq(&cwq->lock); } /* @@ -399,6 +393,8 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ void fastcall flush_workqueue(struct workqueue_struct *wq) { + might_sleep(); + if (is_single_threaded(wq)) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); else { @@ -445,6 +441,8 @@ void flush_work(struct workqueue_struct *wq, struct work_struct *work) { struct cpu_workqueue_struct *cwq; + might_sleep(); + cwq = get_wq_data(work); /* Was it ever queued ? */ if (!cwq) -- cgit v1.2.3 From dfb4b82e1c631b1a6057e77212996a890aa515b7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:11 -0700 Subject: workqueue: make cancel_rearming_delayed_workqueue() work on idle dwork cancel_rearming_delayed_workqueue(dwork) will hang forever if dwork was not scheduled, because in that case cancel_delayed_work()->del_timer_sync() never returns true. I don't know if there are any callers which may have problems, but this is not so convenient, and the fix is very simple. Q: looks like we don't need the "struct workqueue_struct *wq" parameter. If the timer was aborted successfully, get_wq_data() == wq. Is it worth adding a new function? Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 74f3f7825229..ce72d45c7fd8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -569,6 +569,10 @@ EXPORT_SYMBOL(flush_work_keventd); void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, struct delayed_work *dwork) { + /* Was it ever queued ? */ + if (!get_wq_data(&dwork->work)) + return; + while (!cancel_delayed_work(dwork)) flush_workqueue(wq); } -- cgit v1.2.3 From b1f4ec172f75bc2f5cc4f4be69b5587660a955d2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:12 -0700 Subject: workqueue: introduce cpu_singlethread_map The code like if (is_single_threaded(wq)) do_something(singlethread_cpu); else { for_each_cpu_mask(cpu, cpu_populated_map) do_something(cpu); } looks very annoying. We can add "static cpumask_t cpu_singlethread_map" and simplify the code, as sketched below.
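Concretely, every former if/else site collapses to a single loop; this sketch merely restates the patch that follows, with do_something() standing in for the per-CPU operation:

	static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
	{
		return is_single_threaded(wq)
			? &cpu_singlethread_map : &cpu_populated_map;
	}

	/* at each call site: */
	const cpumask_t *cpu_map = wq_cpu_map(wq);
	int cpu;

	for_each_cpu_mask(cpu, *cpu_map)
		do_something(cpu);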
Lessens .text a bit, and imho makes the code more readable. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 55 +++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ce72d45c7fd8..6308a4bc6a82 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -69,6 +69,7 @@ static DEFINE_MUTEX(workqueue_mutex); static LIST_HEAD(workqueues); static int singlethread_cpu __read_mostly; +static cpumask_t cpu_singlethread_map __read_mostly; /* optimization, we could use cpu_possible_map */ static cpumask_t cpu_populated_map __read_mostly; @@ -78,6 +79,12 @@ static inline int is_single_threaded(struct workqueue_struct *wq) return list_empty(&wq->list); } +static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) +{ + return is_single_threaded(wq) + ? &cpu_singlethread_map : &cpu_populated_map; +} + /* * Set the workqueue on which a work item is to be run * - Must *only* be called if the pending flag is set @@ -393,16 +400,12 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ void fastcall flush_workqueue(struct workqueue_struct *wq) { - might_sleep(); - - if (is_single_threaded(wq)) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); - else { - int cpu; + const cpumask_t *cpu_map = wq_cpu_map(wq); + int cpu - for_each_cpu_mask(cpu, cpu_populated_map) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); - } + might_sleep(); + for_each_cpu_mask(cpu, *cpu_map) + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -439,7 +442,9 @@ static void wait_on_work(struct cpu_workqueue_struct *cwq, */ void flush_work(struct workqueue_struct *wq, struct work_struct *work) { + const cpumask_t *cpu_map = wq_cpu_map(wq); struct cpu_workqueue_struct *cwq; + int cpu; might_sleep(); @@ -457,14 +462,8 @@ void flush_work(struct workqueue_struct *wq, struct work_struct *work) work_release(work); spin_unlock_irq(&cwq->lock); - if (is_single_threaded(wq)) - wait_on_work(per_cpu_ptr(wq->cpu_wq, singlethread_cpu), work); - else { - int cpu; - - for_each_cpu_mask(cpu, cpu_populated_map) - wait_on_work(per_cpu_ptr(wq->cpu_wq, cpu), work); - } + for_each_cpu_mask(cpu, *cpu_map) + wait_on_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } EXPORT_SYMBOL_GPL(flush_work); @@ -757,22 +756,17 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) */ void destroy_workqueue(struct workqueue_struct *wq) { + const cpumask_t *cpu_map = wq_cpu_map(wq); struct cpu_workqueue_struct *cwq; + int cpu; - if (is_single_threaded(wq)) { - cwq = per_cpu_ptr(wq->cpu_wq, singlethread_cpu); - cleanup_workqueue_thread(cwq, singlethread_cpu); - } else { - int cpu; + mutex_lock(&workqueue_mutex); + list_del(&wq->list); + mutex_unlock(&workqueue_mutex); - mutex_lock(&workqueue_mutex); - list_del(&wq->list); - mutex_unlock(&workqueue_mutex); - - for_each_cpu_mask(cpu, cpu_populated_map) { - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - cleanup_workqueue_thread(cwq, cpu); - } + for_each_cpu_mask(cpu, *cpu_map) { + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + cleanup_workqueue_thread(cwq, cpu); } free_percpu(wq->cpu_wq); @@ -831,6 +825,7 @@ void init_workqueues(void) { cpu_populated_map = cpu_online_map; singlethread_cpu = first_cpu(cpu_possible_map); + cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = 
create_workqueue("events"); BUG_ON(!keventd_wq); -- cgit v1.2.3 From cce1a1656c9a3fdc6c6c1029b576e4ab6ecaac37 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:13 -0700 Subject: workqueue: introduce workqueue_struct->singlethread Add explicit workqueue_struct->singlethread flag. This lessens .text a little, but most importantly this allows us to manipulate wq->list without changing the meaning of is_single_threaded(). Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6308a4bc6a82..32b1091f21ef 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -58,8 +58,9 @@ struct cpu_workqueue_struct { */ struct workqueue_struct { struct cpu_workqueue_struct *cpu_wq; + struct list_head list; const char *name; - struct list_head list; /* Empty if single thread */ + int singlethread; int freezeable; /* Freeze threads during suspend */ }; @@ -76,7 +77,7 @@ static cpumask_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. */ static inline int is_single_threaded(struct workqueue_struct *wq) { - return list_empty(&wq->list); + return wq->singlethread; } static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) @@ -401,7 +402,7 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) void fastcall flush_workqueue(struct workqueue_struct *wq) { const cpumask_t *cpu_map = wq_cpu_map(wq); - int cpu + int cpu; might_sleep(); for_each_cpu_mask(cpu, *cpu_map) @@ -694,10 +695,11 @@ struct workqueue_struct *__create_workqueue(const char *name, } wq->name = name; + wq->singlethread = singlethread; wq->freezeable = freezeable; + INIT_LIST_HEAD(&wq->list); if (singlethread) { - INIT_LIST_HEAD(&wq->list); cwq = init_cpu_workqueue(wq, singlethread_cpu); err = create_workqueue_thread(cwq, singlethread_cpu); } else { -- cgit v1.2.3 From c12920d19078eb8fd99560ec232a6e05c6ff1aa8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:14 -0700 Subject: workqueue: make init_workqueues() __init The only caller of init_workqueues() is do_basic_setup(). Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 32b1091f21ef..e858e93886e3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -823,7 +823,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -void init_workqueues(void) +void __init init_workqueues(void) { cpu_populated_map = cpu_online_map; singlethread_cpu = first_cpu(cpu_possible_map); -- cgit v1.2.3 From 06ba38a9a0f6ceffe70343f684c5a690e3710ef4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:15 -0700 Subject: workqueues: shift kthread_bind() from CPU_UP_PREPARE to CPU_ONLINE CPU_UP_PREPARE binds cwq->thread to the new CPU. So CPU_UP_CANCELED tries to wake up the task which is bound to the failed CPU. With this patch we don't bind cwq->thread until the CPU becomes online. The first wake_up() after kthread_create() is a bit special, so make a simple helper for that. Signed-off-by: Oleg Nesterov Cc: Gautham R Shenoy Cc: "Rafael J.
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e858e93886e3..7d1ebfc1a995 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -668,15 +668,21 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) cwq->thread = p; cwq->should_stop = 0; - if (!is_single_threaded(wq)) - kthread_bind(p, cpu); - - if (is_single_threaded(wq) || cpu_online(cpu)) - wake_up_process(p); return 0; } +static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct task_struct *p = cwq->thread; + + if (p != NULL) { + if (cpu >= 0) + kthread_bind(p, cpu); + wake_up_process(p); + } +} + struct workqueue_struct *__create_workqueue(const char *name, int singlethread, int freezeable) { @@ -702,6 +708,7 @@ struct workqueue_struct *__create_workqueue(const char *name, if (singlethread) { cwq = init_cpu_workqueue(wq, singlethread_cpu); err = create_workqueue_thread(cwq, singlethread_cpu); + start_workqueue_thread(cwq, -1); } else { mutex_lock(&workqueue_mutex); list_add(&wq->list, &workqueues); @@ -711,6 +718,7 @@ struct workqueue_struct *__create_workqueue(const char *name, if (err || !cpu_online(cpu)) continue; err = create_workqueue_thread(cwq, cpu); + start_workqueue_thread(cwq, cpu); } mutex_unlock(&workqueue_mutex); } @@ -808,12 +816,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, return NOTIFY_BAD; case CPU_ONLINE: - wake_up_process(cwq->thread); + start_workqueue_thread(cwq, cpu); break; case CPU_UP_CANCELED: - if (cwq->thread) - wake_up_process(cwq->thread); + start_workqueue_thread(cwq, -1); case CPU_DEAD: cleanup_workqueue_thread(cwq, cpu); break; -- cgit v1.2.3 From ed7c0feede39d70092d048ec30f59bb1df69eec6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:16 -0700 Subject: make queue_delayed_work() friendly to flush_fork() Currently typeof(delayed_work->work.data) is "struct workqueue_struct" when the timer is pending and "struct cpu_workqueue_struct" when the work is queued. This makes it impossible to use flush_fork(delayed_work->work) in addition to cancel_delayed_work/cancel_rearming_delayed_work, not good. Change queue_delayed_work/delayed_work_timer_fn to use cwq, not wq. This complicates (and uglifies) these functions a little bit, but allows us to use flush_fork(dwork) and imho makes the whole code more consistent. Also, document the fact that cancel_rearming_delayed_work() doesn't guarantee the completion of work->func() upon return.
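In practice the documented rule implies a two-step teardown; my_wq and my_dwork are hypothetical caller-side names in this sketch, and the two-argument cancel helper is the one still current at this point in the series:

	/* cancel may return while the handler is still executing... */
	cancel_rearming_delayed_workqueue(my_wq, &my_dwork);

	/* ...so wait for it before freeing anything the handler uses */
	flush_workqueue(my_wq);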
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7d1ebfc1a995..d107e1c3b071 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -90,18 +90,20 @@ static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) * Set the workqueue on which a work item is to be run * - Must *only* be called if the pending flag is set */ -static inline void set_wq_data(struct work_struct *work, void *wq) +static inline void set_wq_data(struct work_struct *work, + struct cpu_workqueue_struct *cwq) { unsigned long new; BUG_ON(!work_pending(work)); - new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); + new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); atomic_long_set(&work->data, new); } -static inline void *get_wq_data(struct work_struct *work) +static inline +struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) { return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } @@ -157,7 +159,8 @@ EXPORT_SYMBOL_GPL(queue_work); void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct workqueue_struct *wq = get_wq_data(&dwork->work); + struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); + struct workqueue_struct *wq = cwq->wq; int cpu = smp_processor_id(); if (unlikely(is_single_threaded(wq))) @@ -189,8 +192,9 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); - /* This stores wq for the moment, for the timer_fn */ - set_wq_data(work, wq); + /* This stores cwq for the moment, for the timer_fn */ + set_wq_data(work, + per_cpu_ptr(wq->cpu_wq, raw_smp_processor_id())); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -221,8 +225,9 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); - /* This stores wq for the moment, for the timer_fn */ - set_wq_data(work, wq); + /* This stores cwq for the moment, for the timer_fn */ + set_wq_data(work, + per_cpu_ptr(wq->cpu_wq, raw_smp_processor_id())); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -562,9 +567,12 @@ void flush_work_keventd(struct work_struct *work) EXPORT_SYMBOL(flush_work_keventd); /** - * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work. + * cancel_rearming_delayed_workqueue - kill off a delayed work whose handler rearms the delayed work. * @wq: the controlling workqueue structure * @dwork: the delayed work struct + * + * Note that the work callback function may still be running on return from + * cancel_delayed_work(). Run flush_workqueue() or flush_work() to wait on it. */ void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, struct delayed_work *dwork) @@ -579,7 +587,7 @@ void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); /** - * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work. + * cancel_rearming_delayed_work - kill off a delayed keventd work whose handler rearms the delayed work. 
* @dwork: the delayed work struct */ void cancel_rearming_delayed_work(struct delayed_work *dwork) -- cgit v1.2.3 From 63bc0362521cbaae3ed17b8de7b094f9492453f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:16 -0700 Subject: unify queue_delayed_work() and queue_delayed_work_on() Change queue_delayed_work() to use queue_delayed_work_on() to avoid the code duplication (saves 133 bytes). Q: queue_delayed_work() enqueues &dwork->work directly when delay == 0, why? [jirislaby@gmail.com: oops fix] Signed-off-by: Oleg Nesterov Signed-off-by: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d107e1c3b071..0eb9b33f1d91 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -180,28 +180,11 @@ void delayed_work_timer_fn(unsigned long __data) int fastcall queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { - int ret = 0; - struct timer_list *timer = &dwork->timer; - struct work_struct *work = &dwork->work; - - timer_stats_timer_set_start_info(timer); + timer_stats_timer_set_start_info(&dwork->timer); if (delay == 0) - return queue_work(wq, work); - - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); + return queue_work(wq, &dwork->work); - /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, - per_cpu_ptr(wq->cpu_wq, raw_smp_processor_id())); - timer->expires = jiffies + delay; - timer->data = (unsigned long)dwork; - timer->function = delayed_work_timer_fn; - add_timer(timer); - ret = 1; - } - return ret; + return queue_delayed_work_on(-1, wq, dwork, delay); } EXPORT_SYMBOL_GPL(queue_delayed_work); @@ -227,11 +210,16 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, /* This stores cwq for the moment, for the timer_fn */ set_wq_data(work, - per_cpu_ptr(wq->cpu_wq, raw_smp_processor_id())); + per_cpu_ptr(wq->cpu_wq, wq->singlethread ? + singlethread_cpu : raw_smp_processor_id())); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; - add_timer_on(timer, cpu); + + if (unlikely(cpu >= 0)) + add_timer_on(timer, cpu); + else + add_timer(timer); ret = 1; } return ret; -- cgit v1.2.3 From a848e3b67c07ed79374bd0f9b82f9ce45a419643 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:17 -0700 Subject: workqueue: introduce wq_per_cpu() helper Cleanup. A number of per_cpu_ptr(wq->cpu_wq, cpu) users have to check that cpu is valid for this wq. Make a simple helper. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0eb9b33f1d91..985902e2e071 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -86,6 +86,14 @@ static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) ? 
&cpu_singlethread_map : &cpu_populated_map; } +static +struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +{ + if (unlikely(is_single_threaded(wq))) + cpu = singlethread_cpu; + return per_cpu_ptr(wq->cpu_wq, cpu); +} + /* * Set the workqueue on which a work item is to be run * - Must *only* be called if the pending flag is set @@ -142,16 +150,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, */ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0, cpu = get_cpu(); + int ret = 0; if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - if (unlikely(is_single_threaded(wq))) - cpu = singlethread_cpu; BUG_ON(!list_empty(&work->entry)); - __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + __queue_work(wq_per_cpu(wq, get_cpu()), work); + put_cpu(); ret = 1; } - put_cpu(); return ret; } EXPORT_SYMBOL_GPL(queue_work); @@ -161,12 +167,8 @@ void delayed_work_timer_fn(unsigned long __data) struct delayed_work *dwork = (struct delayed_work *)__data; struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); struct workqueue_struct *wq = cwq->wq; - int cpu = smp_processor_id(); - - if (unlikely(is_single_threaded(wq))) - cpu = singlethread_cpu; - __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work); + __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); } /** @@ -209,9 +211,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, BUG_ON(!list_empty(&work->entry)); /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, - per_cpu_ptr(wq->cpu_wq, wq->singlethread ? - singlethread_cpu : raw_smp_processor_id())); + set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; -- cgit v1.2.3 From 1634c48f8b85dcb05101f1eb2eab9af40b5976da Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:18 -0700 Subject: make cancel_rearming_delayed_work() work on any workqueue, not just keventd_wq cancel_rearming_delayed_workqueue(wq, dwork) doesn't need the first parameter. We don't hang on un-queued dwork any longer, and work->data doesn't change its type. This means we can always figure out "wq" from dwork when it is needed. Remove this parameter, and rename the function to cancel_rearming_delayed_work(). Re-create an inline "obsolete" cancel_rearming_delayed_workqueue(wq) which just calls cancel_rearming_delayed_work(). Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 985902e2e071..41eaffd125ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -555,32 +555,23 @@ void flush_work_keventd(struct work_struct *work) EXPORT_SYMBOL(flush_work_keventd); /** - * cancel_rearming_delayed_workqueue - kill off a delayed work whose handler rearms the delayed work. - * @wq: the controlling workqueue structure + * cancel_rearming_delayed_work - kill off a delayed work whose handler rearms the delayed work. * @dwork: the delayed work struct * * Note that the work callback function may still be running on return from * cancel_delayed_work(). Run flush_workqueue() or flush_work() to wait on it. 
*/ -void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, - struct delayed_work *dwork) +void cancel_rearming_delayed_work(struct delayed_work *dwork) { - /* Was it ever queued ? */ - if (!get_wq_data(&dwork->work)) - return; + struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - while (!cancel_delayed_work(dwork)) - flush_workqueue(wq); -} -EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); + /* Was it ever queued ? */ + if (cwq != NULL) { + struct workqueue_struct *wq = cwq->wq; -/** - * cancel_rearming_delayed_work - kill off a delayed keventd work whose handler rearms the delayed work. - * @dwork: the delayed work struct - */ -void cancel_rearming_delayed_work(struct delayed_work *dwork) -{ - cancel_rearming_delayed_workqueue(keventd_wq, dwork); + while (!cancel_delayed_work(dwork)) + flush_workqueue(wq); + } } EXPORT_SYMBOL(cancel_rearming_delayed_work); -- cgit v1.2.3 From 23b2e5991afde5af91a1a661d7f47ee56120759e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:19 -0700 Subject: workqueue: kill NOAUTOREL works We don't have any users, and it is not so trivial to use NOAUTOREL works correctly. It is better to simplify API. Delete NOAUTOREL support and rename work_release to work_clear_pending to avoid a confusion. Signed-off-by: Oleg Nesterov Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41eaffd125ca..0611de815a8f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -246,8 +246,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) spin_unlock_irq(&cwq->lock); BUG_ON(get_wq_data(work) != cwq); - if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) - work_release(work); + work_clear_pending(work); f(work); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { @@ -453,7 +452,7 @@ void flush_work(struct workqueue_struct *wq, struct work_struct *work) */ spin_lock_irq(&cwq->lock); list_del_init(&work->entry); - work_release(work); + work_clear_pending(work); spin_unlock_irq(&cwq->lock); for_each_cpu_mask(cpu, *cpu_map) -- cgit v1.2.3 From b9aac8e0d32499217417ff0b494731811f185b18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:20 -0700 Subject: worker_thread: don't play with signals worker_thread() doesn't need to "Block and flush all signals", this was already done by its caller, kthread(). Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0611de815a8f..87693b37d017 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,18 +290,11 @@ static int worker_thread(void *__cwq) struct cpu_workqueue_struct *cwq = __cwq; DEFINE_WAIT(wait); struct k_sigaction sa; - sigset_t blocked; if (!cwq->wq->freezeable) current->flags |= PF_NOFREEZE; set_user_nice(current, -5); - - /* Block and flush all signals */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - /* * We inherited MPOL_INTERLEAVE from the booting kernel. * Set MPOL_DEFAULT to insure node local allocations. 
-- cgit v1.2.3 From 85f4186af944c1240c84934a9ab578743df2d69b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:20 -0700 Subject: worker_thread: fix racy try_to_freeze() usage worker_thread() can miss freeze_process()->signal_wake_up() if it happens between try_to_freeze() and prepare_to_wait(). We should check freezing() before entering schedule(). This race was introduced by me in [PATCH 1/1] workqueue: don't migrate pending works from the dead CPU Looks like mm/vmscan.c:kswapd() has the same race. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 87693b37d017..63885abf1ba0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -308,14 +308,14 @@ static int worker_thread(void *__cwq) do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); for (;;) { - if (cwq->wq->freezeable) - try_to_freeze(); - prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!cwq->should_stop && list_empty(&cwq->worklist)) + if (!freezing(current) && !cwq->should_stop + && list_empty(&cwq->worklist)) schedule(); finish_wait(&cwq->more_work, &wait); + try_to_freeze(); + if (cwq_should_stop(cwq)) break; -- cgit v1.2.3 From a4798833d26b293fd18b7bf102991426aa0b56fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:21 -0700 Subject: zap_other_threads: remove unneeded ->exit_signal change We already depend on fact that all sub-threads have ->exit_signal == -1, no need to set it in zap_other_threads(). Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 4c8f49eadf7d..23ae6d62fc41 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -913,17 +913,6 @@ void zap_other_threads(struct task_struct *p) if (t->exit_state) continue; - /* - * We don't want to notify the parent, since we are - * killed as part of a thread group due to another - * thread doing an execve() or similar. So set the - * exit signal to -1 to allow immediate reaping of - * the process. But don't detach the thread group - * leader. - */ - if (t != p->group_leader) - t->exit_signal = -1; - /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); -- cgit v1.2.3 From 28e53bddf814485699a4142bc056fd37d4e11dd4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:22 -0700 Subject: unify flush_work/flush_work_keventd and rename it to cancel_work_sync flush_work(wq, work) doesn't need the first parameter, we can use cwq->wq (this was possible from the very beginnig, I missed this). So we can unify flush_work_keventd and flush_work. Also, rename flush_work() to cancel_work_sync() and fix all callers. Perhaps this is not the best name, but "flush_work" is really bad. (akpm: this is why the earlier patches bypassed maintainers) Signed-off-by: Oleg Nesterov Cc: Jeff Garzik Cc: "David S. 
Miller" Cc: Jens Axboe Cc: Tejun Heo Cc: Auke Kok , Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 63885abf1ba0..c9ab4293904f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -413,23 +413,23 @@ static void wait_on_work(struct cpu_workqueue_struct *cwq, } /** - * flush_work - block until a work_struct's callback has terminated - * @wq: the workqueue on which the work is queued + * cancel_work_sync - block until a work_struct's callback has terminated * @work: the work which is to be flushed * - * flush_work() will attempt to cancel the work if it is queued. If the work's - * callback appears to be running, flush_work() will block until it has - * completed. + * cancel_work_sync() will attempt to cancel the work if it is queued. If the + * work's callback appears to be running, cancel_work_sync() will block until + * it has completed. * - * flush_work() is designed to be used when the caller is tearing down data - * structures which the callback function operates upon. It is expected that, - * prior to calling flush_work(), the caller has arranged for the work to not - * be requeued. + * cancel_work_sync() is designed to be used when the caller is tearing down + * data structures which the callback function operates upon. It is expected + * that, prior to calling cancel_work_sync(), the caller has arranged for the + * work to not be requeued. */ -void flush_work(struct workqueue_struct *wq, struct work_struct *work) +void cancel_work_sync(struct work_struct *work) { - const cpumask_t *cpu_map = wq_cpu_map(wq); struct cpu_workqueue_struct *cwq; + struct workqueue_struct *wq; + const cpumask_t *cpu_map; int cpu; might_sleep(); @@ -448,10 +448,13 @@ void flush_work(struct workqueue_struct *wq, struct work_struct *work) work_clear_pending(work); spin_unlock_irq(&cwq->lock); + wq = cwq->wq; + cpu_map = wq_cpu_map(wq); + for_each_cpu_mask(cpu, *cpu_map) wait_on_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } -EXPORT_SYMBOL_GPL(flush_work); +EXPORT_SYMBOL_GPL(cancel_work_sync); static struct workqueue_struct *keventd_wq; @@ -540,18 +543,13 @@ void flush_scheduled_work(void) } EXPORT_SYMBOL(flush_scheduled_work); -void flush_work_keventd(struct work_struct *work) -{ - flush_work(keventd_wq, work); -} -EXPORT_SYMBOL(flush_work_keventd); - /** * cancel_rearming_delayed_work - kill off a delayed work whose handler rearms the delayed work. * @dwork: the delayed work struct * * Note that the work callback function may still be running on return from - * cancel_delayed_work(). Run flush_workqueue() or flush_work() to wait on it. + * cancel_delayed_work(). Run flush_workqueue() or cancel_work_sync() to wait + * on it. */ void cancel_rearming_delayed_work(struct delayed_work *dwork) { -- cgit v1.2.3 From c93465181fed0f8f5942a41108943dadea0aa345 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:23 -0700 Subject: ____call_usermodehelper: don't flush_signals() ____call_usermodehelper() has no reason for flush_signals(). It is a fresh forked process which is going to exec a user-space application or exit on failure. 
Signed-off-by: Oleg Nesterov Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 49cc4b9c1a8d..6cea9db25c3d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -135,7 +135,6 @@ static int ____call_usermodehelper(void *data) /* Unblock all signals and set the session keyring. */ new_session = key_get(sub_info->ring); - flush_signals(current); spin_lock_irq(&current->sighand->siglock); old_session = __install_session_keyring(current, new_session); flush_signal_handlers(current, 1); -- cgit v1.2.3 From 73c279927f89561ecb45b2dfdf9314bafcfd9f67 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 9 May 2007 02:34:32 -0700 Subject: kthread: don't depend on work queues Currently there is a circular reference between work queue initialization and kthread initialization. This prevents the kthread infrastructure from initializing until after work queues have been initialized. We want the properties of tasks created with kthread_create to be as close as possible to the init_task and to not be contaminated by user processes. The later we start our kthreadd that creates these tasks, the harder it is to avoid contamination from user processes and the more of a mess we have to clean up because the defaults have changed on us. So this patch modifies the kthread support to not use work queues but to instead use a simple list of structures, and to have kthreadd start from init_task immediately after our kernel thread that execs /sbin/init. By being a true child of init_task we only have to change those process settings that we want to have different from init_task, such as our process name, the cpus that are allowed, blocking all signals and setting SIGCHLD to SIG_IGN so that all of our children are reaped automatically. By being a true child of init_task we also naturally get our ppid set to 0 and do not wind up as a child of PID == 1, ensuring that tasks generated by kthread_create will not slow down the functioning of the wait family of functions. [akpm@linux-foundation.org: use interruptible sleeps] Signed-off-by: Eric W. Biederman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 124 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 68 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 87c50ccd1d4e..0eb0070a3c57 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,7 +1,7 @@ /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * - * Creation is done via keventd, so that we get a clean environment + * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ @@ -15,24 +15,22 @@ #include #include -/* - * We dont want to execute off keventd since it might - * hold a semaphore our callers hold too: - */ -static struct workqueue_struct *helper_wq; +static DEFINE_SPINLOCK(kthread_create_lock); +static LIST_HEAD(kthread_create_list); +struct task_struct *kthreadd_task; struct kthread_create_info { - /* Information passed to kthread() from keventd. */ + /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; struct completion started; - /* Result passed back to kthread_create() from keventd. */ + /* Result passed back to kthread_create() from kthreadd.
*/ struct task_struct *result; struct completion done; - struct work_struct work; + struct list_head list; }; struct kthread_stop_info @@ -60,42 +58,17 @@ int kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); -static void kthread_exit_files(void) -{ - struct fs_struct *fs; - struct task_struct *tsk = current; - - exit_fs(tsk); /* current->fs->count--; */ - fs = init_task.fs; - tsk->fs = fs; - atomic_inc(&fs->count); - exit_files(tsk); - current->files = init_task.files; - atomic_inc(&tsk->files->count); -} - static int kthread(void *_create) { struct kthread_create_info *create = _create; int (*threadfn)(void *data); void *data; - sigset_t blocked; int ret = -EINTR; - kthread_exit_files(); - - /* Copy data: it's on keventd's stack */ + /* Copy data: it's on kthread's stack */ threadfn = create->threadfn; data = create->data; - /* Block and flush all signals (in case we're not from keventd). */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - - /* By default we can run anywhere, unlike keventd. */ - set_cpus_allowed(current, CPU_MASK_ALL); - /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_INTERRUPTIBLE); complete(&create->started); @@ -112,11 +85,8 @@ static int kthread(void *_create) return 0; } -/* We are keventd: create a thread. */ -static void keventd_create_kthread(struct work_struct *work) +static void create_kthread(struct kthread_create_info *create) { - struct kthread_create_info *create = - container_of(work, struct kthread_create_info, work); int pid; /* We want our own signal handler (we take no signals by default). */ @@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), create.data = data; init_completion(&create.started); init_completion(&create.done); - INIT_WORK(&create.work, keventd_create_kthread); - - /* - * The workqueue needs to start up first: - */ - if (!helper_wq) - create.work.func(&create.work); - else { - queue_work(helper_wq, &create.work); - wait_for_completion(&create.done); - } + + spin_lock(&kthread_create_lock); + list_add_tail(&create.list, &kthread_create_list); + wake_up_process(kthreadd_task); + spin_unlock(&kthread_create_lock); + + wait_for_completion(&create.done); + if (!IS_ERR(create.result)) { va_list args; va_start(args, namefmt); @@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), namefmt, args); va_end(args); } - return create.result; } EXPORT_SYMBOL(kthread_create); @@ -245,12 +211,58 @@ int kthread_stop(struct task_struct *k) } EXPORT_SYMBOL(kthread_stop); -static __init int helper_init(void) + +static __init void kthreadd_setup(void) { - helper_wq = create_singlethread_workqueue("kthread"); - BUG_ON(!helper_wq); + struct task_struct *tsk = current; + struct k_sigaction sa; + sigset_t blocked; - return 0; + set_task_comm(tsk, "kthreadd"); + + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(tsk); + + /* SIG_IGN makes children autoreap: see do_notify_parent(). */ + sa.sa.sa_handler = SIG_IGN; + sa.sa.sa_flags = 0; + siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); + do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); + + set_user_nice(current, -5); + set_cpus_allowed(current, CPU_MASK_ALL); } -core_initcall(helper_init); +int kthreadd(void *unused) +{ + /* Setup a clean context for our children to inherit. 
*/ + kthreadd_setup(); + + current->flags |= PF_NOFREEZE; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&kthread_create_list)) + schedule(); + __set_current_state(TASK_RUNNING); + + spin_lock(&kthread_create_lock); + while (!list_empty(&kthread_create_list)) { + struct kthread_create_info *create; + + create = list_entry(kthread_create_list.next, + struct kthread_create_info, list); + list_del_init(&create->list); + spin_unlock(&kthread_create_lock); + + create_kthread(create); + + spin_lock(&kthread_create_lock); + } + spin_unlock(&kthread_create_lock); + } + + return 0; +} -- cgit v1.2.3 From 49d769d52e16efabd3ad47b7995522fff771371d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 9 May 2007 02:34:33 -0700 Subject: Change reparent_to_init to reparent_to_kthreadd When a kernel thread calls daemonize, instead of reparenting the thread to init, reparent the thread to kthreadd next to the threads created by kthread_create. This is really just a stopgap until daemonize goes away, but it does ensure no kernel threads are under init and they are all in one place that is easy to find. Signed-off-by: Eric W. Biederman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f5a7abb621f3..bc982cd72743 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -254,26 +255,25 @@ static int has_stopped_jobs(struct pid *pgrp) } /** - * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to. + * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd * * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to init so that - * it is correctly cleaned up on exit. + * it ever exits, it should generally reparent itself to kthreadd so it + * isn't in the way of other processes and is correctly cleaned up on exit. * * The various task state such as scheduling policy and priority may have * been inherited from a user process, so we reset them to sane values here. * - * NOTE that reparent_to_init() gives the caller full capabilities. + * NOTE that reparent_to_kthreadd() gives the caller full capabilities. */ -static void reparent_to_init(void) +static void reparent_to_kthreadd(void) { write_lock_irq(&tasklist_lock); ptrace_unlink(current); /* Reparent to init */ remove_parent(current); - current->parent = child_reaper(current); - current->real_parent = child_reaper(current); + current->real_parent = current->parent = kthreadd_task; add_parent(current); /* Set the exit signal to SIGCHLD so we signal init on exit */ @@ -400,7 +400,7 @@ void daemonize(const char *name, ...) current->files = init_task.files; atomic_inc(&current->files->count); - reparent_to_init(); + reparent_to_kthreadd(); } EXPORT_SYMBOL(daemonize); -- cgit v1.2.3 From 90cce03d9bfcb28600a56efef6b0a5a4fbf6c2b1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:36 -0700 Subject: wait_for_helper: remove unneeded do_sigaction() allow_signal(SIGCHLD) does all the necessary work; there is no need to call do_sigaction() beforehand. Signed-off-by: Oleg Nesterov Cc: Rusty Russell Cc: "Eric W.
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 6cea9db25c3d..4d32eb077179 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -185,14 +185,9 @@ static int wait_for_helper(void *data) { struct subprocess_info *sub_info = data; pid_t pid; - struct k_sigaction sa; /* Install a handler: if SIGCLD isn't handled sys_wait4 won't * populate the status, but will return -ECHILD. */ - sa.sa.sa_handler = SIG_IGN; - sa.sa.sa_flags = 0; - siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); - do_sigaction(SIGCHLD, &sa, NULL); allow_signal(SIGCHLD); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); -- cgit v1.2.3 From 5de18d169739293e27e0cf9acfc75a2d2f4aa572 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:36 -0700 Subject: worker_thread: don't play with SIGCHLD and numa policy worker_thread() inherits ignored SIGCHLD and numa_default_policy() from its parent, kthreadd. No need to setup this again. Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c9ab4293904f..25cee1afe6fb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -289,23 +289,11 @@ static int worker_thread(void *__cwq) { struct cpu_workqueue_struct *cwq = __cwq; DEFINE_WAIT(wait); - struct k_sigaction sa; if (!cwq->wq->freezeable) current->flags |= PF_NOFREEZE; set_user_nice(current, -5); - /* - * We inherited MPOL_INTERLEAVE from the booting kernel. - * Set MPOL_DEFAULT to insure node local allocations. - */ - numa_default_policy(); - - /* SIG_IGN makes children autoreap: see do_notify_parent(). */ - sa.sa.sa_handler = SIG_IGN; - sa.sa.sa_flags = 0; - siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); - do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); for (;;) { prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); -- cgit v1.2.3 From 10ab825bdef8df510f99c703a5a2d9b13a4e31a5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:37 -0700 Subject: change kernel threads to ignore signals instead of blocking them Currently kernel threads use sigprocmask(SIG_BLOCK) to protect against signals. This doesn't prevent the signal delivery, this only blocks signal_wake_up(). Every "killall -33 kthreadd" means a "struct siginfo" leak. Change kthreadd_setup() to set all handlers to SIG_IGN instead of blocking them (make a new helper ignore_signals() for that). If the kernel thread needs some signal, it should use allow_signal() anyway, and in that case it should not use CLONE_SIGHAND. Note that we can't change daemonize() (should die!) in the same way, because it can be used along with CLONE_SIGHAND. This means that allow_signal() still should unblock the signal to work correctly with daemonize()ed threads. However, disallow_signal() doesn't block the signal any longer but ignores it. NOTE: with or without this patch the kernel threads are not protected from handle_stop_signal(), this seems harmless, but not good. Signed-off-by: Oleg Nesterov Acked-by: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/kthread.c | 17 +++-------------- kernel/signal.c | 10 ++++++++++ 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index bc982cd72743..b0c6f0c3a2df 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -347,7 +347,7 @@ int disallow_signal(int sig) return -EINVAL; spin_lock_irq(¤t->sighand->siglock); - sigaddset(¤t->blocked, sig); + current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); return 0; diff --git a/kernel/kthread.c b/kernel/kthread.c index 0eb0070a3c57..df8a8e8f6ca4 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -215,24 +215,13 @@ EXPORT_SYMBOL(kthread_stop); static __init void kthreadd_setup(void) { struct task_struct *tsk = current; - struct k_sigaction sa; - sigset_t blocked; set_task_comm(tsk, "kthreadd"); - /* Block and flush all signals */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(tsk); + ignore_signals(tsk); - /* SIG_IGN makes children autoreap: see do_notify_parent(). */ - sa.sa.sa_handler = SIG_IGN; - sa.sa.sa_flags = 0; - siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); - do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); - - set_user_nice(current, -5); - set_cpus_allowed(current, CPU_MASK_ALL); + set_user_nice(tsk, -5); + set_cpus_allowed(tsk, CPU_MASK_ALL); } int kthreadd(void *unused) diff --git a/kernel/signal.c b/kernel/signal.c index 23ae6d62fc41..2ac3a668d9dd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -209,6 +209,16 @@ void flush_signals(struct task_struct *t) spin_unlock_irqrestore(&t->sighand->siglock, flags); } +void ignore_signals(struct task_struct *t) +{ + int i; + + for (i = 0; i < _NSIG; ++i) + t->sighand->action[i].sa.sa_handler = SIG_IGN; + + flush_signals(t); +} + /* * Flush all handlers for a task. */ -- cgit v1.2.3 From 7b0834c26fd796c79dfcc3939ed2b9122b75246f Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 9 May 2007 02:34:41 -0700 Subject: Remove kthread_bind() call from _cpu_down() We are anyway kthread_stop()ping other per-cpu kernel threads after move_task_off_dead_cpu(), so we can do it with the stop_machine_run thread as well. I just checked with Vatsa if there was any subtle reason why they had put in the kthread_bind() in cpu.c. Vatsa cannot seem to recollect any and I can't see any. So let us just remove the kthread_bind. Signed-off-by: Gautham R Shenoy Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 1a823944e972..28cb6c71a47a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -175,10 +175,6 @@ static int _cpu_down(unsigned int cpu) /* This actually kills the CPU. */ __cpu_die(cpu); - /* Move it here so it can run. */ - kthread_bind(p, get_cpu()); - put_cpu(); - /* CPU is completely dead: tell everyone. Too late to complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, hcpu) == NOTIFY_BAD) BUG(); -- cgit v1.2.3 From 6e84d644b5929789398914b0ccf447355dec6fb0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 May 2007 02:34:46 -0700 Subject: make cancel_rearming_delayed_work() reliable Thanks to Jarek Poplawski for the ideas and for spotting the bug in the initial draft patch. 
cancel_rearming_delayed_work() currently has many limitations, because it requires that dwork always re-arms itself via queue_delayed_work(). So it hangs forever if dwork doesn't do this, or cancel_rearming_delayed_work()/cancel_delayed_work() was already called. It uses flush_workqueue() in a loop, so it can't be used if the workqueue was frozen, and it is potentially livelockable on a busy system if delay is small. With this patch cancel_rearming_delayed_work() doesn't make any assumptions about dwork, it can re-arm itself via queue_delayed_work(), or queue_work(), or do nothing. As a "side effect", cancel_work_sync() was changed to handle re-arming works as well. Disadvantages: - this patch adds wmb() to insert_work(). - slows down the fast path (when del_timer() succeeds on entry) of cancel_rearming_delayed_work(), because wait_on_work() is called unconditionally. In that case, compared to the old version, we are doing "unneeded" lock/unlock for each online CPU. On the other hand, this means we don't need to use cancel_work_sync() after cancel_rearming_delayed_work(). - complicates the code (.text grows by 130 bytes). [akpm@linux-foundation.org: fix speling] Signed-off-by: Oleg Nesterov Cc: David Chinner Cc: David Howells Cc: Gautham Shenoy Acked-by: Jarek Poplawski Cc: Srivatsa Vaddagiri Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 140 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 91 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 25cee1afe6fb..b976ed87dd37 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -120,6 +120,11 @@ static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, int tail) { set_wq_data(work, cwq); + /* + * Ensure that we get the right work->data if we see the + * result of list_add() below, see try_to_grab_pending(). + */ + smp_wmb(); if (tail) list_add_tail(&work->entry, &cwq->worklist); else @@ -383,7 +388,46 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(flush_workqueue); -static void wait_on_work(struct cpu_workqueue_struct *cwq, +/* + * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit, + * so this work can't be re-armed in any way. + */ +static int try_to_grab_pending(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + int ret = 0; + + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + return 1; + + /* + * The queueing is in progress, or it is already queued. Try to + * steal it from ->worklist without clearing WORK_STRUCT_PENDING. + */ + + cwq = get_wq_data(work); + if (!cwq) + return ret; + + spin_lock_irq(&cwq->lock); + if (!list_empty(&work->entry)) { + /* + * This work is queued, but perhaps we locked the wrong cwq. + * In that case we must see the new value after rmb(), see + * insert_work()->wmb(). + */ + smp_rmb(); + if (cwq == get_wq_data(work)) { + list_del_init(&work->entry); + ret = 1; + } + } + spin_unlock_irq(&cwq->lock); + + return ret; +} + +static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) { struct wq_barrier barr; @@ -400,20 +444,7 @@ static void wait_on_work(struct cpu_workqueue_struct *cwq, wait_for_completion(&barr.done); } -/** - * cancel_work_sync - block until a work_struct's callback has terminated - * @work: the work which is to be flushed - * - * cancel_work_sync() will attempt to cancel the work if it is queued.
If the - * work's callback appears to be running, cancel_work_sync() will block until - * it has completed. - * - * cancel_work_sync() is designed to be used when the caller is tearing down - * data structures which the callback function operates upon. It is expected - * that, prior to calling cancel_work_sync(), the caller has arranged for the - * work to not be requeued. - */ -void cancel_work_sync(struct work_struct *work) +static void wait_on_work(struct work_struct *work) { struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; @@ -423,29 +454,62 @@ void cancel_work_sync(struct work_struct *work) might_sleep(); cwq = get_wq_data(work); - /* Was it ever queued ? */ if (!cwq) return; - /* - * This work can't be re-queued, no need to re-check that - * get_wq_data() is still the same when we take cwq->lock. - */ - spin_lock_irq(&cwq->lock); - list_del_init(&work->entry); - work_clear_pending(work); - spin_unlock_irq(&cwq->lock); - wq = cwq->wq; cpu_map = wq_cpu_map(wq); for_each_cpu_mask(cpu, *cpu_map) - wait_on_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); +} + +/** + * cancel_work_sync - block until a work_struct's callback has terminated + * @work: the work which is to be flushed + * + * cancel_work_sync() will cancel the work if it is queued. If the work's + * callback appears to be running, cancel_work_sync() will block until it + * has completed. + * + * It is possible to use this function if the work re-queues itself. It can + * cancel the work even if it migrates to another workqueue, however in that + * case it only guarantees that work->func() has completed on the last queued + * workqueue. + * + * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not + * pending, otherwise it goes into a busy-wait loop until the timer expires. + * + * The caller must ensure that workqueue_struct on which this work was last + * queued can't be destroyed before this function returns. + */ +void cancel_work_sync(struct work_struct *work) +{ + while (!try_to_grab_pending(work)) + cpu_relax(); + wait_on_work(work); + work_clear_pending(work); } EXPORT_SYMBOL_GPL(cancel_work_sync); +/** + * cancel_rearming_delayed_work - reliably kill off a delayed work. + * @dwork: the delayed work struct + * + * It is possible to use this function if @dwork rearms itself via queue_work() + * or queue_delayed_work(). See also the comment for cancel_work_sync(). + */ +void cancel_rearming_delayed_work(struct delayed_work *dwork) +{ + while (!del_timer(&dwork->timer) && + !try_to_grab_pending(&dwork->work)) + cpu_relax(); + wait_on_work(&dwork->work); + work_clear_pending(&dwork->work); +} +EXPORT_SYMBOL(cancel_rearming_delayed_work); -static struct workqueue_struct *keventd_wq; +static struct workqueue_struct *keventd_wq __read_mostly; /** * schedule_work - put work task in global workqueue @@ -531,28 +595,6 @@ void flush_scheduled_work(void) } EXPORT_SYMBOL(flush_scheduled_work); -/** - * cancel_rearming_delayed_work - kill off a delayed work whose handler rearms the delayed work. - * @dwork: the delayed work struct - * - * Note that the work callback function may still be running on return from - * cancel_delayed_work(). Run flush_workqueue() or cancel_work_sync() to wait - * on it. - */ -void cancel_rearming_delayed_work(struct delayed_work *dwork) -{ - struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - - /* Was it ever queued ? 
*/ - if (cwq != NULL) { - struct workqueue_struct *wq = cwq->wq; - - while (!cancel_delayed_work(dwork)) - flush_workqueue(wq); - } -} -EXPORT_SYMBOL(cancel_rearming_delayed_work); - /** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute -- cgit v1.2.3 From ec92d08292d3e9b0823eba138a4564d2d39f25c7 Mon Sep 17 00:00:00 2001 From: Pierre Peiffer Date: Wed, 9 May 2007 02:35:00 -0700 Subject: futex priority based wakeup Today, all threads waiting for a given futex are woken in FIFO order (first waiter woken first) instead of priority order. This patch makes use of plist (priority ordered lists) instead of simple list in futex_hash_bucket. All non-RT threads are stored with priority MAX_RT_PRIO, causing them to be woken last, in FIFO order (RT-threads are woken first, in priority order). Signed-off-by: Sebastien Dugue Signed-off-by: Pierre Peiffer Cc: Ingo Molnar Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 78 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 600bc9d801f2..685ee2362a5e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -81,12 +81,12 @@ struct futex_pi_state { * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. - * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. + * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakup is always to make the first condition true, then * wake up q->waiters, then make the second condition true. */ struct futex_q { - struct list_head list; + struct plist_node list; wait_queue_head_t waiters; /* Which hash list lock to use: */ @@ -108,8 +108,8 @@ struct futex_q { * Split the global futex_lock into every hash list lock. */ struct futex_hash_bucket { - spinlock_t lock; - struct list_head chain; + spinlock_t lock; + struct plist_head chain; }; static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; head = &hb->chain; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (match_futex(&this->key, &me->key)) { /* * Another waiter already exists - bump up @@ -513,12 +513,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) */ static void wake_futex(struct futex_q *q) { - list_del_init(&q->list); + plist_del(&q->list, &q->list.plist); if (q->filp) send_sigio(&q->filp->f_owner, q->fd, POLL_IN); /* * The lock in wake_up_all() is a crucial memory barrier after the - * list_del_init() and also before assigning to q->lock_ptr.
*/ wake_up_all(&q->waiters); /* @@ -633,7 +633,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct list_head *head; + struct plist_head *head; union futex_key key; int ret; @@ -647,7 +647,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) spin_lock(&hb->lock); head = &hb->chain; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { if (this->pi_state) { ret = -EINVAL; @@ -675,7 +675,7 @@ futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, { union futex_key key1, key2; struct futex_hash_bucket *hb1, *hb2; - struct list_head *head; + struct plist_head *head; struct futex_q *this, *next; int ret, op_ret, attempt = 0; @@ -748,7 +748,7 @@ retry: head = &hb1->chain; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { wake_futex(this); if (++ret >= nr_wake) @@ -760,7 +760,7 @@ retry: head = &hb2->chain; op_ret = 0; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key2)) { wake_futex(this); if (++op_ret >= nr_wake2) @@ -787,7 +787,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, { union futex_key key1, key2; struct futex_hash_bucket *hb1, *hb2; - struct list_head *head1; + struct plist_head *head1; struct futex_q *this, *next; int ret, drop_count = 0; @@ -836,7 +836,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, } head1 = &hb1->chain; - list_for_each_entry_safe(this, next, head1, list) { + plist_for_each_entry_safe(this, next, head1, list) { if (!match_futex (&this->key, &key1)) continue; if (++ret <= nr_wake) { @@ -847,9 +847,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, * requeue. */ if (likely(head1 != &hb2->chain)) { - list_move_tail(&this->list, &hb2->chain); + plist_del(&this->list, &hb1->chain); + plist_add(&this->list, &hb2->chain); this->lock_ptr = &hb2->lock; - } +#ifdef CONFIG_DEBUG_PI_LIST + this->list.plist.lock = &hb2->lock; +#endif + } this->key = key2; get_futex_key_refs(&key2); drop_count++; @@ -894,7 +898,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { - list_add_tail(&q->list, &hb->chain); + int prio; + + /* + * The priority used to register this element is + * - either the real thread-priority for the real-time threads + * (i.e. threads with a priority lower than MAX_RT_PRIO) + * - or MAX_RT_PRIO for non-RT threads. + * Thus, all RT-threads are woken first in priority order, and + * the others are woken last, in FIFO order. 
+ */ + prio = min(current->normal_prio, MAX_RT_PRIO); + + plist_node_init(&q->list, prio); +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb->lock; +#endif + plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); } @@ -949,8 +969,8 @@ static int unqueue_me(struct futex_q *q) spin_unlock(lock_ptr); goto retry; } - WARN_ON(list_empty(&q->list)); - list_del(&q->list); + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); BUG_ON(q->pi_state); @@ -968,8 +988,8 @@ static int unqueue_me(struct futex_q *q) */ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) { - WARN_ON(list_empty(&q->list)); - list_del(&q->list); + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); BUG_ON(!q->pi_state); free_pi_state(q->pi_state); @@ -1065,11 +1085,11 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&q.waiters, &wait); /* - * !list_empty() is safe here without any lock. + * !plist_node_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ time_left = 0; - if (likely(!list_empty(&q.list))) { + if (likely(!plist_node_empty(&q.list))) { unsigned long rel_time; if (timed) { @@ -1384,7 +1404,7 @@ static int futex_unlock_pi(u32 __user *uaddr) struct futex_hash_bucket *hb; struct futex_q *this, *next; u32 uval; - struct list_head *head; + struct plist_head *head; union futex_key key; int ret, attempt = 0; @@ -1435,7 +1455,7 @@ retry_locked: */ head = &hb->chain; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (!match_futex (&this->key, &key)) continue; ret = wake_futex_pi(uaddr, uval, this); @@ -1509,10 +1529,10 @@ static unsigned int futex_poll(struct file *filp, poll_wait(filp, &q->waiters, wait); /* - * list_empty() is safe here without any lock. + * plist_node_empty() is safe here without any lock. * q->lock_ptr != 0 is not safe, because of ordering against wakeup. */ - if (list_empty(&q->list)) + if (plist_node_empty(&q->list)) ret = POLLIN | POLLRDNORM; return ret; @@ -1895,7 +1915,7 @@ static int __init init(void) } for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - INIT_LIST_HEAD(&futex_queues[i].chain); + plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); spin_lock_init(&futex_queues[i].lock); } return 0; -- cgit v1.2.3 From c19384b5b296905d4988c7c684ff540a0f9d65be Mon Sep 17 00:00:00 2001 From: Pierre Peiffer Date: Wed, 9 May 2007 02:35:02 -0700 Subject: Make futex_wait() use an hrtimer for timeout This patch modifies futex_wait() to use an hrtimer + schedule() in place of schedule_timeout(). schedule_timeout() is tick based, therefore the timeout granularity is the tick (1 ms, 4 ms or 10 ms depending on HZ). By using a high resolution timer for timeout wakeup, we can attain a much finer timeout granularity (in the microsecond range). This parallels what is already done for futex_lock_pi(). The timeout passed to the syscall is no longer converted to jiffies and is therefore passed to do_futex() and futex_wait() as an absolute ktime_t therefore keeping nanosecond resolution. Also this removes the need to pass the nanoseconds timeout part to futex_lock_pi() in val2. In futex_wait(), if there is no timeout then a regular schedule() is performed. Otherwise, an hrtimer is fired before schedule() is called. 
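[editor's note: an illustrative user-space counterpart, not part of the patch. It requests a 250-microsecond FUTEX_WAIT timeout on a futex word nobody will wake; on a tick-based kernel the sleep rounds up to a whole tick, while with this hrtimer conversion it expires close to the requested time. The futex_wait() wrapper is hypothetical; FUTEX_WAIT takes a relative timespec, which the kernel now turns into an absolute ktime_t as described above.]

#include <errno.h>
#include <linux/futex.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static long futex_wait(int *uaddr, int val, const struct timespec *rel)
{
	/* args: uaddr, op, val, utime, uaddr2, val3 */
	return syscall(SYS_futex, uaddr, FUTEX_WAIT, val, rel, NULL, 0);
}

int main(void)
{
	int word = 0;
	struct timespec rel = { .tv_sec = 0, .tv_nsec = 250 * 1000 };

	if (futex_wait(&word, 0, &rel) < 0 && errno == ETIMEDOUT)
		printf("timed out, as expected\n");
	return 0;
}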
[akpm@linux-foundation.org: fix `make headers_check'] Signed-off-by: Sebastien Dugue Signed-off-by: Pierre Peiffer Cc: Ingo Molnar Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 89 ++++++++++++++++++++++++++------------------------- kernel/futex_compat.c | 19 ++++++----- 2 files changed, 54 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 685ee2362a5e..e1246ccbf89a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1001,16 +1001,16 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) } static long futex_wait_restart(struct restart_block *restart); -static int futex_wait_abstime(u32 __user *uaddr, u32 val, - int timed, unsigned long abs_time) +static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; - unsigned long time_left = 0; u32 uval; int ret; + struct hrtimer_sleeper t; + int rem = 0; q.pi_state = NULL; retry: @@ -1088,20 +1088,29 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, * !plist_node_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ - time_left = 0; if (likely(!plist_node_empty(&q.list))) { - unsigned long rel_time; - - if (timed) { - unsigned long now = jiffies; - if (time_after(now, abs_time)) - rel_time = 0; - else - rel_time = abs_time - now; - } else - rel_time = MAX_SCHEDULE_TIMEOUT; + if (!abs_time) + schedule(); + else { + hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(&t, current); + t.timer.expires = *abs_time; + + hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); + + /* + * the timer could have already expired, in which + * case current would be flagged for rescheduling. + * Don't bother calling schedule. + */ + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); - time_left = schedule_timeout(rel_time); + /* Flag if a timeout occured */ + rem = (t.task == NULL); + } } __set_current_state(TASK_RUNNING); @@ -1113,14 +1122,14 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, /* If we were woken (and unqueued), we succeeded, whatever. */ if (!unqueue_me(&q)) return 0; - if (time_left == 0) + if (rem) return -ETIMEDOUT; /* * We expect signal_pending(current), but another thread may * have handled it for us already. 
*/ - if (time_left == MAX_SCHEDULE_TIMEOUT) + if (!abs_time) return -ERESTARTSYS; else { struct restart_block *restart; @@ -1128,8 +1137,7 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, restart->fn = futex_wait_restart; restart->arg0 = (unsigned long)uaddr; restart->arg1 = (unsigned long)val; - restart->arg2 = (unsigned long)timed; - restart->arg3 = abs_time; + restart->arg2 = (unsigned long)abs_time; return -ERESTART_RESTARTBLOCK; } @@ -1141,21 +1149,15 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, return ret; } -static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time) -{ - int timed = (rel_time != MAX_SCHEDULE_TIMEOUT); - return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time); -} static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->arg0; u32 val = (u32)restart->arg1; - int timed = (int)restart->arg2; - unsigned long abs_time = restart->arg3; + ktime_t *abs_time = (ktime_t *)restart->arg2; restart->fn = do_no_restart_syscall; - return (long)futex_wait_abstime(uaddr, val, timed, abs_time); + return (long)futex_wait(uaddr, val, abs_time); } @@ -1165,8 +1167,8 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, - long nsec, int trylock) +static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, + int trylock) { struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *curr = current; @@ -1178,11 +1180,11 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (refill_pi_state_cache()) return -ENOMEM; - if (sec != MAX_SCHEDULE_TIMEOUT) { + if (time) { to = &timeout; hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); - to->timer.expires = ktime_set(sec, nsec); + to->timer.expires = *time; } q.pi_state = NULL; @@ -1818,7 +1820,7 @@ void exit_robust_list(struct task_struct *curr) } } -long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, +long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { int ret; @@ -1844,13 +1846,13 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, val, timeout, val2, 0); + ret = futex_lock_pi(uaddr, val, timeout, 0); break; case FUTEX_UNLOCK_PI: ret = futex_unlock_pi(uaddr); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); + ret = futex_lock_pi(uaddr, 0, timeout, 1); break; default: ret = -ENOSYS; @@ -1863,21 +1865,20 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, struct timespec __user *utime, u32 __user *uaddr2, u32 val3) { - struct timespec t; - unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct timespec ts; + ktime_t t, *tp = NULL; u32 val2 = 0; if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { - if (copy_from_user(&t, utime, sizeof(t)) != 0) + if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; - if (!timespec_valid(&t)) + if (!timespec_valid(&ts)) return -EINVAL; + + t = timespec_to_ktime(ts); if (op == FUTEX_WAIT) - timeout = timespec_to_jiffies(&t) + 1; - else { - timeout = t.tv_sec; - val2 = t.tv_nsec; - } + t = ktime_add(ktime_get(), t); + tp = &t; } /* * requeue parameter in 
'utime' if op == FUTEX_REQUEUE. @@ -1885,7 +1886,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (u32) (unsigned long) utime; - return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } static int futexfs_get_sb(struct file_system_type *fs_type, diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 50f24eea6cd0..dff27c471ea6 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -141,24 +141,23 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, struct compat_timespec __user *utime, u32 __user *uaddr2, u32 val3) { - struct timespec t; - unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct timespec ts; + ktime_t t, *tp = NULL; int val2 = 0; if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { - if (get_compat_timespec(&t, utime)) + if (get_compat_timespec(&ts, utime)) return -EFAULT; - if (!timespec_valid(&t)) + if (!timespec_valid(&ts)) return -EINVAL; + + t = timespec_to_ktime(ts); if (op == FUTEX_WAIT) - timeout = timespec_to_jiffies(&t) + 1; - else { - timeout = t.tv_sec; - val2 = t.tv_nsec; - } + t = ktime_add(ktime_get(), t); + tp = &t; } if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (int) (unsigned long) utime; - return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -- cgit v1.2.3 From d0aa7a70bf03b9de9e995ab272293be1f7937822 Mon Sep 17 00:00:00 2001 From: Pierre Peiffer Date: Wed, 9 May 2007 02:35:02 -0700 Subject: futex_requeue_pi optimization This patch provides the futex_requeue_pi functionality, which allows some threads waiting on a normal futex to be requeued on the wait-queue of a PI-futex. This provides an optimization, already used for (normal) futexes, to be used with the PI-futexes. This optimization is currently used by the glibc in pthread_broadcast, when using "normal" mutexes. With futex_requeue_pi, it can be used with PRIO_INHERIT mutexes too. Signed-off-by: Pierre Peiffer Cc: Ingo Molnar Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 541 +++++++++++++++++++++++++++++++++++++++++++----- kernel/futex_compat.c | 3 +- kernel/rtmutex.c | 41 +--- kernel/rtmutex_common.h | 34 +++ 4 files changed, 532 insertions(+), 87 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e1246ccbf89a..4a60ef55dab4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -53,6 +53,12 @@ #include "rtmutex_common.h" +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* @@ -102,6 +108,12 @@ struct futex_q { /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; struct task_struct *task; + + /* + * This waiter is used in case of requeue from a + * normal futex to a PI-futex + */ + struct rt_mutex_waiter waiter; }; /* @@ -180,6 +192,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key) if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; + /* Save the user address in the ley */ + key->uaddr = uaddr; + /* * Private mappings are handled in a simple way. 
* @@ -439,7 +454,8 @@ void exit_pi_state_list(struct task_struct *curr) } static int -lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, + union futex_key *key, struct futex_pi_state **ps) { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; @@ -450,7 +466,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) head = &hb->chain; plist_for_each_entry_safe(this, next, head, list) { - if (match_futex(&this->key, &me->key)) { + if (match_futex(&this->key, key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: @@ -465,7 +481,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) WARN_ON(!atomic_read(&pi_state->refcount)); atomic_inc(&pi_state->refcount); - me->pi_state = pi_state; + *ps = pi_state; return 0; } @@ -492,7 +508,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); /* Store the key for possible exit cleanups: */ - pi_state->key = me->key; + pi_state->key = *key; spin_lock_irq(&p->pi_lock); WARN_ON(!list_empty(&pi_state->list)); @@ -502,7 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) put_task_struct(p); - me->pi_state = pi_state; + *ps = pi_state; return 0; } @@ -562,6 +578,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) */ if (!(uval & FUTEX_OWNER_DIED)) { newval = FUTEX_WAITERS | new_owner->pid; + /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */ + newval |= (uval & FUTEX_WAITER_REQUEUED); pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); @@ -665,6 +683,254 @@ out: return ret; } +/* + * Called from futex_requeue_pi. + * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the + * PI-futex value; search its associated pi_state if an owner exist + * or create a new one without owner. + */ +static inline int +lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **pi_state) +{ + u32 curval, uval, newval; + +retry: + /* + * We can't handle a fault cleanly because we can't + * release the locks here. Simply return the fault. + */ + if (get_futex_value_locked(&curval, uaddr)) + return -EFAULT; + + /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */ + if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) + != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) { + /* + * No waiters yet, we prepare the futex to have some waiters. 
+ */ + + uval = curval; + newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED; + + pagefault_disable(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + pagefault_enable(); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + if (unlikely(curval != uval)) + goto retry; + } + + if (!(curval & FUTEX_TID_MASK) + || lookup_pi_state(curval, hb, key, pi_state)) { + /* the futex has no owner (yet) or the lookup failed: + allocate one pi_state without owner */ + + *pi_state = alloc_pi_state(); + + /* Already stores the key: */ + (*pi_state)->key = *key; + + /* init the mutex without owner */ + __rt_mutex_init(&(*pi_state)->pi_mutex, NULL); + } + + return 0; +} + +/* + * Keep the first nr_wake waiter from futex1, wake up one, + * and requeue the next nr_requeue waiters following hashed on + * one physical page to another physical page (PI-futex uaddr2) + */ +static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2, + int nr_wake, int nr_requeue, u32 *cmpval) +{ + union futex_key key1, key2; + struct futex_hash_bucket *hb1, *hb2; + struct plist_head *head1; + struct futex_q *this, *next; + struct futex_pi_state *pi_state2 = NULL; + struct rt_mutex_waiter *waiter, *top_waiter = NULL; + struct rt_mutex *lock2 = NULL; + int ret, drop_count = 0; + + if (refill_pi_state_cache()) + return -ENOMEM; + +retry: + /* + * First take all the futex related locks: + */ + down_read(&current->mm->mmap_sem); + + ret = get_futex_key(uaddr1, &key1); + if (unlikely(ret != 0)) + goto out; + ret = get_futex_key(uaddr2, &key2); + if (unlikely(ret != 0)) + goto out; + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); + + double_lock_hb(hb1, hb2); + + if (likely(cmpval != NULL)) { + u32 curval; + + ret = get_futex_value_locked(&curval, uaddr1); + + if (unlikely(ret)) { + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); + + /* + * If we would have faulted, release mmap_sem, fault + * it in and start all over again. + */ + up_read(&current->mm->mmap_sem); + + ret = get_user(curval, uaddr1); + + if (!ret) + goto retry; + + return ret; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; + } + } + + head1 = &hb1->chain; + plist_for_each_entry_safe(this, next, head1, list) { + if (!match_futex (&this->key, &key1)) + continue; + if (++ret <= nr_wake) { + wake_futex(this); + } else { + /* + * FIRST: get and set the pi_state + */ + if (!pi_state2) { + int s; + /* do this only the first time we requeue someone */ + s = lookup_pi_state_for_requeue(uaddr2, hb2, + &key2, &pi_state2); + if (s) { + ret = s; + goto out_unlock; + } + + lock2 = &pi_state2->pi_mutex; + spin_lock(&lock2->wait_lock); + + /* Save the top waiter of the wait_list */ + if (rt_mutex_has_waiters(lock2)) + top_waiter = rt_mutex_top_waiter(lock2); + } else + atomic_inc(&pi_state2->refcount); + + + this->pi_state = pi_state2; + + /* + * SECOND: requeue futex_q to the correct hashbucket + */ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue.
+ */ + if (likely(head1 != &hb2->chain)) { + plist_del(&this->list, &hb1->chain); + plist_add(&this->list, &hb2->chain); + this->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST + this->list.plist.lock = &hb2->lock; +#endif + } + this->key = key2; + get_futex_key_refs(&key2); + drop_count++; + + + /* + * THIRD: queue it to lock2 + */ + spin_lock_irq(&this->task->pi_lock); + waiter = &this->waiter; + waiter->task = this->task; + waiter->lock = lock2; + plist_node_init(&waiter->list_entry, this->task->prio); + plist_node_init(&waiter->pi_list_entry, this->task->prio); + plist_add(&waiter->list_entry, &lock2->wait_list); + this->task->pi_blocked_on = waiter; + spin_unlock_irq(&this->task->pi_lock); + + if (ret - nr_wake >= nr_requeue) + break; + } + } + + /* If we've requeued some tasks and the top_waiter of the rt_mutex + has changed, we must adjust the priority of the owner, if any */ + if (drop_count) { + struct task_struct *owner = rt_mutex_owner(lock2); + if (owner && + (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) { + int chain_walk = 0; + + spin_lock_irq(&owner->pi_lock); + if (top_waiter) + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + else + /* + * There was no waiters before the requeue, + * the flag must be updated + */ + mark_rt_mutex_waiters(lock2); + + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) { + chain_walk = 1; + get_task_struct(owner); + } + + spin_unlock_irq(&owner->pi_lock); + spin_unlock(&lock2->wait_lock); + + if (chain_walk) + rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL, + current); + } else { + /* No owner or the top_waiter does not change */ + mark_rt_mutex_waiters(lock2); + spin_unlock(&lock2->wait_lock); + } + } + +out_unlock: + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); + + /* drop_futex_key_refs() must be called outside the spinlocks. */ + while (--drop_count >= 0) + drop_futex_key_refs(&key1); + +out: + up_read(&current->mm->mmap_sem); + return ret; +} + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: @@ -984,9 +1250,10 @@ static int unqueue_me(struct futex_q *q) /* * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock is held on entry and dropped here. + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry + * and dropped here. */ -static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) +static void unqueue_me_pi(struct futex_q *q) { WARN_ON(plist_node_empty(&q->list)); plist_del(&q->list, &q->list.plist); @@ -995,11 +1262,65 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) free_pi_state(q->pi_state); q->pi_state = NULL; - spin_unlock(&hb->lock); + spin_unlock(q->lock_ptr); drop_futex_key_refs(&q->key); } +/* + * Fixup the pi_state owner with current. + * + * The cur->mm semaphore must be held, it is released at return of this + * function. + */ +static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct futex_hash_bucket *hb, + struct task_struct *curr) +{ + u32 newtid = curr->pid | FUTEX_WAITERS; + struct futex_pi_state *pi_state = q->pi_state; + u32 uval, curval, newval; + int ret; + + /* Owner died?
*/ + if (pi_state->owner != NULL) { + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + } else + newtid |= FUTEX_OWNER_DIED; + + pi_state->owner = curr; + + spin_lock_irq(&curr->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &curr->pi_state_list); + spin_unlock_irq(&curr->pi_lock); + + /* Unqueue and drop the lock */ + unqueue_me_pi(q); + up_read(&curr->mm->mmap_sem); + /* + * We own it, so we have to replace the pending owner + * TID. This must be atomic as we have preserve the + * owner died bit here. + */ + ret = get_user(uval, uaddr); + while (!ret) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + newval |= (uval & FUTEX_WAITER_REQUEUED); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + if (curval == -EFAULT) + ret = -EFAULT; + if (curval == uval) + break; + uval = curval; + } + return ret; +} + static long futex_wait_restart(struct restart_block *restart); static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) { @@ -1009,7 +1330,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) struct futex_q q; u32 uval; int ret; - struct hrtimer_sleeper t; + struct hrtimer_sleeper t, *to = NULL; int rem = 0; q.pi_state = NULL; @@ -1063,6 +1384,14 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) if (uval != val) goto out_unlock_release_sem; + /* + * This rt_mutex_waiter structure is prepared here and will + * be used only if this task is requeued from a normal futex to + * a PI-futex with futex_requeue_pi. + */ + debug_rt_mutex_init_waiter(&q.waiter); + q.waiter.task = NULL; + /* Only actually queue if *uaddr contained val. */ __queue_me(&q, hb); @@ -1092,6 +1421,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) if (!abs_time) schedule(); else { + to = &t; hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); t.timer.expires = *abs_time; @@ -1119,6 +1449,66 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) * we are the only user of it. */ + if (q.pi_state) { + /* + * We were woken but have been requeued on a PI-futex. + * We have to complete the lock acquisition by taking + * the rtmutex. + */ + + struct rt_mutex *lock = &q.pi_state->pi_mutex; + + spin_lock(&lock->wait_lock); + if (unlikely(q.waiter.task)) { + remove_waiter(lock, &q.waiter); + } + spin_unlock(&lock->wait_lock); + + if (rem) + ret = -ETIMEDOUT; + else + ret = rt_mutex_timed_lock(lock, to, 1); + + down_read(&curr->mm->mmap_sem); + spin_lock(q.lock_ptr); + + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (!ret && q.pi_state->owner != curr) { + /* + * We MUST play with the futex we were requeued on, + * NOT the current futex. + * We can retrieve it from the key of the pi_state + */ + uaddr = q.pi_state->key.uaddr; + + /* mmap_sem and hash_bucket lock are unlocked at + return of this function */ + ret = fixup_pi_state_owner(uaddr, &q, hb, curr); + } else { + /* + * Catch the rare case, where the lock was released + * when we were on the way back before we locked + * the hash bucket. 
+ */ + if (ret && q.pi_state->owner == curr) { + if (rt_mutex_trylock(&q.pi_state->pi_mutex)) + ret = 0; + } + /* Unqueue and drop the lock */ + unqueue_me_pi(&q); + up_read(&curr->mm->mmap_sem); + } + + debug_rt_mutex_free_waiter(&q.waiter); + + return ret; + } + + debug_rt_mutex_free_waiter(&q.waiter); + /* If we were woken (and unqueued), we succeeded, whatever. */ if (!unqueue_me(&q)) return 0; @@ -1161,6 +1551,51 @@ static long futex_wait_restart(struct restart_block *restart) } +static void set_pi_futex_owner(struct futex_hash_bucket *hb, + union futex_key *key, struct task_struct *p) +{ + struct plist_head *head; + struct futex_q *this, *next; + struct futex_pi_state *pi_state = NULL; + struct rt_mutex *lock; + + /* Search a waiter that should already exists */ + + head = &hb->chain; + + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, key)) { + pi_state = this->pi_state; + break; + } + } + + BUG_ON(!pi_state); + + /* set p as pi_state's owner */ + lock = &pi_state->pi_mutex; + + spin_lock(&lock->wait_lock); + spin_lock_irq(&p->pi_lock); + + list_add(&pi_state->list, &p->pi_state_list); + pi_state->owner = p; + + + /* set p as pi_mutex's owner */ + debug_rt_mutex_proxy_lock(lock, p); + WARN_ON(rt_mutex_owner(lock)); + rt_mutex_set_owner(lock, p, 0); + rt_mutex_deadlock_account_lock(lock, p); + + plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry, + &p->pi_waiters); + __rt_mutex_adjust_prio(p); + + spin_unlock_irq(&p->pi_lock); + spin_unlock(&lock->wait_lock); +} + /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. The kernel side here does the whole locking operation: @@ -1175,7 +1610,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, struct futex_hash_bucket *hb; u32 uval, newval, curval; struct futex_q q; - int ret, attempt = 0; + int ret, lock_held, attempt = 0; if (refill_pi_state_cache()) return -ENOMEM; @@ -1198,6 +1633,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, hb = queue_lock(&q, -1, NULL); retry_locked: + lock_held = 0; + /* * To avoid races, we attempt to take the lock here again * (by doing a 0 -> TID atomic cmpxchg), while holding all @@ -1216,7 +1653,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { if (!detect && 0) force_sig(SIGKILL, current); - ret = -EDEADLK; + /* + * Normally, this check is done in user space. + * In case of requeue, the owner may attempt to lock this futex, + * even if the ownership has already been given by the previous + * waker. + * In the usual case, this is a case of deadlock, but not in case + * of REQUEUE_PI. + */ + if (!(curval & FUTEX_WAITER_REQUEUED)) + ret = -EDEADLK; goto out_unlock_release_sem; } @@ -1228,7 +1674,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, goto out_unlock_release_sem; uval = curval; - newval = uval | FUTEX_WAITERS; + /* + * In case of a requeue, check if there already is an owner + * If not, just take the futex. 
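The "just take the futex" step described here is the classic uncontended fast path: an atomic 0 -> TID transition on the user-space word. A hedged user-space sketch of that transition follows; gettid() is invoked via syscall(), since the C library of this era does not wrap it.

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Try to take a PI-style futex: succeeds only on a 0 -> TID transition.
 * Returns 1 on success, 0 if the word already holds an owner TID. */
static int futex_trylock_fastpath(uint32_t *uaddr)
{
	uint32_t expected = 0;
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	return __atomic_compare_exchange_n(uaddr, &expected, tid, 0,
					   __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

int main(void)
{
	uint32_t futex_word = 0;

	return futex_trylock_fastpath(&futex_word) ? 0 : 1;
}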
+ */ + if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) { + /* set current as futex owner */ + newval = curval | current->pid; + lock_held = 1; + } else + /* Set the WAITERS flag, so the owner will know it has someone + to wake at next unlock */ + newval = curval | FUTEX_WAITERS; pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); @@ -1239,11 +1696,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, if (unlikely(curval != uval)) goto retry_locked; + if (lock_held) { + set_pi_futex_owner(hb, &q.key, curr); + goto out_unlock_release_sem; + } + /* * We dont have the lock. Look up the PI state (or create it if * we are the first waiter): */ - ret = lookup_pi_state(uval, hb, &q); + ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); if (unlikely(ret)) { /* @@ -1306,45 +1768,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case. */ - if (!ret && q.pi_state->owner != curr) { - u32 newtid = current->pid | FUTEX_WAITERS; - - /* Owner died? */ - if (q.pi_state->owner != NULL) { - spin_lock_irq(&q.pi_state->owner->pi_lock); - WARN_ON(list_empty(&q.pi_state->list)); - list_del_init(&q.pi_state->list); - spin_unlock_irq(&q.pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; - - q.pi_state->owner = current; - - spin_lock_irq(¤t->pi_lock); - WARN_ON(!list_empty(&q.pi_state->list)); - list_add(&q.pi_state->list, ¤t->pi_state_list); - spin_unlock_irq(¤t->pi_lock); - - /* Unqueue and drop the lock */ - unqueue_me_pi(&q, hb); - up_read(&curr->mm->mmap_sem); - /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. - */ - ret = get_user(uval, uaddr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = futex_atomic_cmpxchg_inatomic(uaddr, - uval, newval); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - } else { + if (!ret && q.pi_state->owner != curr) + /* mmap_sem is unlocked at return of this function */ + ret = fixup_pi_state_owner(uaddr, &q, hb, curr); + else { /* * Catch the rare case, where the lock was released * when we were on the way back before we locked @@ -1355,7 +1782,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, ret = 0; } /* Unqueue and drop the lock */ - unqueue_me_pi(&q, hb); + unqueue_me_pi(&q); up_read(&curr->mm->mmap_sem); } @@ -1724,6 +2151,8 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + /* Also keep the FUTEX_WAITER_REQUEUED flag if set */ + mval |= (uval & FUTEX_WAITER_REQUEUED); nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); if (nval == -EFAULT) @@ -1854,6 +2283,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_TRYLOCK_PI: ret = futex_lock_pi(uaddr, 0, timeout, 1); break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3); + break; default: ret = -ENOSYS; } @@ -1883,7 +2315,8 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, /* * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 
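The "requeue parameter in 'utime'" convention noted here is a genuine oddity of the futex ABI: for requeue operations, the fourth syscall slot, which normally carries a timeout pointer, carries the integer nr_requeue instead. A hedged user-space sketch of a caller waking one waiter and requeueing up to sixteen more:

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Wake up to nr_wake waiters on f1 and requeue up to nr_requeue more onto
 * f2, provided *f1 still equals val3. nr_requeue travels in the slot that
 * normally holds the timeout pointer. */
static long cmp_requeue(uint32_t *f1, uint32_t *f2,
			int nr_wake, unsigned long nr_requeue, uint32_t val3)
{
	return syscall(SYS_futex, f1, FUTEX_CMP_REQUEUE, nr_wake,
		       nr_requeue, f2, val3);
}

int main(void)
{
	uint32_t a = 0, b = 0;

	/* No waiters exist here, so nothing is woken or requeued; the call
	 * merely demonstrates the argument convention. */
	return (int)cmp_requeue(&a, &b, 1, 16, a);
}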
*/ - if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE + || op == FUTEX_CMP_REQUEUE_PI) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index dff27c471ea6..338a9b489fbc 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -156,7 +156,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = ktime_add(ktime_get(), t); tp = &t; } - if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE + || op == FUTEX_CMP_REQUEUE_PI) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 180978cb2f75..12879f6c1ec3 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -56,7 +56,7 @@ * state. */ -static void +void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, unsigned long mask) { @@ -80,29 +80,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) clear_rt_mutex_waiters(lock); } -/* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up - */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - unsigned long owner, *p = (unsigned long *) &lock->owner; - - do { - owner = *p; - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); -} -#else -# define rt_mutex_cmpxchg(l,c,n) (0) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - lock->owner = (struct task_struct *) - ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); -} -#endif - /* * Calculate task priority from the waiter list priority * @@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task) * * This can be both boosting and unboosting. task->pi_lock must be held. */ -static void __rt_mutex_adjust_prio(struct task_struct *task) +void __rt_mutex_adjust_prio(struct task_struct *task) { int prio = rt_mutex_getprio(task); @@ -159,11 +136,11 @@ int max_lock_depth = 1024; * Decreases task's usage by one - may thus free the task. * Returns 0 or -EDEADLK. 
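The mark_rt_mutex_waiters() helper that this hunk removes (and that the rtmutex_common.h hunk below reinstates) is a hand-rolled atomic OR: load the owner word, set the flag, and cmpxchg until no concurrent owner change intervenes. A user-space sketch of the same idea, assuming the flag occupies bit 0 as RT_MUTEX_HAS_WAITERS does:

#include <stdint.h>

#define HAS_WAITERS 1UL	/* assumed low-bit flag, mirroring RT_MUTEX_HAS_WAITERS */

/* Atomically set the waiters bit in an owner word without disturbing the
 * pointer bits, even if the owner changes under us. */
static void mark_waiters(uintptr_t *owner)
{
	uintptr_t old = __atomic_load_n(owner, __ATOMIC_RELAXED);

	while (!__atomic_compare_exchange_n(owner, &old, old | HAS_WAITERS, 0,
					    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
		;	/* old has been refreshed; retry with the new value */
}

int main(void)
{
	uintptr_t owner = 0;	/* no owner yet */

	mark_waiters(&owner);
	return (owner & HAS_WAITERS) ? 0 : 1;
}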
*/ -static int rt_mutex_adjust_prio_chain(struct task_struct *task, - int deadlock_detect, - struct rt_mutex *orig_lock, - struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task) +int rt_mutex_adjust_prio_chain(struct task_struct *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task) { struct rt_mutex *lock; struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; @@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * * Must be called with lock->wait_lock held */ -static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) +void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 9c75856e791e..242ec7ee740b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -112,6 +112,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; } +/* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + /* * PI-futex support (proxy locking functions, etc.): */ @@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); + +extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, + unsigned long mask); +extern void __rt_mutex_adjust_prio(struct task_struct *task); +extern int rt_mutex_adjust_prio_chain(struct task_struct *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task); +extern void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter); #endif -- cgit v1.2.3 From 34f01cc1f512fa783302982776895c73714ebbc2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 May 2007 02:35:04 -0700 Subject: FUTEX: new PRIVATE futexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Analysis of current linux futex code : -------------------------------------- A central hash table futex_queues[] holds all contexts (futex_q) of waiting threads. Each futex_wait()/futex_wait() has to obtain a spinlock on a hash slot to perform lookups or insert/deletion of a futex_q. When a futex_wait() is done, calling thread has to : 1) - Obtain a read lock on mmap_sem to be able to validate the user pointer (calling find_vma()). 
This validation tells us whether the futex uses an inode-based store (mapped file) or an mm-based store (anonymous memory).
2) - compute a hash key
3) - atomically increment a reference counter on an inode or an mm_struct
4) - lock part of the futex_queues[] hash table
5) - perform the test on the value of the futex. (rollback if value != expected_value: returns EWOULDBLOCK) (various loops if the test triggers mm faults)
6) - queue the context into the hash table, release the lock taken in 4)
7) - release the read lock on mmap_sem
8) - eventually unqueue the context (but rarely, as this part may be done by the futex_wake())

Futexes were designed to improve scalability, but the current implementation has various problems:

- Central hashtable: this means scalability problems if many processes/threads want to use futexes at the same time, and NUMA imbalance because this hashtable is located on one node.

- Using mmap_sem on every futex() syscall: even though mmap_sem is a rw_semaphore, up_read()/down_read() perform atomic ops on mmap_sem, dirtying its cache line: lots of cache line ping-pong on SMP configurations. mmap_sem is also extensively used by mm code (page faults, mmap()/munmap()), so highly threaded processes may suffer from mmap_sem contention. mmap_sem is used by the oprofile code as well; enabling oprofile hurts threaded programs because of contention on the mmap_sem cache line.

- Using atomic_inc()/atomic_dec() on an inode ref counter or mm ref counter: this is also a cache line ping-pong on SMP, and it increases mmap_sem hold time because of the cache misses.

Most of these scalability problems come from the fact that futexes live in one global namespace. As we use a central hash table, we must make sure they all use the same reference (given by the mm subsystem). We chose to force all futexes to be 'shared'. This has a cost. But the fact is that POSIX defined PRIVATE and SHARED, allowing a clear separation, and optimal performance if carefully implemented. The time has come for Linux to have better threading performance.

The goal is to permit new futex commands to avoid:
- taking the mmap_sem semaphore, conflicting with other subsystems;
- modifying a ref_count on an mm or an inode, still conflicting with mm or fs.

This is possible because, for one process using PTHREAD_PROCESS_PRIVATE futexes, we only need to distinguish futexes by their virtual address, no matter what the underlying mm storage is.

If glibc wants to exploit this new infrastructure, it should use the new _PRIVATE futex subcommands for PTHREAD_PROCESS_PRIVATE futexes, and be prepared to fall back on the old subcommands for old kernels. Using one global variable holding either FUTEX_PRIVATE_FLAG or 0 should be OK. PTHREAD_PROCESS_SHARED futexes should still use the old subcommands.

Compatibility with old applications is preserved: they still hit the scalability problems, but new applications can fly :)

Note : the same SHARED futex (mapped on a file) can be used by old binaries *and* new binaries, because both will use the old subcommands.

Note : the vast majority of futexes should be using the PROCESS_PRIVATE semantic, as this is the default semantic.
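As a concrete illustration of the glibc strategy suggested above, here is a hedged user-space sketch: try the _PRIVATE subcommand, and if the kernel rejects it, clear one global flag variable so every later call falls back to the shared subcommand. FUTEX_PRIVATE_FLAG (value 128) only appears in linux/futex.h from this kernel onwards, so the sketch defines it locally when the header lacks it.

#include <errno.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

#ifndef FUTEX_PRIVATE_FLAG
#define FUTEX_PRIVATE_FLAG 128
#endif

/* Cleared once if the running kernel rejects private futex operations. */
static int futex_private_flag = FUTEX_PRIVATE_FLAG;

static long sys_futex_op(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

/* Wait on a PTHREAD_PROCESS_PRIVATE futex, falling back on old kernels. */
static long futex_wait_private(uint32_t *uaddr, uint32_t val)
{
	long ret = sys_futex_op(uaddr, FUTEX_WAIT | futex_private_flag, val);

	if (ret < 0 && errno == ENOSYS && futex_private_flag) {
		futex_private_flag = 0;	/* old kernel: use shared ops */
		ret = sys_futex_op(uaddr, FUTEX_WAIT, val);
	}
	return ret;
}

int main(void)
{
	uint32_t futex_word = 1;

	/* *uaddr != val, so the call returns at once with EWOULDBLOCK. */
	futex_wait_private(&futex_word, 0);
	return 0;
}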
Almost all applications should benefit of this changes (new kernel and updated libc) Some bench results on a Pentium M 1.6 GHz (SMP kernel on a UP machine) /* calling futex_wait(addr, value) with value != *addr */ 433 cycles per futex(FUTEX_WAIT) call (mixing 2 futexes) 424 cycles per futex(FUTEX_WAIT) call (using one futex) 334 cycles per futex(FUTEX_WAIT_PRIVATE) call (mixing 2 futexes) 334 cycles per futex(FUTEX_WAIT_PRIVATE) call (using one futex) For reference : 187 cycles per getppid() call 188 cycles per umask() call 181 cycles per ni_syscall() call Signed-off-by: Eric Dumazet Pierre Peiffer Cc: "Ulrich Drepper" Cc: "Nick Piggin" Cc: "Ingo Molnar" Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 324 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 210 insertions(+), 114 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4a60ef55dab4..b7ce15c67e32 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -16,6 +16,9 @@ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar * Copyright (C) 2006 Timesys Corp., Thomas Gleixner * + * PRIVATE futexes by Eric Dumazet + * Copyright (C) 2007 Eric Dumazet + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -150,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } -/* - * Get parameters which are the keys for a futex. +/** + * get_futex_key - Get parameters which are the keys for a futex. + * @uaddr: virtual address of the futex + * @shared: NULL for a PROCESS_PRIVATE futex, + * ¤t->mm->mmap_sem for a PROCESS_SHARED futex + * @key: address where result is stored. + * + * Returns a negative error code or 0 + * The key words are stored in *key on success. * * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * - * Returns: 0, or negative error code. - * The key words are stored in *key on success. - * - * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. + * fshared is NULL for PROCESS_PRIVATE futexes + * For other futexes, it points to ¤t->mm->mmap_sem and + * caller must have taken the reader lock. but NOT any spinlocks. */ -int get_futex_key(u32 __user *uaddr, union futex_key *key) +int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, + union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -174,10 +184,24 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key) * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; - if (unlikely((key->both.offset % sizeof(u32)) != 0)) + if (unlikely((address % sizeof(u32)) != 0)) return -EINVAL; address -= key->both.offset; + /* + * PROCESS_PRIVATE futexes are fast. + * As the mm cannot disappear under us and the 'key' only needs + * virtual address, we dont even have to find the underlying vma. 
+ * Note : We do have to check 'uaddr' is a valid user address, + * but access_ok() should be faster than find_vma() + */ + if (!fshared) { + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) + return -EFAULT; + key->private.mm = mm; + key->private.address = address; + return 0; + } /* * The futex is hashed differently depending on whether * it's in a shared or private mapping. So check vma first. @@ -205,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key) * mappings of _writable_ handles. */ if (likely(!(vma->vm_flags & VM_MAYSHARE))) { + key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ key->private.mm = mm; key->private.address = address; return 0; @@ -214,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key) * Linear file mappings are also simple. */ key->shared.inode = vma->vm_file->f_path.dentry->d_inode; - key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ + key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff); @@ -242,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key); * Take a reference to the resource addressed by a key. * Can be called while holding spinlocks. * - * NOTE: mmap_sem MUST be held between get_futex_key() and calling this - * function, if it is called at all. mmap_sem keeps key->shared.inode valid. */ inline void get_futex_key_refs(union futex_key *key) { - if (key->both.ptr != 0) { - if (key->both.offset & 1) + if (key->both.ptr == 0) + return; + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: atomic_inc(&key->shared.inode->i_count); - else + break; + case FUT_OFF_MMSHARED: atomic_inc(&key->private.mm->mm_count); + break; } } EXPORT_SYMBOL_GPL(get_futex_key_refs); @@ -262,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs); */ void drop_futex_key_refs(union futex_key *key) { - if (key->both.ptr != 0) { - if (key->both.offset & 1) + if (key->both.ptr == 0) + return; + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: iput(key->shared.inode); - else + break; + case FUT_OFF_MMSHARED: mmdrop(key->private.mm); + break; } } EXPORT_SYMBOL_GPL(drop_futex_key_refs); @@ -283,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from) } /* - * Fault handling. Called with current->mm->mmap_sem held. + * Fault handling. 
+ * if fshared is non NULL, current->mm->mmap_sem is already held */ -static int futex_handle_fault(unsigned long address, int attempt) +static int futex_handle_fault(unsigned long address, + struct rw_semaphore *fshared, int attempt) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; + int ret = -EFAULT; - if (attempt > 2 || !(vma = find_vma(mm, address)) || - vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) - return -EFAULT; + if (attempt > 2) + return ret; - switch (handle_mm_fault(mm, vma, address, 1)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - default: - return -EFAULT; + if (!fshared) + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); + if (vma && address >= vma->vm_start && + (vma->vm_flags & VM_WRITE)) { + switch (handle_mm_fault(mm, vma, address, 1)) { + case VM_FAULT_MINOR: + ret = 0; + current->min_flt++; + break; + case VM_FAULT_MAJOR: + ret = 0; + current->maj_flt++; + break; + } } - return 0; + if (!fshared) + up_read(&mm->mmap_sem); + return ret; } /* @@ -647,7 +688,8 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(u32 __user *uaddr, int nr_wake) +static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, + int nr_wake) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -655,9 +697,10 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) union futex_key key; int ret; - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &key); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -679,7 +722,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) spin_unlock(&hb->lock); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } @@ -746,7 +790,9 @@ retry: * and requeue the next nr_requeue waiters following hashed on * one physical page to another physical page (PI-futex uaddr2) */ -static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2, +static int futex_requeue_pi(u32 __user *uaddr1, + struct rw_semaphore *fshared, + u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { union futex_key key1, key2; @@ -765,12 +811,13 @@ retry: /* * First take all the futex related locks: */ - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr1, &key1); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, &key2); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; @@ -793,7 +840,8 @@ retry: * If we would have faulted, release mmap_sem, fault * it in and start all over again. 
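The "if (fshared) down_read(fshared);" / "if (fshared) up_read(fshared);" pairs now repeated throughout the file are natural candidates for tiny wrappers. A hedged kernel-context sketch of what such helpers could look like; the names futex_lock_mm/futex_unlock_mm are illustrative, not part of this patch:

/* Hypothetical helpers: take/release mmap_sem only for shared futexes.
 * Kernel context; struct rw_semaphore comes from <linux/rwsem.h>. */
static inline void futex_lock_mm(struct rw_semaphore *fshared)
{
	if (fshared)
		down_read(fshared);
}

static inline void futex_unlock_mm(struct rw_semaphore *fshared)
{
	if (fshared)
		up_read(fshared);
}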
*/ - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(curval, uaddr1); @@ -927,7 +975,8 @@ out_unlock: drop_futex_key_refs(&key1); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } @@ -936,7 +985,8 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, + u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1, key2; @@ -946,12 +996,13 @@ futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, int ret, op_ret, attempt = 0; retryfull: - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr1, &key1); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, &key2); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; @@ -991,11 +1042,10 @@ retry: * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr2, - attempt)) { - ret = -EFAULT; + ret = futex_handle_fault((unsigned long)uaddr2, + fshared, attempt); + if (ret) goto out; - } goto retry; } @@ -1003,7 +1053,8 @@ retry: * If we would have faulted, release mmap_sem, * fault it in and start all over again. */ - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(dummy, uaddr2); if (ret) @@ -1040,7 +1091,8 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } @@ -1048,7 +1100,8 @@ out: * Requeue all waiters hashed on one physical page to another * physical page. */ -static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, +static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, + u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { union futex_key key1, key2; @@ -1058,12 +1111,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, int ret, drop_count = 0; retry: - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr1, &key1); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, &key2); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; @@ -1086,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, * If we would have faulted, release mmap_sem, fault * it in and start all over again. */ - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(curval, uaddr1); @@ -1139,7 +1194,8 @@ out_unlock: drop_futex_key_refs(&key1); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } @@ -1273,7 +1329,8 @@ static void unqueue_me_pi(struct futex_q *q) * The cur->mm semaphore must be held, it is released at return of this * function. */ -static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, +static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared, + struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *curr) { @@ -1300,7 +1357,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* Unqueue and drop the lock */ unqueue_me_pi(q); - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); /* * We own it, so we have to replace the pending owner * TID. 
This must be atomic as we have preserve the @@ -1321,8 +1379,15 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, return ret; } +/* + * In case we must use restart_block to restart a futex_wait, + * we encode in the 'arg3' shared capability + */ +#define ARG3_SHARED 1 + static long futex_wait_restart(struct restart_block *restart); -static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) +static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, + u32 val, ktime_t *abs_time) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); @@ -1335,9 +1400,10 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) q.pi_state = NULL; retry: - down_read(&curr->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1360,8 +1426,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) * a wakeup when *uaddr != val on entry to the syscall. This is * rare, but normal. * - * We hold the mmap semaphore, so the mapping cannot have changed - * since we looked it up in get_futex_key. + * for shared futexes, we hold the mmap semaphore, so the mapping + * cannot have changed since we looked it up in get_futex_key. */ ret = get_futex_value_locked(&uval, uaddr); @@ -1372,7 +1438,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) * If we would have faulted, release mmap_sem, fault it in and * start all over again. */ - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(uval, uaddr); @@ -1399,7 +1466,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. 
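The ARG3_SHARED define above shows the restart-block idiom: a boolean capability is packed into one of the opaque unsigned long argument slots when the syscall is interrupted, and decoded when it is restarted. A hedged user-space sketch of the encode/decode round trip:

#include <assert.h>

#define ARG3_SHARED 1UL	/* bit 0: the futex was process-shared */

struct fake_restart {	/* stand-in for struct restart_block's arg slots */
	unsigned long arg3;
};

int main(void)
{
	struct fake_restart r = { 0 };
	int fshared = 1;

	if (fshared)
		r.arg3 |= ARG3_SHARED;	/* encoded before -ERESTART_RESTARTBLOCK */

	assert(r.arg3 & ARG3_SHARED);	/* decoded in the restart handler */
	return 0;
}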
*/ - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); /* * There might have been scheduling since the queue_me(), as we @@ -1469,7 +1537,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) else ret = rt_mutex_timed_lock(lock, to, 1); - down_read(&curr->mm->mmap_sem); + if (fshared) + down_read(fshared); spin_lock(q.lock_ptr); /* @@ -1486,7 +1555,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) /* mmap_sem and hash_bucket lock are unlocked at return of this function */ - ret = fixup_pi_state_owner(uaddr, &q, hb, curr); + ret = fixup_pi_state_owner(uaddr, fshared, + &q, hb, curr); } else { /* * Catch the rare case, where the lock was released @@ -1499,7 +1569,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) } /* Unqueue and drop the lock */ unqueue_me_pi(&q); - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); } debug_rt_mutex_free_waiter(&q.waiter); @@ -1528,6 +1599,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) restart->arg0 = (unsigned long)uaddr; restart->arg1 = (unsigned long)val; restart->arg2 = (unsigned long)abs_time; + restart->arg3 = 0; + if (fshared) + restart->arg3 |= ARG3_SHARED; return -ERESTART_RESTARTBLOCK; } @@ -1535,7 +1609,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) queue_unlock(&q, hb); out_release_sem: - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } @@ -1545,9 +1620,12 @@ static long futex_wait_restart(struct restart_block *restart) u32 __user *uaddr = (u32 __user *)restart->arg0; u32 val = (u32)restart->arg1; ktime_t *abs_time = (ktime_t *)restart->arg2; + struct rw_semaphore *fshared = NULL; restart->fn = do_no_restart_syscall; - return (long)futex_wait(uaddr, val, abs_time); + if (restart->arg3 & ARG3_SHARED) + fshared = ¤t->mm->mmap_sem; + return (long)futex_wait(uaddr, fshared, val, abs_time); } @@ -1602,8 +1680,8 @@ static void set_pi_futex_owner(struct futex_hash_bucket *hb, * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, - int trylock) +static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, + int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *curr = current; @@ -1624,9 +1702,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, q.pi_state = NULL; retry: - down_read(&curr->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1747,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. */ - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); WARN_ON(!q.pi_state); /* @@ -1761,7 +1841,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, ret = ret ? 
0 : -EWOULDBLOCK; } - down_read(&curr->mm->mmap_sem); + if (fshared) + down_read(fshared); spin_lock(q.lock_ptr); /* @@ -1770,7 +1851,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, */ if (!ret && q.pi_state->owner != curr) /* mmap_sem is unlocked at return of this function */ - ret = fixup_pi_state_owner(uaddr, &q, hb, curr); + ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr); else { /* * Catch the rare case, where the lock was released @@ -1783,7 +1864,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, } /* Unqueue and drop the lock */ unqueue_me_pi(&q); - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); } if (!detect && ret == -EDEADLK && 0) @@ -1795,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, queue_unlock(&q, hb); out_release_sem: - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; uaddr_faulted: @@ -1806,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) { - ret = -EFAULT; + ret = futex_handle_fault((unsigned long)uaddr, fshared, + attempt); + if (ret) goto out_unlock_release_sem; - } goto retry_locked; } queue_unlock(&q, hb); - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1828,7 +1912,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -static int futex_unlock_pi(u32 __user *uaddr) +static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -1848,9 +1932,10 @@ retry: /* * First take all the futex related locks: */ - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &key); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -1909,7 +1994,8 @@ retry_locked: out_unlock: spin_unlock(&hb->lock); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; @@ -1921,15 +2007,16 @@ pi_faulted: * still holding the mmap_sem. 
*/ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) { - ret = -EFAULT; + ret = futex_handle_fault((unsigned long)uaddr, fshared, + attempt); + if (ret) goto out_unlock; - } goto retry_locked; } spin_unlock(&hb->lock); - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1981,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal) struct futex_q *q; struct file *filp; int ret, err; + struct rw_semaphore *fshared; static unsigned long printk_interval; if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { @@ -2022,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal) } q->pi_state = NULL; - down_read(¤t->mm->mmap_sem); - err = get_futex_key(uaddr, &q->key); + fshared = ¤t->mm->mmap_sem; + down_read(fshared); + err = get_futex_key(uaddr, fshared, &q->key); if (unlikely(err != 0)) { - up_read(¤t->mm->mmap_sem); + up_read(fshared); kfree(q); goto error; } @@ -2038,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal) filp->private_data = q; queue_me(q, ret, filp); - up_read(¤t->mm->mmap_sem); + up_read(fshared); /* Now we map fd to filp, so userspace can access it */ fd_install(ret, filp); @@ -2167,7 +2256,7 @@ retry: */ if (!pi) { if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + futex_wake(uaddr, &curr->mm->mmap_sem, 1); } } return 0; @@ -2223,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr) return; if (pending) - handle_futex_death((void __user *)pending + futex_offset, curr, pip); + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); while (entry != &head->list) { /* @@ -2253,38 +2343,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { int ret; + int cmd = op & FUTEX_CMD_MASK; + struct rw_semaphore *fshared = NULL; + + if (!(op & FUTEX_PRIVATE_FLAG)) + fshared = ¤t->mm->mmap_sem; - switch (op) { + switch (cmd) { case FUTEX_WAIT: - ret = futex_wait(uaddr, val, timeout); + ret = futex_wait(uaddr, fshared, val, timeout); break; case FUTEX_WAKE: - ret = futex_wake(uaddr, val); + ret = futex_wake(uaddr, fshared, val); break; case FUTEX_FD: /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ ret = futex_fd(uaddr, val); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); break; case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); + ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, val, timeout, 0); + ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); break; case FUTEX_UNLOCK_PI: - ret = futex_unlock_pi(uaddr); + ret = futex_unlock_pi(uaddr, fshared); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, 0, timeout, 1); + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; case FUTEX_CMP_REQUEUE_PI: - ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3); + ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3); break; default: ret = -ENOSYS; @@ -2300,23 +2395,24 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, struct timespec ts; ktime_t t, *tp = NULL; u32 val2 = 0; + int cmd = op & FUTEX_CMD_MASK; - if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { + if (utime && (cmd == 
FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) return -EINVAL; t = timespec_to_ktime(ts); - if (op == FUTEX_WAIT) + if (cmd == FUTEX_WAIT) t = ktime_add(ktime_get(), t); tp = &t; } /* - * requeue parameter in 'utime' if op == FUTEX_REQUEUE. + * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. */ - if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE - || op == FUTEX_CMP_REQUEUE_PI) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE + || cmd == FUTEX_CMP_REQUEUE_PI) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -- cgit v1.2.3 From 38a23e311b6cd389b9d8af2ea6c28c8cffbe581c Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Wed, 9 May 2007 02:35:05 -0700 Subject: timer: parenthesis fix in tbase_get_deferrable() etc Signed-off-by: Jarek Poplawski Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 7a6448340f90..58f6dd07c80b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -92,24 +92,24 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(tvec_base_t *base) { - return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); + return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG); } static inline tvec_base_t *tbase_get_base(tvec_base_t *base) { - return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); + return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG); } static inline void timer_set_deferrable(struct timer_list *timer) { - timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + timer->base = (tvec_base_t *)((unsigned long)timer->base | + TBASE_DEFERRABLE_FLAG); } static inline void timer_set_base(struct timer_list *timer, tvec_base_t *new_base) { - timer->base = (tvec_base_t *)((unsigned long)(new_base) | + timer->base = (tvec_base_t *)((unsigned long)new_base | tbase_get_deferrable(timer->base)); } -- cgit v1.2.3 From 8bb7844286fb8c9fce6f65d8288aeb09d03a5e0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 9 May 2007 02:35:10 -0700 Subject: Add suspend-related notifications for CPU hotplug Since nonboot CPUs are now disabled after tasks and devices have been frozen and the CPU hotplug infrastructure is used for this purpose, we need special CPU hotplug notifications that will help the CPU-hotplug-aware subsystems distinguish normal CPU hotplug events from CPU hotplug events related to a system-wide suspend or resume operation in progress. This patch introduces such notifications and causes them to be used during suspend and resume transitions. It also changes all of the CPU-hotplug-aware subsystems to take these notifications into consideration (for now they are handled in the same way as the corresponding "normal" ones). [oleg@tv-sign.ru: cleanups] Signed-off-by: Rafael J. 
Wysocki Cc: Gautham R Shenoy Cc: Pavel Machek Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 34 ++++++++++++++++++---------------- kernel/hrtimer.c | 2 ++ kernel/profile.c | 4 ++++ kernel/rcupdate.c | 2 ++ kernel/relay.c | 2 ++ kernel/sched.c | 10 ++++++++++ kernel/softirq.c | 4 ++++ kernel/softlockup.c | 4 ++++ kernel/timer.c | 2 ++ kernel/workqueue.c | 2 ++ 10 files changed, 50 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 28cb6c71a47a..369d2892687d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -120,12 +120,13 @@ static int take_cpu_down(void *unused) } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu) +static int _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; struct task_struct *p; cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; + unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; if (num_online_cpus() == 1) return -EBUSY; @@ -134,11 +135,11 @@ static int _cpu_down(unsigned int cpu) return -EINVAL; raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); - err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { - __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, hcpu, - nr_calls, NULL); + __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, + hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", __FUNCTION__, cpu); err = -EINVAL; @@ -157,7 +158,7 @@ static int _cpu_down(unsigned int cpu) if (IS_ERR(p) || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, + if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) BUG(); @@ -176,7 +177,8 @@ static int _cpu_down(unsigned int cpu) __cpu_die(cpu); /* CPU is completely dead: tell everyone. Too late to complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, hcpu) == NOTIFY_BAD) + if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, + hcpu) == NOTIFY_BAD) BUG(); check_for_tasks(cpu); @@ -186,8 +188,7 @@ out_thread: out_allowed: set_cpus_allowed(current, old_allowed); out_release: - raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, - (void *)(long)cpu); + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); return err; } @@ -199,7 +200,7 @@ int cpu_down(unsigned int cpu) if (cpu_hotplug_disabled) err = -EBUSY; else - err = _cpu_down(cpu); + err = _cpu_down(cpu, 0); mutex_unlock(&cpu_add_remove_lock); return err; @@ -207,16 +208,17 @@ int cpu_down(unsigned int cpu) #endif /*CONFIG_HOTPLUG_CPU*/ /* Requires cpu_add_remove_lock to be held */ -static int __cpuinit _cpu_up(unsigned int cpu) +static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) { int ret, nr_calls = 0; void *hcpu = (void *)(long)cpu; + unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); - ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu, + ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", @@ -234,12 +236,12 @@ static int __cpuinit _cpu_up(unsigned int cpu) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. 
*/ - raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); + raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); out_notify: if (ret != 0) __raw_notifier_call_chain(&cpu_chain, - CPU_UP_CANCELED, hcpu, nr_calls, NULL); + CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); return ret; @@ -253,7 +255,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (cpu_hotplug_disabled) err = -EBUSY; else - err = _cpu_up(cpu); + err = _cpu_up(cpu, 0); mutex_unlock(&cpu_add_remove_lock); return err; @@ -283,7 +285,7 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; - error = _cpu_down(cpu); + error = _cpu_down(cpu, 1); if (!error) { cpu_set(cpu, frozen_cpus); printk("CPU%d is down\n", cpu); @@ -318,7 +320,7 @@ void enable_nonboot_cpus(void) suspend_cpu_hotplug = 1; printk("Enabling non-boot CPUs ...\n"); for_each_cpu_mask(cpu, frozen_cpus) { - error = _cpu_up(cpu); + error = _cpu_up(cpu, 1); if (!error) { printk("CPU%d is up\n", cpu); continue; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c9f4f044a8a8..23c03f43e196 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1411,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: init_hrtimers_cpu(cpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + case CPU_DEAD_FROZEN: clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); migrate_hrtimers(cpu); break; diff --git a/kernel/profile.c b/kernel/profile.c index 9bfadb248dd8..cc91b9bf759d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -340,6 +340,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: node = cpu_to_node(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { @@ -365,10 +366,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, __free_page(page); return NOTIFY_BAD; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: cpu_set(cpu, prof_cpu_mask); break; case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: + case CPU_DEAD_FROZEN: cpu_clear(cpu, prof_cpu_mask); if (per_cpu(cpu_profile_hits, cpu)[0]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 3554b76da84c..2c2dd8410dc4 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; case CPU_DEAD: + case CPU_DEAD_FROZEN: rcu_offline_cpu(cpu); break; default: diff --git a/kernel/relay.c b/kernel/relay.c index e804589c863c..61a504900eaa 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -484,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, switch(action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: mutex_lock(&relay_channels_mutex); list_for_each_entry(chan, &relay_channels, list) { if (chan->buf[hotcpu]) @@ -500,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, mutex_unlock(&relay_channels_mutex); break; case CPU_DEAD: + case CPU_DEAD_FROZEN: /* No need to flush the cpu : will be flushed upon * final relay_flush() call. 
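Each subsystem above gains a parallel *_FROZEN case label. The workqueue callback further below takes the other option and strips the modifier once at the top with action &= ~CPU_TASKS_FROZEN. A hedged sketch of a hotplug notifier written in that second style; my_online()/my_offline() are hypothetical driver hooks:

static int __cpuinit my_cpu_callback(struct notifier_block *nfb,
				     unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	/* Treat suspend/resume hotplug exactly like normal hotplug. */
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		my_online(cpu);		/* hypothetical */
		break;
	case CPU_DEAD:
		my_offline(cpu);	/* hypothetical */
		break;
	}
	return NOTIFY_OK;
}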
*/ break; diff --git a/kernel/sched.c b/kernel/sched.c index fe1a9c2b855a..799d23b4e35d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5394,6 +5394,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; @@ -5407,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: /* Strictly unneccessary, as first user will wake it. */ wake_up_process(cpu_rq(cpu)->migration_thread); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!cpu_rq(cpu)->migration_thread) break; /* Unbind it from offline cpu so it can run. Fall thru. */ @@ -5423,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DEAD: + case CPU_DEAD_FROZEN: migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); @@ -6912,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb, { switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); return NOTIFY_OK; case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: + case CPU_ONLINE_FROZEN: case CPU_DEAD: + case CPU_DEAD_FROZEN: /* * Fall through and re-initialise the domains. */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 8b75008e2bd8..0b9886a00e74 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); @@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, per_cpu(ksoftirqd, hotcpu) = p; break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: wake_up_process(per_cpu(ksoftirqd, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!per_cpu(ksoftirqd, hotcpu)) break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), any_online_cpu(cpu_online_map)); case CPU_DEAD: + case CPU_DEAD_FROZEN: p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; kthread_stop(p); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 8fa7040247ad..0131e296ffb4 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -146,6 +146,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: BUG_ON(per_cpu(watchdog_task, hotcpu)); p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); if (IS_ERR(p)) { @@ -157,16 +158,19 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) kthread_bind(p, hotcpu); break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!per_cpu(watchdog_task, hotcpu)) break; /* Unbind so it can run. Fall thru. 
*/ kthread_bind(per_cpu(watchdog_task, hotcpu), any_online_cpu(cpu_online_map)); case CPU_DEAD: + case CPU_DEAD_FROZEN: p = per_cpu(watchdog_task, hotcpu); per_cpu(watchdog_task, hotcpu) = NULL; kthread_stop(p); diff --git a/kernel/timer.c b/kernel/timer.c index 58f6dd07c80b..de85f8491c1d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1293,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; switch(action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: if (init_timers_cpu(cpu) < 0) return NOTIFY_BAD; break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + case CPU_DEAD_FROZEN: migrate_timers(cpu); break; #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b976ed87dd37..fb56fedd5c02 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -799,6 +799,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; + action &= ~CPU_TASKS_FROZEN; + switch (action) { case CPU_LOCK_ACQUIRE: mutex_lock(&workqueue_mutex); -- cgit v1.2.3 From 455c017ae3934797653549704c286e7bcc3a9397 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 9 May 2007 02:35:11 -0700 Subject: microcode: use suspend-related CPU hotplug notifications Make the microcode driver use the suspend-related CPU hotplug notifications to handle the CPU hotplug events occuring during system-wide suspend and resume transitions. Remove the global variable suspend_cpu_hotplug previously used for this purpose. Signed-off-by: Rafael J. Wysocki Cc: Gautham R Shenoy Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 369d2892687d..208cf3497c10 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -262,12 +262,6 @@ int __cpuinit cpu_up(unsigned int cpu) } #ifdef CONFIG_SUSPEND_SMP -/* Needed to prevent the microcode driver from requesting firmware in its CPU - * hotplug notifier during the suspend/resume. - */ -int suspend_cpu_hotplug; -EXPORT_SYMBOL(suspend_cpu_hotplug); - static cpumask_t frozen_cpus; int disable_nonboot_cpus(void) @@ -275,7 +269,6 @@ int disable_nonboot_cpus(void) int cpu, first_cpu, error = 0; mutex_lock(&cpu_add_remove_lock); - suspend_cpu_hotplug = 1; first_cpu = first_cpu(cpu_online_map); /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time @@ -302,7 +295,6 @@ int disable_nonboot_cpus(void) } else { printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } - suspend_cpu_hotplug = 0; mutex_unlock(&cpu_add_remove_lock); return error; } @@ -317,7 +309,6 @@ void enable_nonboot_cpus(void) if (cpus_empty(frozen_cpus)) goto out; - suspend_cpu_hotplug = 1; printk("Enabling non-boot CPUs ...\n"); for_each_cpu_mask(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); @@ -328,7 +319,6 @@ void enable_nonboot_cpus(void) printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } cpus_clear(frozen_cpus); - suspend_cpu_hotplug = 0; out: mutex_unlock(&cpu_add_remove_lock); } -- cgit v1.2.3 From 77461ab33229d48614402decfb1b2eaa6d446861 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 9 May 2007 02:35:13 -0700 Subject: Make vm statistics update interval configurable Make it configurable. Code in mm makes the vm statistics intervals independent from the cache reaper use that opportunity to make it configurable. 
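With this knob in place, the update period can be adjusted at run time through /proc/sys/vm/stat_interval (the value is in seconds, converted to jiffies by proc_dointvec_jiffies). A hedged user-space sketch that raises it to 10 seconds:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/stat_interval", "w");

	if (!f) {
		perror("stat_interval");
		return 1;
	}
	fprintf(f, "10\n");
	return fclose(f) ? 1 : 0;
}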
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f0664bd5011c..4073353abd4f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -77,6 +77,7 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; +extern int sysctl_stat_interval; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -857,6 +858,17 @@ static ctl_table vm_table[] = { .extra2 = &one_hundred, }, #endif +#ifdef CONFIG_SMP + { + .ctl_name = CTL_UNNUMBERED, + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, +#endif #if defined(CONFIG_X86_32) || \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { -- cgit v1.2.3 From b52f52a093bb1e841e014c2087b5bee7162da413 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 May 2007 02:35:15 -0700 Subject: clocksource: fix resume logic We need to make sure that the clocksources are resumed, when timekeeping is resumed. The current resume logic does not guarantee this. Add a resume function pointer to the clocksource struct, so clocksource drivers which need to reinitialize the clocksource can provide a resume function. Add a resume function, which calls the maybe available clocksource resume functions and resets the watchdog function, so a stable TSC can be used accross suspend/resume. Signed-off-by: Thomas Gleixner Cc: john stultz Cc: Andi Kleen Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 2 ++ 2 files changed, 47 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index db0c725de5ea..3db5c3c460d7 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -74,6 +74,8 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; +static int watchdog_resumed; + /* * Interval: 0.5sec Threshold: 0.0625s */ @@ -98,15 +100,26 @@ static void clocksource_watchdog(unsigned long data) struct clocksource *cs, *tmp; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; + int resumed; spin_lock(&watchdog_lock); + resumed = watchdog_resumed; + if (unlikely(resumed)) + watchdog_resumed = 0; + wdnow = watchdog->read(); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); watchdog_last = wdnow; list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { csnow = cs->read(); + + if (unlikely(resumed)) { + cs->wd_last = csnow; + continue; + } + /* Initialized ? 
*/ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && @@ -136,6 +149,13 @@ static void clocksource_watchdog(unsigned long data) } spin_unlock(&watchdog_lock); } +static void clocksource_resume_watchdog(void) +{ + spin_lock(&watchdog_lock); + watchdog_resumed = 1; + spin_unlock(&watchdog_lock); +} + static void clocksource_check_watchdog(struct clocksource *cs) { struct clocksource *cse; @@ -182,8 +202,33 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } + +static inline void clocksource_resume_watchdog(void) { } #endif +/** + * clocksource_resume - resume the clocksource(s) + */ +void clocksource_resume(void) +{ + struct list_head *tmp; + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + + list_for_each(tmp, &clocksource_list) { + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (cs->resume) + cs->resume(); + } + + clocksource_resume_watchdog(); + + spin_unlock_irqrestore(&clocksource_lock, flags); +} + /** * clocksource_get_next - Returns the selected clocksource * diff --git a/kernel/timer.c b/kernel/timer.c index de85f8491c1d..59a28b1752f8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1499,6 +1499,8 @@ unregister_time_interpolator(struct time_interpolator *ti) prev = &curr->next; } + clocksource_resume(); + write_seqlock_irqsave(&xtime_lock, flags); if (ti == time_interpolator) { /* we lost the best time-interpolator: */ -- cgit v1.2.3 From c9f4f06d3191bd91c1a081b54a6c8e913e7b8a83 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 9 May 2007 02:35:16 -0700 Subject: wrap access to thread_info Recently a few direct accesses to the thread_info in the task structure snuck back, so this wraps them with the appropriate wrapper. Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/mutex.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index e7cbbb82765b..303eab18484b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -133,7 +133,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) debug_mutex_lock_common(lock, &waiter); mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - debug_mutex_add_waiter(lock, &waiter, task->thread_info); + debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -159,7 +159,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) */ if (unlikely(state == TASK_INTERRUPTIBLE && signal_pending(task))) { - mutex_remove_waiter(lock, &waiter, task->thread_info); + mutex_remove_waiter(lock, &waiter, task_thread_info(task)); mutex_release(&lock->dep_map, 1, _RET_IP_); spin_unlock_mutex(&lock->wait_lock, flags); @@ -175,8 +175,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) } /* got the lock - rejoice! 
*/ - mutex_remove_waiter(lock, &waiter, task->thread_info); - debug_mutex_set_owner(lock, task->thread_info); + mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_set_owner(lock, task_thread_info(task)); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) -- cgit v1.2.3 From f7e4217b007d1f73e7e3cf10ba4fea4a608c603f Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 9 May 2007 02:35:17 -0700 Subject: rename thread_info to stack This finally renames the thread_info field in task structure to stack, so that the assumptions about this field are gone and archs have more freedom about placing the thread_info structure. Nonbroken archs which have a proper thread pointer can do the access to both current thread and task structure via a single pointer. It'll allow for a few more cleanups of the fork code, from which e.g. ia64 could benefit. Signed-off-by: Roman Zippel [akpm@linux-foundation.org: build fix] Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Haavard Skinnemoen Cc: Mikael Starvik Cc: David Howells Cc: Yoshinori Sato Cc: "Luck, Tony" Cc: Hirokazu Takata Cc: Geert Uytterhoeven Cc: Roman Zippel Cc: Greg Ungerer Cc: Ralf Baechle Cc: Ralf Baechle Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Richard Curnow Cc: William Lee Irwin III Cc: "David S. Miller" Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Miles Bader Cc: Andi Kleen Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a8dd75d4992b..5dd3979747f5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -105,7 +105,7 @@ static struct kmem_cache *mm_cachep; void free_task(struct task_struct *tsk) { - free_thread_info(tsk->thread_info); + free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); free_task_struct(tsk); } @@ -175,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } *tsk = *orig; - tsk->thread_info = ti; + tsk->stack = ti; setup_thread_stack(tsk, orig); #ifdef CONFIG_CC_STACKPROTECTOR -- cgit v1.2.3 From e9910846fdb19f7c5810cbe4c95e4ca6dab6a00f Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Thu, 10 May 2007 03:16:01 -0700 Subject: timer: revert parenthesis fix in tbase_get_deferrable() etc On 09-05-2007 21:10, Pallipadi, Venkatesh wrote: ... > On a 64 bit system, converting pointer to int causes unnecessary > compiler warning, and intermediate long conversion was to avoid that. > I will have to rephrase my comment to remove 32 bit value and use int, > as that is what the function returns. So, this patch reverts all changes done by my previous patch. I apologize for my wrong comment about "logical error" here. 
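For reference, the helpers this revert touches all rely on the same trick: tvec_base_t is at least four-byte aligned, so bit 0 of a base pointer is always zero and is free to carry the per-timer "deferrable" flag. A standalone sketch of that tagging scheme (illustrative names, not the kernel's):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define DEFERRABLE_FLAG 0x1UL

    struct base { int dummy; };              /* stand-in for tvec_base_t */

    static unsigned int get_deferrable(struct base *b)
    {
            return (unsigned int)((uintptr_t)b & DEFERRABLE_FLAG);
    }

    static struct base *get_base(struct base *b)
    {
            return (struct base *)((uintptr_t)b & ~DEFERRABLE_FLAG);
    }

    static struct base *set_deferrable(struct base *b)
    {
            return (struct base *)((uintptr_t)b | DEFERRABLE_FLAG);
    }

    int main(void)
    {
            static struct base real;         /* alignment keeps bit 0 clear */
            struct base *tagged = set_deferrable(&real);

            assert(get_deferrable(tagged) == 1);
            assert(get_base(tagged) == &real);
            printf("flag=%u base=%p\n", get_deferrable(tagged),
                   (void *)get_base(tagged));
            return 0;
    }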
Cc: "Pallipadi, Venkatesh" Cc: Satyam Sharma Cc: Oleg Nesterov Signed-off-by: Jarek Poplawski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 59a28b1752f8..a6c580ac084b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -92,24 +92,24 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(tvec_base_t *base) { - return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG); + return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); } static inline tvec_base_t *tbase_get_base(tvec_base_t *base) { - return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG); + return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); } static inline void timer_set_deferrable(struct timer_list *timer) { - timer->base = (tvec_base_t *)((unsigned long)timer->base | - TBASE_DEFERRABLE_FLAG); + timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | + TBASE_DEFERRABLE_FLAG)); } static inline void timer_set_base(struct timer_list *timer, tvec_base_t *new_base) { - timer->base = (tvec_base_t *)((unsigned long)new_base | + timer->base = (tvec_base_t *)((unsigned long)(new_base) | tbase_get_deferrable(timer->base)); } -- cgit v1.2.3 From a5cb013da773a67ee48d1c19e96436c22a73a7eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 20 Mar 2007 13:58:35 -0400 Subject: [PATCH] auditing ptrace Signed-off-by: Al Viro --- kernel/auditsc.c | 29 +++++++++++++++++++++++++++++ kernel/ptrace.c | 3 +++ 2 files changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 628c7ac590a0..2243c559bc03 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -209,6 +209,9 @@ struct audit_context { unsigned long personality; int arch; + pid_t target_pid; + u32 target_sid; + #if AUDIT_DEBUG int put_count; int ino_count; @@ -973,6 +976,23 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_end(ab); } + if (context->target_pid) { + ab =audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); + if (ab) { + char *s = NULL, *t; + u32 len; + if (selinux_sid_to_string(context->target_sid, + &s, &len)) + t = "(none)"; + else + t = s; + audit_log_format(ab, "opid=%d obj=%s", + context->target_pid, t); + audit_log_end(ab); + kfree(s); + } + } + if (context->pwd && context->pwdmnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { @@ -1193,6 +1213,7 @@ void audit_syscall_exit(int valid, long return_code) } else { audit_free_names(context); audit_free_aux(context); + context->target_pid = 0; kfree(context->filterkey); context->filterkey = NULL; tsk->audit_context = context; @@ -1880,6 +1901,14 @@ int audit_sockaddr(int len, void *a) return 0; } +void __audit_ptrace(struct task_struct *t) +{ + struct audit_context *context = current->audit_context; + + context->target_pid = t->pid; + selinux_get_task_sid(t, &context->target_sid); +} + /** * audit_avc_path - record the granting or denial of permissions * @dentry: dentry to record diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4d50e06fd745..ad7949a589dd 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -161,6 +162,8 @@ int ptrace_attach(struct task_struct *task) { int retval; + audit_ptrace(task); + 
retval = -EPERM; if (task->pid <= 1) goto out; -- cgit v1.2.3 From e54dc2431d740a79a6bd013babade99d71b1714f Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 29 Mar 2007 18:01:04 -0400 Subject: [PATCH] audit signal recipients When auditing syscalls that send signals, log the pid and security context for each target process. Optimize the data collection by adding a counter for signal-related rules, and avoiding allocating an aux struct unless we have more than one target process. For process groups, collect pid/context data in blocks of 16. Move the audit_signal_info() hook up in check_kill_permission() so we audit attempts where permission is denied. Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/audit.h | 13 +++--- kernel/auditfilter.c | 48 +++++++++++++++++++++- kernel/auditsc.c | 111 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/signal.c | 10 +++-- 4 files changed, 153 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index a3370232a390..815d6f5c04ee 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -83,6 +83,7 @@ struct audit_krule { u32 field_count; char *filterkey; /* ties events to rules */ struct audit_field *fields; + struct audit_field *arch_f; /* quick access to arch field */ struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ struct list_head rlist; /* entry in audit_watch.rules list */ @@ -131,17 +132,19 @@ extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, extern int selinux_audit_rule_update(void); #ifdef CONFIG_AUDITSYSCALL -extern void __audit_signal_info(int sig, struct task_struct *t); -static inline void audit_signal_info(int sig, struct task_struct *t) +extern int __audit_signal_info(int sig, struct task_struct *t); +static inline int audit_signal_info(int sig, struct task_struct *t) { - if (unlikely(audit_pid && t->tgid == audit_pid)) - __audit_signal_info(sig, t); + if (unlikely((audit_pid && t->tgid == audit_pid) || + (audit_signals && !audit_dummy_context()))) + return __audit_signal_info(sig, t); + return 0; } extern enum audit_state audit_filter_inodes(struct task_struct *, struct audit_context *); extern void audit_set_auditable(struct audit_context *); #else -#define audit_signal_info(s,t) +#define audit_signal_info(s,t) AUDIT_DISABLED #define audit_filter_inodes(t,c) AUDIT_DISABLED #define audit_set_auditable(c) #endif diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 3749193aed8c..6c61263ff96d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -311,6 +311,43 @@ int audit_match_class(int class, unsigned syscall) return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall); } +static inline int audit_match_class_bits(int class, u32 *mask) +{ + int i; + + if (classes[class]) { + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + if (mask[i] & classes[class][i]) + return 0; + } + return 1; +} + +static int audit_match_signal(struct audit_entry *entry) +{ + struct audit_field *arch = entry->rule.arch_f; + + if (!arch) { + /* When arch is unspecified, we must check both masks on biarch + * as syscall number alone is ambiguous. 
*/ + return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, + entry->rule.mask) && + audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, + entry->rule.mask)); + } + + switch(audit_classify_arch(arch->val)) { + case 0: /* native */ + return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, + entry->rule.mask)); + case 1: /* 32bit on biarch */ + return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, + entry->rule.mask)); + default: + return 1; + } +} + /* Common user-space to kernel rule translation. */ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) { @@ -429,6 +466,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) err = -EINVAL; goto exit_free; } + entry->rule.arch_f = f; break; case AUDIT_PERM: if (f->val & ~15) @@ -519,7 +557,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_FSGID: case AUDIT_LOGINUID: case AUDIT_PERS: - case AUDIT_ARCH: case AUDIT_MSGTYPE: case AUDIT_PPID: case AUDIT_DEVMAJOR: @@ -531,6 +568,9 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG2: case AUDIT_ARG3: break; + case AUDIT_ARCH: + entry->rule.arch_f = f; + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -1221,6 +1261,9 @@ static inline int audit_add_rule(struct audit_entry *entry, #ifdef CONFIG_AUDITSYSCALL if (!dont_count) audit_n_rules++; + + if (!audit_match_signal(entry)) + audit_signals++; #endif mutex_unlock(&audit_filter_mutex); @@ -1294,6 +1337,9 @@ static inline int audit_del_rule(struct audit_entry *entry, #ifdef CONFIG_AUDITSYSCALL if (!dont_count) audit_n_rules--; + + if (!audit_match_signal(entry)) + audit_signals--; #endif mutex_unlock(&audit_filter_mutex); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2243c559bc03..6aff0df75568 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -89,6 +89,9 @@ extern int audit_enabled; /* number of audit rules */ int audit_n_rules; +/* determines whether we collect data for signals sent */ +int audit_signals; + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -114,6 +117,9 @@ struct audit_aux_data { #define AUDIT_AUX_IPCPERM 0 +/* Number of target pids per aux struct. */ +#define AUDIT_AUX_PIDS 16 + struct audit_aux_data_mq_open { struct audit_aux_data d; int oflag; @@ -181,6 +187,13 @@ struct audit_aux_data_path { struct vfsmount *mnt; }; +struct audit_aux_data_pids { + struct audit_aux_data d; + pid_t target_pid[AUDIT_AUX_PIDS]; + u32 target_sid[AUDIT_AUX_PIDS]; + int pid_count; +}; + /* The per-task audit context. 
*/ struct audit_context { int dummy; /* must be the first element */ @@ -201,6 +214,7 @@ struct audit_context { struct vfsmount * pwdmnt; struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; + struct audit_aux_data *aux_pids; /* Save things to print about task_struct */ pid_t pid, ppid; @@ -657,6 +671,10 @@ static inline void audit_free_aux(struct audit_context *context) context->aux = aux->next; kfree(aux); } + while ((aux = context->aux_pids)) { + context->aux_pids = aux->next; + kfree(aux); + } } static inline void audit_zero_context(struct audit_context *context, @@ -798,6 +816,29 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk audit_log_task_context(ab); } +static int audit_log_pid_context(struct audit_context *context, pid_t pid, + u32 sid) +{ + struct audit_buffer *ab; + char *s = NULL; + u32 len; + int rc = 0; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); + if (!ab) + return 1; + + if (selinux_sid_to_string(sid, &s, &len)) { + audit_log_format(ab, "opid=%d obj=(none)", pid); + rc = 1; + } else + audit_log_format(ab, "opid=%d obj=%s", pid, s); + audit_log_end(ab); + kfree(s); + + return rc; +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -976,23 +1017,21 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_end(ab); } - if (context->target_pid) { - ab =audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); - if (ab) { - char *s = NULL, *t; - u32 len; - if (selinux_sid_to_string(context->target_sid, - &s, &len)) - t = "(none)"; - else - t = s; - audit_log_format(ab, "opid=%d obj=%s", - context->target_pid, t); - audit_log_end(ab); - kfree(s); - } + for (aux = context->aux_pids; aux; aux = aux->next) { + struct audit_aux_data_pids *axs = (void *)aux; + int i; + + for (i = 0; i < axs->pid_count; i++) + if (audit_log_pid_context(context, axs->target_pid[i], + axs->target_sid[i])) + call_panic = 1; } + if (context->target_pid && + audit_log_pid_context(context, context->target_pid, + context->target_sid)) + call_panic = 1; + if (context->pwd && context->pwdmnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { @@ -1213,7 +1252,10 @@ void audit_syscall_exit(int valid, long return_code) } else { audit_free_names(context); audit_free_aux(context); + context->aux = NULL; + context->aux_pids = NULL; context->target_pid = 0; + context->target_sid = 0; kfree(context->filterkey); context->filterkey = NULL; tsk->audit_context = context; @@ -1947,15 +1989,17 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. 
*/ -void __audit_signal_info(int sig, struct task_struct *t) +int __audit_signal_info(int sig, struct task_struct *t) { + struct audit_aux_data_pids *axp; + struct task_struct *tsk = current; + struct audit_context *ctx = tsk->audit_context; extern pid_t audit_sig_pid; extern uid_t audit_sig_uid; extern u32 audit_sig_sid; - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { - struct task_struct *tsk = current; - struct audit_context *ctx = tsk->audit_context; + if (audit_pid && t->tgid == audit_pid && + (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1)) { audit_sig_pid = tsk->pid; if (ctx) audit_sig_uid = ctx->loginuid; @@ -1963,4 +2007,33 @@ void __audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = tsk->uid; selinux_get_task_sid(tsk, &audit_sig_sid); } + + if (!audit_signals) /* audit_context checked in wrapper */ + return 0; + + /* optimize the common case by putting first signal recipient directly + * in audit_context */ + if (!ctx->target_pid) { + ctx->target_pid = t->tgid; + selinux_get_task_sid(t, &ctx->target_sid); + return 0; + } + + axp = (void *)ctx->aux_pids; + if (!axp || axp->pid_count == AUDIT_AUX_PIDS) { + axp = kzalloc(sizeof(*axp), GFP_ATOMIC); + if (!axp) + return -ENOMEM; + + axp->d.type = AUDIT_OBJ_PID; + axp->d.next = ctx->aux_pids; + ctx->aux_pids = (void *)axp; + } + BUG_ON(axp->pid_count > AUDIT_AUX_PIDS); + + axp->target_pid[axp->pid_count] = t->tgid; + selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); + axp->pid_count++; + + return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 2ac3a668d9dd..c43a3f19d477 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -497,6 +497,11 @@ static int check_kill_permission(int sig, struct siginfo *info, int error = -EINVAL; if (!valid_signal(sig)) return error; + + error = audit_signal_info(sig, t); /* Let audit system see the signal */ + if (error) + return error; + error = -EPERM; if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || @@ -506,10 +511,7 @@ static int check_kill_permission(int sig, struct siginfo *info, && !capable(CAP_KILL)) return error; - error = security_task_kill(t, info, sig, 0); - if (!error) - audit_signal_info(sig, t); /* Let audit system see the signal */ - return error; + return security_task_kill(t, info, sig, 0); } /* forward decl */ -- cgit v1.2.3 From e41e8bde43026d5d2e41464e6105a50b31e34102 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Tue, 13 Feb 2007 14:14:09 -0500 Subject: [PATCH] initialize name osid Audit contexts can be reused, so initialize a name's osid to the default in audit_getname(). This ensures we don't log a bogus object label when no inode data is collected for a name. 
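The fix below is a small instance of a general rule for reused records: every field a later consumer may read has to be reset to its default when the slot is handed out, not only when data happens to be collected for it. A hypothetical sketch of the pattern (the struct and names are illustrative, not the kernel's audit_names):

    #include <stdio.h>

    struct name_rec {
            const char *name;
            unsigned long ino;
            unsigned int osid;       /* stale unless inode data is collected */
    };

    /* Hand out a slot from a reused context: reset everything the logging
     * path reads, so a skipped collection step cannot leak old values. */
    static void name_rec_init(struct name_rec *n, const char *name)
    {
            n->name = name;
            n->ino = (unsigned long)-1;      /* "no inode data" marker */
            n->osid = 0;                     /* default label, as the patch does */
    }

    int main(void)
    {
            struct name_rec n = { "old", 42, 1234 };  /* left over from last use */

            name_rec_init(&n, "new");
            printf("%s ino=%lu osid=%u\n", n.name, n.ino, n.osid);
            return 0;
    }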
Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditsc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6aff0df75568..41d129a78793 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1289,6 +1289,7 @@ void __audit_getname(const char *name) context->names[context->name_count].name_len = AUDIT_NAME_FULL; context->names[context->name_count].name_put = 1; context->names[context->name_count].ino = (unsigned long)-1; + context->names[context->name_count].osid = 0; ++context->name_count; if (!context->pwd) { read_lock(¤t->fs->lock); -- cgit v1.2.3 From 4fc03b9beb2314f3adb9e72b7935a80c577954d1 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Tue, 13 Feb 2007 14:15:01 -0500 Subject: [PATCH] complete message queue auditing Handle the edge cases for POSIX message queue auditing. Collect inode info when opening an existing mq, and for send/receive operations. Remove audit_inode_update() as it has really evolved into the equivalent of audit_inode(). Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditsc.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 41d129a78793..25d890e997f2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1475,33 +1475,6 @@ update_context: } } -/** - * audit_inode_update - update inode info for last collected name - * @inode: inode being audited - * - * When open() is called on an existing object with the O_CREAT flag, the inode - * data audit initially collects is incorrect. This additional hook ensures - * audit has the inode data for the actual object to be opened. - */ -void __audit_inode_update(const struct inode *inode) -{ - struct audit_context *context = current->audit_context; - int idx; - - if (!context->in_syscall || !inode) - return; - - if (context->name_count == 0) { - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - } - idx = context->name_count - 1; - - audit_copy_inode(&context->names[idx], inode); -} - /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task -- cgit v1.2.3 From 5712e88f2b0f626a4857c24128810bbf8ce09537 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Tue, 13 Feb 2007 14:15:22 -0500 Subject: [PATCH] match audit name data Make more effort to detect previously collected names, so we don't log multiple PATH records for a single filesystem object. Add audit_inc_name_count() to reduce duplicate code. Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditsc.c | 142 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 84 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 25d890e997f2..5276b7ef05f1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -78,11 +78,6 @@ extern int audit_enabled; * for saving names from getname(). */ #define AUDIT_NAMES 20 -/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the - * audit_context from being used for nameless inodes from - * path_lookup. */ -#define AUDIT_NAMES_RESERVED 7 - /* Indicates that audit should log the full pathname. 
*/ #define AUDIT_NAME_FULL -1 @@ -1343,6 +1338,28 @@ void audit_putname(const char *name) #endif } +static int audit_inc_name_count(struct audit_context *context, + const struct inode *inode) +{ + if (context->name_count >= AUDIT_NAMES) { + if (inode) + printk(KERN_DEBUG "name_count maxed, losing inode data: " + "dev=%02x:%02x, inode=%lu", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino); + + else + printk(KERN_DEBUG "name_count maxed, losing inode data"); + return 1; + } + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + return 0; +} + /* Copy inode data into an audit_names. */ static void audit_copy_inode(struct audit_names *name, const struct inode *inode) { @@ -1380,13 +1397,10 @@ void __audit_inode(const char *name, const struct inode *inode) else { /* FIXME: how much do we care about inodes that have no * associated name? */ - if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED) + if (audit_inc_name_count(context, inode)) return; - idx = context->name_count++; + idx = context->name_count - 1; context->names[idx].name = NULL; -#if AUDIT_DEBUG - ++context->ino_count; -#endif } audit_copy_inode(&context->names[idx], inode); } @@ -1410,7 +1424,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode, { int idx; struct audit_context *context = current->audit_context; - const char *found_name = NULL; + const char *found_parent = NULL, *found_child = NULL; int dirlen = 0; if (!context->in_syscall) @@ -1418,61 +1432,73 @@ void __audit_inode_child(const char *dname, const struct inode *inode, /* determine matching parent */ if (!dname) - goto update_context; - for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].ino == parent->i_ino) { - const char *name = context->names[idx].name; + goto add_names; - if (!name) - continue; + /* parent is more likely, look for it first */ + for (idx = 0; idx < context->name_count; idx++) { + struct audit_names *n = &context->names[idx]; - if (audit_compare_dname_path(dname, name, &dirlen) == 0) { - context->names[idx].name_len = dirlen; - found_name = name; - break; - } + if (!n->name) + continue; + + if (n->ino == parent->i_ino && + !audit_compare_dname_path(dname, n->name, &dirlen)) { + n->name_len = dirlen; /* update parent data in place */ + found_parent = n->name; + goto add_names; } + } -update_context: - idx = context->name_count; - if (context->name_count == AUDIT_NAMES) { - printk(KERN_DEBUG "name_count maxed and losing %s\n", - found_name ?: "(null)"); - return; + /* no matching parent, look for matching child */ + for (idx = 0; idx < context->name_count; idx++) { + struct audit_names *n = &context->names[idx]; + + if (!n->name) + continue; + + /* strcmp() is the more likely scenario */ + if (!strcmp(dname, n->name) || + !audit_compare_dname_path(dname, n->name, &dirlen)) { + if (inode) + audit_copy_inode(n, inode); + else + n->ino = (unsigned long)-1; + found_child = n->name; + goto add_names; + } } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - /* Re-use the name belonging to the slot for a matching parent directory. 
- * All names for this context are relinquished in audit_free_names() */ - context->names[idx].name = found_name; - context->names[idx].name_len = AUDIT_NAME_FULL; - context->names[idx].name_put = 0; /* don't call __putname() */ - - if (!inode) - context->names[idx].ino = (unsigned long)-1; - else - audit_copy_inode(&context->names[idx], inode); - - /* A parent was not found in audit_names, so copy the inode data for the - * provided parent. */ - if (!found_name) { - idx = context->name_count; - if (context->name_count == AUDIT_NAMES) { - printk(KERN_DEBUG - "name_count maxed and losing parent inode data: dev=%02x:%02x, inode=%lu", - MAJOR(parent->i_sb->s_dev), - MINOR(parent->i_sb->s_dev), - parent->i_ino); + +add_names: + if (!found_parent) { + if (audit_inc_name_count(context, parent)) return; - } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif + idx = context->name_count - 1; + context->names[idx].name = NULL; audit_copy_inode(&context->names[idx], parent); } + + if (!found_child) { + if (audit_inc_name_count(context, inode)) + return; + idx = context->name_count - 1; + + /* Re-use the name belonging to the slot for a matching parent + * directory. All names for this context are relinquished in + * audit_free_names() */ + if (found_parent) { + context->names[idx].name = found_parent; + context->names[idx].name_len = AUDIT_NAME_FULL; + /* don't call __putname() */ + context->names[idx].name_put = 0; + } else { + context->names[idx].name = NULL; + } + + if (inode) + audit_copy_inode(&context->names[idx], inode); + else + context->names[idx].ino = (unsigned long)-1; + } } /** -- cgit v1.2.3 From 0a4ff8c2598b72f2fa9d50aae9e1809e684dbf41 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Thu, 19 Apr 2007 10:28:21 -0400 Subject: [PATCH] Abnormal End of Processes Hi, I have been working on some code that detects abnormal events based on audit system events. One kind of event that we currently have no visibility for is when a program terminates due to segfault - which should never happen on a production machine. And if it did, you'd want to investigate it. Attached is a patch that collects these events and sends them into the audit system. Signed-off-by: Steve Grubb Signed-off-by: Al Viro --- kernel/auditsc.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5276b7ef05f1..e36481ed61b4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2037,3 +2037,42 @@ int __audit_signal_info(int sig, struct task_struct *t) return 0; } + +/** + * audit_core_dumps - record information about processes that end abnormally + * @sig: signal value + * + * If a process ends with a core dump, something fishy is going on and we + * should record the event for investigation. 
+ */ +void audit_core_dumps(long signr) +{ + struct audit_buffer *ab; + u32 sid; + + if (!audit_enabled) + return; + + if (signr == SIGQUIT) /* don't care for those */ + return; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + audit_log_format(ab, "auid=%u uid=%u gid=%u", + audit_get_loginuid(current->audit_context), + current->uid, current->gid); + selinux_get_task_sid(current, &sid); + if (sid) { + char *ctx = NULL; + u32 len; + + if (selinux_sid_to_string(sid, &ctx, &len)) + audit_log_format(ab, " ssid=%u", sid); + else + audit_log_format(ab, " subj=%s", ctx); + kfree(ctx); + } + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " sig=%ld", signr); + audit_log_end(ab); +} -- cgit v1.2.3 From 6eaeeaba39e5fa3d52a0bb8de15e995516ae251a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 May 2007 22:22:37 -0700 Subject: getrusage(): fill ru_inblock and ru_oublock fields if possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_TASK_IO_ACCOUNTING is defined, we update io accounting counters for each task. This patch permits reporting of values using the well known getrusage() syscall, filling ru_inblock and ru_oublock instead of null values. As TASK_IO_ACCOUNTING currently counts bytes counts, we approximate blocks count doing : nr_blocks = nr_bytes / 512 Example of use : ---------------------- After patch is applied, /usr/bin/time command can now give a good approximation of IO that the process had to do. $ /usr/bin/time grep tototo /usr/include/* Command exited with non-zero status 1 0.00user 0.02system 0:02.11elapsed 1%CPU (0avgtext+0avgdata 0maxresident)k 24288inputs+0outputs (0major+259minor)pagefaults 0swaps $ /usr/bin/time dd if=/dev/zero of=/tmp/testfile count=1000 1000+0 enregistrements lus 1000+0 enregistrements écrits 512000 octets (512 kB) copiés, 0,00326601 seconde, 157 MB/s 0.00user 0.00system 0:00.00elapsed 80%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+3000outputs (0major+299minor)pagefaults 0swaps Signed-off-by: Eric Dumazet Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 +++++++++ kernel/fork.c | 1 + kernel/sys.c | 7 +++++++ 3 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b0c6f0c3a2df..7a5fd77f8fb0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -42,6 +42,7 @@ #include /* for audit_free() */ #include #include +#include #include #include @@ -113,6 +114,8 @@ static void __exit_signal(struct task_struct *tsk) sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->sched_time += tsk->sched_time; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); sig = NULL; /* Marker for below. 
*/ } @@ -1193,6 +1196,12 @@ static int wait_task_zombie(struct task_struct *p, int noreap, p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; + psig->cinblock += + task_io_get_inblock(p) + + sig->inblock + sig->cinblock; + psig->coublock += + task_io_get_oublock(p) + + sig->oublock + sig->coublock; spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 5dd3979747f5..da92e01aba6b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -875,6 +875,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; sig->sched_time = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); diff --git a/kernel/sys.c b/kernel/sys.c index cdb7e9457ba6..ec319bbb0bd4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -2082,6 +2083,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_nivcsw = p->signal->cnivcsw; r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; + r->ru_inblock = p->signal->cinblock; + r->ru_oublock = p->signal->coublock; if (who == RUSAGE_CHILDREN) break; @@ -2093,6 +2096,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; r->ru_majflt += p->signal->maj_flt; + r->ru_inblock += p->signal->inblock; + r->ru_oublock += p->signal->oublock; t = p; do { utime = cputime_add(utime, t->utime); @@ -2101,6 +2106,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; r->ru_majflt += t->maj_flt; + r->ru_inblock += task_io_get_inblock(t); + r->ru_oublock += task_io_get_oublock(t); t = next_thread(t); } while (t != p); break; -- cgit v1.2.3 From a12bb44471b819c7f33d8a83044e7116c124e3d5 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 10 May 2007 22:22:47 -0700 Subject: stop_machine() now uses hard_irq_disable Add a call to hard_irq_disable() to stop_machine so that we make sure IRQs are really disabled and not only lazy-disabled on archs like powerpc as some users of stop_machine() may rely on that. [akpm@linux-foundation.org: build fix] Signed-off-by: Benjamin Herrenschmidt Acked-by: Rusty Russell Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index daabb74ee0bc..fcee2a8e6da3 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -8,6 +8,8 @@ #include #include #include +#include + #include #include #include @@ -45,6 +47,7 @@ static int stopmachine(void *cpu) if (stopmachine_state == STOPMACHINE_DISABLE_IRQ && !irqs_disabled) { local_irq_disable(); + hard_irq_disable(); irqs_disabled = 1; /* Ack: irqs disabled. */ smp_mb(); /* Must read state first. */ @@ -124,6 +127,7 @@ static int stop_machine(void) /* Make them disable irqs. 
*/ local_irq_disable(); + hard_irq_disable(); stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); return 0; -- cgit v1.2.3 From 3e88c553db938ad016026375f0545304b9030b42 Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Thu, 10 May 2007 22:22:53 -0700 Subject: use defines in sys_getpriority/sys_setpriority Switch to the defines for these two checks, instead of hard coding the values. [akpm@linux-foundation.org: add missing include] Signed-off-by: Daniel Walker Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ec319bbb0bd4..df4c3a8f5df9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -659,7 +660,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) int error = -EINVAL; struct pid *pgrp; - if (which > 2 || which < 0) + if (which > PRIO_USER || which < PRIO_PROCESS) goto out; /* normalize: avoid signed division (rounding problems) */ @@ -723,7 +724,7 @@ asmlinkage long sys_getpriority(int which, int who) long niceval, retval = -ESRCH; struct pid *pgrp; - if (which > 2 || which < 0) + if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; read_lock(&tasklist_lock); -- cgit v1.2.3 From e713d0dab21a68500720e222fa02567fc7dfb14b Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 10 May 2007 22:22:58 -0700 Subject: attach_pid() with struct pid parameter attach_pid() currently takes a pid_t and then uses find_pid() to find the corresponding struct pid. Sometimes we already have the struct pid. We can then skip find_pid() if attach_pid() were to take a struct pid parameter. Signed-off-by: Sukadev Bhattiprolu Cc: Cedric Le Goater Cc: Dave Hansen Cc: Serge Hallyn Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++-- kernel/fork.c | 11 +++++++---- kernel/pid.c | 9 ++++++--- kernel/sys.c | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 7a5fd77f8fb0..e93691e9b325 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -302,12 +302,12 @@ void __set_special_pids(pid_t session, pid_t pgrp) if (process_session(curr) != session) { detach_pid(curr, PIDTYPE_SID); set_signal_session(curr->signal, session); - attach_pid(curr, PIDTYPE_SID, session); + attach_pid(curr, PIDTYPE_SID, find_pid(session)); } if (process_group(curr) != pgrp) { detach_pid(curr, PIDTYPE_PGID); curr->signal->pgrp = pgrp; - attach_pid(curr, PIDTYPE_PGID, pgrp); + attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); } } diff --git a/kernel/fork.c b/kernel/fork.c index da92e01aba6b..6031800c94cf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1249,16 +1249,19 @@ static struct task_struct *copy_process(unsigned long clone_flags, __ptrace_link(p, current->parent); if (thread_group_leader(p)) { + pid_t pgid = process_group(current); + pid_t sid = process_session(current); + p->signal->tty = current->signal->tty; - p->signal->pgrp = process_group(current); + p->signal->pgrp = pgid; set_signal_session(p->signal, process_session(current)); - attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, process_session(p)); + attach_pid(p, PIDTYPE_PGID, find_pid(pgid)); + attach_pid(p, PIDTYPE_SID, find_pid(sid)); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } - attach_pid(p, PIDTYPE_PID, p->pid); + attach_pid(p, 
PIDTYPE_PID, find_pid(p->pid)); nr_threads++; } diff --git a/kernel/pid.c b/kernel/pid.c index d3ad724afa83..d76f59326bd4 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,13 +247,16 @@ struct pid * fastcall find_pid(int nr) } EXPORT_SYMBOL_GPL(find_pid); -int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) +/* + * attach_pid() must be called with the tasklist_lock write-held. + */ +int fastcall attach_pid(struct task_struct *task, enum pid_type type, + struct pid *pid) { struct pid_link *link; - struct pid *pid; link = &task->pids[type]; - link->pid = pid = find_pid(nr); + link->pid = pid; hlist_add_head_rcu(&link->node, &pid->tasks[type]); return 0; diff --git a/kernel/sys.c b/kernel/sys.c index df4c3a8f5df9..872271ccc384 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1488,7 +1488,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (process_group(p) != pgid) { detach_pid(p, PIDTYPE_PGID); p->signal->pgrp = pgid; - attach_pid(p, PIDTYPE_PGID, pgid); + attach_pid(p, PIDTYPE_PGID, find_pid(pgid)); } err = 0; -- cgit v1.2.3 From 820e45db2380eb1545fa2bc5d34b8b2f2933faeb Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 10 May 2007 22:23:00 -0700 Subject: statically initialize struct pid for swapper Statically initialize a struct pid for the swapper process (pid_t == 0) and attach it to init_task. This is needed so task_pid(), task_pgrp() and task_session() interfaces work on the swapper process also. Signed-off-by: Sukadev Bhattiprolu Cc: Cedric Le Goater Cc: Dave Hansen Cc: Serge Hallyn Cc: Eric Biederman Cc: Herbert Poetzl Cc: Acked-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index d76f59326bd4..eb66bd2953ab 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -27,11 +27,13 @@ #include #include #include +#include #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; static int pidhash_shift; static struct kmem_cache *pid_cachep; +struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; -- cgit v1.2.3 From 85868995d9d82de5b0de38d695559daddffef893 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 10 May 2007 22:23:03 -0700 Subject: Use struct pid parameter in copy_process() Modify copy_process() to take a struct pid * parameter instead of a pid_t. This simplifies the code a bit and also avoids having to call find_pid() to convert the pid_t to a struct pid. Changelog: - Fixed Badari Pulavarty's comments and passed in &init_struct_pid from fork_idle(). - Fixed Eric Biederman's comments and simplified this patch and used a new patch to remove the likely(pid) check. Signed-off-by: Sukadev Bhattiprolu Cc: Cedric Le Goater Cc: Dave Hansen Cc: Serge Hallyn Cc: Eric Biederman Cc: Acked-by: Eric W. 
Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 6031800c94cf..cf13c44f3da3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -956,7 +956,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr, - int pid) + struct pid *pid) { int retval; struct task_struct *p = NULL; @@ -1023,7 +1023,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); - p->pid = pid; + p->pid = pid_nr(pid); retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) if (put_user(p->pid, parent_tidptr)) @@ -1261,7 +1261,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } - attach_pid(p, PIDTYPE_PID, find_pid(p->pid)); + attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } @@ -1325,7 +1325,8 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct task_struct *task; struct pt_regs regs; - task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, + &init_struct_pid); if (!IS_ERR(task)) init_idle(task, cpu); @@ -1375,7 +1376,7 @@ long do_fork(unsigned long clone_flags, clone_flags |= CLONE_PTRACE; } - p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr); + p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. -- cgit v1.2.3 From 0800d30832ddecf2d9dc40068fed9df95930a8f1 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 10 May 2007 22:23:04 -0700 Subject: Use task_pgrp() task_session() in copy_process() Use task_pgrp() and task_session() in copy_process(), and avoid find_pid() call when attaching the task to its process group and session. Signed-off-by: Sukadev Bhattiprolu Cc: Cedric Le Goater Cc: Dave Hansen Cc: Serge Hallyn Cc: Acked-by: Eric W. 
Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index cf13c44f3da3..083bf8953ce8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1249,14 +1249,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, __ptrace_link(p, current->parent); if (thread_group_leader(p)) { - pid_t pgid = process_group(current); - pid_t sid = process_session(current); - p->signal->tty = current->signal->tty; - p->signal->pgrp = pgid; + p->signal->pgrp = process_group(current); set_signal_session(p->signal, process_session(current)); - attach_pid(p, PIDTYPE_PGID, find_pid(pgid)); - attach_pid(p, PIDTYPE_SID, find_pid(sid)); + attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); + attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; -- cgit v1.2.3 From fba2afaaec790dc5ab4ae8827972f342211bbb86 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Thu, 10 May 2007 22:23:13 -0700 Subject: signal/timer/event: signalfd core This patch series implements the new signalfd() system call. I took part of the original Linus code (and you know how badly it can be broken :), and I added even more breakage ;) Signals are fetched from the same signal queue used by the process, so signalfd will compete with standard kernel delivery in dequeue_signal(). If you want to reliably fetch signals on the signalfd file, you need to block them with sigprocmask(SIG_BLOCK). This seems to be working fine on my Dual Opteron machine. I made a quick test program for it: http://www.xmailserver.org/signafd-test.c The signalfd() system call implements signal delivery into a file descriptor receiver. The signalfd file descriptor is created with the following API: int signalfd(int ufd, const sigset_t *mask, size_t masksize); The "ufd" parameter allows changing an existing signalfd sigmask, w/out going through a close/create cycle (Linus idea). Use "ufd" == -1 if you want a brand new signalfd file. The "mask" parameter specifies the signal mask of signals that we are interested in. The "masksize" parameter is the size of "mask". The signalfd fd supports the poll(2) and read(2) system calls. The poll(2) will return POLLIN when signals are available to be dequeued. As a direct consequence of supporting the Linux poll subsystem, the signalfd fd can be used together with epoll(2) too. The read(2) system call will return a "struct signalfd_siginfo" structure in the userspace supplied buffer. The return value is the number of bytes copied in the supplied buffer, or -1 in case of error. The read(2) call can also return 0, in case the sighand structure to which the signalfd was attached has been orphaned. The O_NONBLOCK flag is also supported, and read(2) will return -EAGAIN in case no signal is available. If the size of the buffer passed to read(2) is lower than sizeof(struct signalfd_siginfo), -EINVAL is returned. A read from the signalfd can also return -ERESTARTSYS in case a signal hits the process. 
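To make the dequeue semantics concrete, here is a minimal sketch in the spirit of the test program linked above. It is written against the glibc wrapper that later grew out of this syscall, signalfd(fd, mask, flags) from <sys/signalfd.h> with ssi_-prefixed field names, so treat the exact prototype as an assumption; the mechanics (block the signal, then read the fd) are exactly as described here:

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/signalfd.h>

    int main(void)
    {
            sigset_t mask;
            struct signalfd_siginfo fdsi;
            int sfd;

            sigemptyset(&mask);
            sigaddset(&mask, SIGINT);
            /* Block normal delivery so the signal stays queued for the fd. */
            if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1)
                    exit(1);

            sfd = signalfd(-1, &mask, 0);
            if (sfd == -1)
                    exit(1);

            /* Blocks until a signal in "mask" is queued; hit ^C to test. */
            if (read(sfd, &fdsi, sizeof(fdsi)) != sizeof(fdsi))
                    exit(1);
            printf("got signal %u from pid %u\n", fdsi.ssi_signo, fdsi.ssi_pid);
            close(sfd);
            return 0;
    }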
The format of the struct signalfd_siginfo is shown below; the valid fields depend on the (->code & __SI_MASK) value, in the same way they would for a struct siginfo: struct signalfd_siginfo { __u32 signo; /* si_signo */ __s32 err; /* si_errno */ __s32 code; /* si_code */ __u32 pid; /* si_pid */ __u32 uid; /* si_uid */ __s32 fd; /* si_fd */ __u32 tid; /* si_tid */ __u32 band; /* si_band */ __u32 overrun; /* si_overrun */ __u32 trapno; /* si_trapno */ __s32 status; /* si_status */ __s32 svint; /* si_int */ __u64 svptr; /* si_ptr */ __u64 utime; /* si_utime */ __u64 stime; /* si_stime */ __u64 addr; /* si_addr */ }; [akpm@linux-foundation.org: fix signalfd_copyinfo() on i386] Signed-off-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 +++++++++ kernel/fork.c | 8 +++++--- kernel/signal.c | 22 ++++++++++++++++++++-- kernel/sys_ni.c | 3 +++ 4 files changed, 37 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e93691e9b325..c6d14b8008dd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,14 @@ static void __exit_signal(struct task_struct *tsk) sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); + /* + * Notify that this sighand has been detached. This must + * be called with the tsk->sighand lock held. Also, this + * access tsk->sighand internally, so it must be called + * before tsk->sighand is reset. + */ + signalfd_detach_locked(tsk); + posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) posix_cpu_timers_exit_group(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 083bf8953ce8..49530e40ea8b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1422,12 +1422,15 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags) +static void sighand_ctor(void *data, struct kmem_cache *cachep, + unsigned long flags) { struct sighand_struct *sighand = data; - if (flags & SLAB_CTOR_CONSTRUCTOR) + if (flags & SLAB_CTOR_CONSTRUCTOR) { spin_lock_init(&sighand->siglock); + INIT_LIST_HEAD(&sighand->signalfd_list); + } } void __init proc_caches_init(void) @@ -1453,7 +1456,6 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); } - /* * Check constraints on flags passed to the unshare system call and * force unsharing of additional process context as appropriate. diff --git a/kernel/signal.c b/kernel/signal.c index 2ac3a668d9dd..34b7d6abce8f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -113,8 +114,7 @@ void recalc_sigpending(void) /* Given the mask, find the first available signal that should be serviced. */ -static int -next_signal(struct sigpending *pending, sigset_t *mask) +int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; @@ -629,6 +629,12 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigqueue * q = NULL; int ret = 0; + /* + * Deliver the signal to listening signalfds. This must be called + * with the sighand lock held. + */ + signalfd_notify(t, sig); + /* * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. 
@@ -1280,6 +1286,11 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) ret = 1; goto out; } + /* + * Deliver the signal to listening signalfds. This must be called + * with the sighand lock held. + */ + signalfd_notify(p, sig); list_add_tail(&q->list, &p->pending.list); sigaddset(&p->pending.signal, sig); @@ -1323,6 +1334,11 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) q->info.si_overrun++; goto out; } + /* + * Deliver the signal to listening signalfds. This must be called + * with the sighand lock held. + */ + signalfd_notify(p, sig); /* * Put this signal on the shared-pending queue. @@ -1983,6 +1999,8 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) /* * If you change siginfo_t structure, please be sure * this code is fixed accordingly. + * Please remember to update the signalfd_copyinfo() function + * inside fs/signalfd.c too, in case siginfo_t changes. * It should never copy any pad contained in the structure * to avoid security leaks, but must copy the generic * 3 ints plus the relevant union member. diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d7306d0f3dfc..807e9bb8fcdb 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -141,3 +141,6 @@ cond_syscall(compat_sys_migrate_pages); cond_syscall(sys_bdflush); cond_syscall(sys_ioprio_set); cond_syscall(sys_ioprio_get); + +/* New file descriptors */ +cond_syscall(sys_signalfd); -- cgit v1.2.3 From b215e283992899650c4271e7385c79e26fb9a88e Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Thu, 10 May 2007 22:23:16 -0700 Subject: signal/timer/event: timerfd core This patch introduces a new system call for timer events delivered through file descriptors. This allows timer events to be used with standard POSIX poll(2), select(2) and read(2). As a consequence of supporting the Linux f_op->poll subsystem, they can be used with epoll(2) too. The system call is defined as: int timerfd(int ufd, int clockid, int flags, const struct itimerspec *utmr); The "ufd" parameter allows for re-use (re-programming) of an existing timerfd w/out going through the close/open cycle (same as signalfd). If "ufd" is -1, a new file descriptor will be created, otherwise the existing "ufd" will be re-programmed. The "clockid" parameter is either CLOCK_MONOTONIC or CLOCK_REALTIME. The time specified in the "utmr->it_value" parameter is the expiry time for the timer. If the TFD_TIMER_ABSTIME flag is set in "flags", this is an absolute time, otherwise it's a relative time. If the time specified in "utmr->it_interval" is not zero (that is, not .tv_sec == 0 and .tv_nsec == 0), this is the period at which the following ticks should be generated. The "utmr->it_interval" should be set to zero if only one tick is requested. Setting the "utmr->it_value" to zero will disable the timer, or will create a timerfd without the timer enabled. The function returns the new (or same, in case "ufd" is a valid timerfd descriptor) file, or -1 in case of error. As stated before, the timerfd file descriptor supports poll(2), select(2) and epoll(2). When a timer event happens on the timerfd, a POLLIN mask will be returned. The read(2) call can be used, and it will return a u32 variable holding the number of "ticks" that happened on the interface since the last call to read(2). The read(2) call supports the O_NONBLOCK flag too, and EAGAIN will be returned if no ticks happened. 
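A minimal usage sketch follows; note it is written against the timerfd_create(2)/timerfd_settime(2) pair that later kernels split this single entry point into, since that is the interface libc exposes, and there read(2) yields a u64 tick count rather than the u32 described above, so treat those details as assumptions:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/timerfd.h>

    int main(void)
    {
            struct itimerspec its = {
                    .it_value    = { .tv_sec = 1, .tv_nsec = 0 }, /* first expiry */
                    .it_interval = { .tv_sec = 1, .tv_nsec = 0 }, /* then periodic */
            };
            uint64_t ticks;
            int i, tfd = timerfd_create(CLOCK_MONOTONIC, 0);

            if (tfd == -1)
                    exit(1);
            if (timerfd_settime(tfd, 0, &its, NULL) == -1)
                    exit(1);

            /* Each read(2) returns the expirations since the previous read. */
            for (i = 0; i < 3; i++) {
                    if (read(tfd, &ticks, sizeof(ticks)) != sizeof(ticks))
                            exit(1);
                    printf("%llu tick(s)\n", (unsigned long long)ticks);
            }
            close(tfd);
            return 0;
    }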
A quick test program shows timerfd working correctly on my amd64 box: http://www.xmailserver.org/timerfd-test.c [akpm@linux-foundation.org: add sys_timerfd to sys_ni.c] Signed-off-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 807e9bb8fcdb..b18f62549515 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -144,3 +144,4 @@ cond_syscall(sys_ioprio_get); /* New file descriptors */ cond_syscall(sys_signalfd); +cond_syscall(sys_timerfd); -- cgit v1.2.3 From 83f5d1266926c75890f1bc4678e49d79483cb573 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Thu, 10 May 2007 22:23:18 -0700 Subject: signal/timer/event: timerfd compat code This patch implements the necessary compat code for the timerfd system call. Signed-off-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index cebb4c28c039..3bae3742c2aa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -475,8 +475,8 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, return min_length; } -static int get_compat_itimerspec(struct itimerspec *dst, - struct compat_itimerspec __user *src) +int get_compat_itimerspec(struct itimerspec *dst, + const struct compat_itimerspec __user *src) { if (get_compat_timespec(&dst->it_interval, &src->it_interval) || get_compat_timespec(&dst->it_value, &src->it_value)) @@ -484,8 +484,8 @@ static int get_compat_itimerspec(struct itimerspec *dst, return 0; } -static int put_compat_itimerspec(struct compat_itimerspec __user *dst, - struct itimerspec *src) +int put_compat_itimerspec(struct compat_itimerspec __user *dst, + const struct itimerspec *src) { if (put_compat_timespec(&src->it_interval, &dst->it_interval) || put_compat_timespec(&src->it_value, &dst->it_value)) -- cgit v1.2.3 From e1ad7468c77ddb94b0615d5f50fa255525fde0f0 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Thu, 10 May 2007 22:23:19 -0700 Subject: signal/timer/event: eventfd core This is a very simple and light file descriptor that can be used as event wait/dispatch by userspace (both wait and dispatch) and by the kernel (dispatch only). It can be used instead of pipe(2) in all cases where those would simply be used to signal events. Their kernel overhead is much lower than pipes, and they do not consume two fds. When used in the kernel, it can offer an fd-bridge to enable, for example, functionalities like KAIO or syslets/threadlets to signal to an fd the completion of certain operations. But more generally, an eventfd can be used by the kernel to signal readiness, in a POSIX poll/select way, of interfaces that would otherwise be incompatible with it. The API is: int eventfd(unsigned int count); The eventfd API accepts an initial "count" parameter, and returns an eventfd fd. It supports poll(2) (POLLIN, POLLOUT, POLLERR), read(2) and write(2). The POLLIN flag is raised when the internal counter is greater than zero. The POLLOUT flag is raised when at least a value of "1" can be written to the internal counter. The POLLERR flag is raised when an overflow in the counter value is detected. The write(2) operation can never overflow the counter, since it blocks (unless O_NONBLOCK is set, in which case -EAGAIN is returned). 
But the eventfd_signal() function can do it, since it's supposed to not sleep during its operation. The read(2) function reads the __u64 counter value, and resets the internal value to zero. If the value read is equal to (__u64) -1, an overflow happened on the internal counter (due to 2^64 eventfd_signal() posts that have never been retired - unlikely, but possible). The write(2) call writes an __u64 count value, and adds it to the current counter. The eventfd fd supports O_NONBLOCK also. On the kernel side, we have: struct file *eventfd_fget(int fd); int eventfd_signal(struct file *file, unsigned int n); The eventfd_fget() should be called to get a struct file* from an eventfd fd (this is an fget() + check of f_op being an eventfd fops pointer). The kernel can then call eventfd_signal() every time it wants to post an event to userspace. The eventfd_signal() function can be called from any context. An eventfd() simple test and bench is available here: http://www.xmailserver.org/eventfd-bench.c This is the eventfd-based version of pipetest-4 (pipe(2) based): http://www.xmailserver.org/pipetest-4.c Not that performance matters much in the eventfd case, but eventfd-bench shows almost double the performance of pipetest-4. [akpm@linux-foundation.org: fix i386 build] [akpm@linux-foundation.org: add sys_eventfd to sys_ni.c] Signed-off-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index b18f62549515..b6d77a8a1ca9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -145,3 +145,4 @@ cond_syscall(sys_ioprio_get); /* New file descriptors */ cond_syscall(sys_signalfd); cond_syscall(sys_timerfd); +cond_syscall(sys_eventfd); -- cgit v1.2.3
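A minimal sketch of the counter semantics described above, using the glibc eventfd(initval, flags) wrapper from <sys/eventfd.h>; the extra flags argument arrived after this patch, so pass 0 to match the single-argument interface described here:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/eventfd.h>

    int main(void)
    {
            uint64_t val;
            int efd = eventfd(0, 0);         /* initial count of zero */

            if (efd == -1)
                    exit(1);

            /* Posting events: write(2) adds to the internal counter. */
            val = 3;
            if (write(efd, &val, sizeof(val)) != sizeof(val))
                    exit(1);

            /* read(2) returns the accumulated count and resets it to zero. */
            if (read(efd, &val, sizeof(val)) != sizeof(val))
                    exit(1);
            printf("counter was %llu\n", (unsigned long long)val);
            close(efd);
            return 0;
    }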