From b3e55c727ff7349c5db722fbdb8d99a151e8e0bf Mon Sep 17 00:00:00 2001
From: "Mao, Bibo" <bibo.mao@intel.com>
Date: Mon, 12 Dec 2005 00:37:00 -0800
Subject: [PATCH] Kprobes: Reference count the modules when probed on it

When a Kprobes are inserted/removed on a modules, the modules must be ref
counted so as not to allow to unload while probes are registered on that
module.

Without this patch, the probed module is free to unload, and when the
probing module unregister the probe, the kpobes code while trying to
replace the original instruction might crash.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Mao Bibo <bibo.mao@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5beda378cc75..fde5a16a2913 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -462,9 +462,16 @@ int __kprobes register_kprobe(struct kprobe *p)
 	int ret = 0;
 	unsigned long flags = 0;
 	struct kprobe *old_p;
+	struct module *mod;
+
+	if ((!kernel_text_address((unsigned long) p->addr)) ||
+		in_kprobes_functions((unsigned long) p->addr))
+		return -EINVAL;
+
+	if ((mod = module_text_address((unsigned long) p->addr)) &&
+			(unlikely(!try_module_get(mod))))
+		return -EINVAL;
 
-	if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
-		return ret;
 	if ((ret = arch_prepare_kprobe(p)) != 0)
 		goto rm_kprobe;
 
@@ -488,6 +495,8 @@ out:
 rm_kprobe:
 	if (ret == -EEXIST)
 		arch_remove_kprobe(p);
+	if (ret && mod)
+		module_put(mod);
 	return ret;
 }
 
@@ -495,6 +504,7 @@ void __kprobes unregister_kprobe(struct kprobe *p)
 {
 	unsigned long flags;
 	struct kprobe *old_p;
+	struct module *mod;
 
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
@@ -506,6 +516,10 @@ void __kprobes unregister_kprobe(struct kprobe *p)
 			cleanup_kprobe(p, flags);
 
 		synchronize_sched();
+
+		if ((mod = module_text_address((unsigned long)p->addr)))
+			module_put(mod);
+
 		if (old_p->pre_handler == aggr_pre_handler &&
 				list_empty(&old_p->list))
 			kfree(old_p);
-- 
cgit v1.2.3


From ab4720ec76b756e1f8705e207a7b392b0453afd6 Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Mon, 12 Dec 2005 00:37:05 -0800
Subject: [PATCH] add rcu_barrier() synchronization point

This introduces a new interface - rcu_barrier() which waits until all
the RCUs queued until this call have been completed.

Reiser4 needs this, because we do more than just freeing memory object
in our RCU callback: we also remove it from the list hanging off
super-block.  This means, that before freeing reiser4-specific portion
of super-block (during umount) we have to wait until all pending RCU
callbacks are executed.

The only change of reiser4 made to the original patch, is exporting of
rcu_barrier().

Cc: Hans Reiser <reiser@namesys.com>
Cc: Vladimir V. Saveliev <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcupdate.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c4d159a21e04..f45b91723dc6 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -116,6 +116,10 @@ void fastcall call_rcu(struct rcu_head *head,
 	local_irq_restore(flags);
 }
 
+static atomic_t rcu_barrier_cpu_count;
+static struct semaphore rcu_barrier_sema;
+static struct completion rcu_barrier_completion;
+
 /**
  * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -162,6 +166,42 @@ long rcu_batches_completed(void)
 	return rcu_ctrlblk.completed;
 }
 
+static void rcu_barrier_callback(struct rcu_head *notused)
+{
+	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+		complete(&rcu_barrier_completion);
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_func(void *notused)
+{
+	int cpu = smp_processor_id();
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	struct rcu_head *head;
+
+	head = &rdp->barrier;
+	atomic_inc(&rcu_barrier_cpu_count);
+	call_rcu(head, rcu_barrier_callback);
+}
+
+/**
+ * rcu_barrier - Wait until all the in-flight RCUs are complete.
+ */
+void rcu_barrier(void)
+{
+	BUG_ON(in_interrupt());
+	/* Take cpucontrol semaphore to protect against CPU hotplug */
+	down(&rcu_barrier_sema);
+	init_completion(&rcu_barrier_completion);
+	atomic_set(&rcu_barrier_cpu_count, 0);
+	on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+	wait_for_completion(&rcu_barrier_completion);
+	up(&rcu_barrier_sema);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
 /*
  * Invoke the completed RCU callbacks. They are expected to be in
  * a per-cpu list.
@@ -457,6 +497,7 @@ static struct notifier_block __devinitdata rcu_nb = {
  */
 void __init rcu_init(void)
 {
+	sema_init(&rcu_barrier_sema, 1);
 	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	/* Register notifier for non-boot CPUs */
-- 
cgit v1.2.3


From 89d46b8778f65223f732d82c0166e0abba20fb1e Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Date: Mon, 12 Dec 2005 00:37:06 -0800
Subject: [PATCH] Fix bug in RCU torture test

While doing some test of RCU torture module, I hit a OOPS in rcu_do_batch,
which was trying to processes callback of a module that was just removed.
This is because we weren't waiting long enough for all callbacks to fire.

Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 88c28d476550..49fbbeff201c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -409,9 +409,8 @@ rcu_torture_cleanup(void)
 	stats_task = NULL;
 
 	/* Wait for all RCU callbacks to fire.  */
+	rcu_barrier();
 
-	for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
-		synchronize_rcu();
 	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
 	printk(KERN_ALERT TORTURE_FLAG
 	       "--- End of test: %s\n",
-- 
cgit v1.2.3


From c3f5902325d3053986e7359f706581d8f032e72f Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Date: Mon, 12 Dec 2005 00:37:07 -0800
Subject: [PATCH] Fix RCU race in access of nohz_cpu_mask

Accessing nohz_cpu_mask before incrementing rcp->cur is racy.  It can cause
tickless idle CPUs to be included in rsp->cpumask, which will extend
graceperiods unnecessarily.

Fix this race.  It has been tested using extensions to RCU torture module
that forces various CPUs to become idle.

Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcupdate.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f45b91723dc6..48d3bce465b8 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -257,15 +257,23 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp,
 
 	if (rcp->next_pending &&
 			rcp->completed == rcp->cur) {
-		/* Can't change, since spin lock held. */
-		cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask);
-
 		rcp->next_pending = 0;
-		/* next_pending == 0 must be visible in __rcu_process_callbacks()
-		 * before it can see new value of cur.
+		/*
+		 * next_pending == 0 must be visible in
+		 * __rcu_process_callbacks() before it can see new value of cur.
 		 */
 		smp_wmb();
 		rcp->cur++;
+
+		/*
+		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
+		 * Barrier  Otherwise it can cause tickless idle CPUs to be
+		 * included in rsp->cpumask, which will extend graceperiods
+		 * unnecessarily.
+		 */
+		smp_mb();
+		cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask);
+
 	}
 }
 
-- 
cgit v1.2.3


From 64123fd42c7a1e4ebf6acd2399c98caddc7e0c26 Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Mon, 12 Dec 2005 00:37:09 -0800
Subject: [PATCH] Add getnstimestamp function

There are several functions that might seem appropriate for a timestamp:

get_cycles()
current_kernel_time()
do_gettimeofday()
<read jiffies/jiffies_64>

Each has problems with combinations of SMP-safety, low resolution, and
monotonicity. This patch adds a new function that returns a monotonic SMP-safe
timestamp with nanosecond resolution where available.

Changes:
	Split timestamp into separate patch
	Moved to kernel/time.c
	Renamed to getnstimestamp
	Fixed unintended-pointer-arithmetic bug

Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index 245d595a13cb..b94bfa8c03e0 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -561,6 +561,28 @@ void getnstimeofday(struct timespec *tv)
 EXPORT_SYMBOL_GPL(getnstimeofday);
 #endif
 
+void getnstimestamp(struct timespec *ts)
+{
+	unsigned int seq;
+	struct timespec wall2mono;
+
+	/* synchronize with settimeofday() changes */
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		getnstimeofday(ts);
+		wall2mono = wall_to_monotonic;
+	} while(unlikely(read_seqretry(&xtime_lock, seq)));
+
+	/* adjust to monotonicaly-increasing values */
+	ts->tv_sec += wall2mono.tv_sec;
+	ts->tv_nsec += wall2mono.tv_nsec;
+	while (unlikely(ts->tv_nsec >= NSEC_PER_SEC)) {
+		ts->tv_nsec -= NSEC_PER_SEC;
+		ts->tv_sec++;
+	}
+}
+EXPORT_SYMBOL_GPL(getnstimestamp);
+
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
-- 
cgit v1.2.3


From adad0f331f9c693129e81e233c5461e2e7c3e443 Mon Sep 17 00:00:00 2001
From: Keshavamurthy Anil S <anil.s.keshavamurthy@intel.com>
Date: Mon, 12 Dec 2005 00:37:12 -0800
Subject: [PATCH] kprobes: fix race in aggregate kprobe registration

When registering multiple kprobes at the same address, we leave a small
window where the kprobe hlist will not contain a reference to the
registered kprobe, leading to potentially, a system crash if the breakpoint
is hit on another processor.

Patch below now automically relpace the old kprobe with the new
kprobe from the hash list.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fde5a16a2913..e4f0fc62bd3e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -399,10 +399,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 	INIT_LIST_HEAD(&ap->list);
 	list_add_rcu(&p->list, &ap->list);
 
-	INIT_HLIST_NODE(&ap->hlist);
-	hlist_del_rcu(&p->hlist);
-	hlist_add_head_rcu(&ap->hlist,
-		&kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]);
+	hlist_replace_rcu(&p->hlist, &ap->hlist);
 }
 
 /*
-- 
cgit v1.2.3


From 7a4ae749a478f8bca73d4b5b8c1b8cbb178b2db5 Mon Sep 17 00:00:00 2001
From: Pierre Ossman <drzeus@drzeus.cx>
Date: Mon, 12 Dec 2005 00:37:22 -0800
Subject: [PATCH] Add try_to_freeze to kauditd

kauditd was causing suspends to fail because it refused to freeze.  Adding
a try_to_freeze() to its sleep loop solves the issue.

Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
Acked-by: Pavel Machek <pavel@suse.cz>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/audit.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 0c56320d38dc..32fa03ad1984 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -291,8 +291,10 @@ int kauditd_thread(void *dummy)
 			set_current_state(TASK_INTERRUPTIBLE);
 			add_wait_queue(&kauditd_wait, &wait);
 
-			if (!skb_queue_len(&audit_skb_queue))
+			if (!skb_queue_len(&audit_skb_queue)) {
+				try_to_freeze();
 				schedule();
+			}
 
 			__set_current_state(TASK_RUNNING);
 			remove_wait_queue(&kauditd_wait, &wait);
-- 
cgit v1.2.3


From 00d7c05ab168c10f9b520e07400923267bc04419 Mon Sep 17 00:00:00 2001
From: Keshavamurthy Anil S <anil.s.keshavamurthy@intel.com>
Date: Mon, 12 Dec 2005 00:37:33 -0800
Subject: [PATCH] kprobes: no probes on critical path

For Kprobes critical path is the path from debug break exception handler
till the control reaches kprobes exception code.  No probes can be
supported in this path as we will end up in recursion.

This patch prevents this by moving the below function to safe __kprobes
section onto which no probes can be inserted.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index bce933ebb29f..eecf84526afe 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -32,6 +32,7 @@
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
+#include <linux/kprobes.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -168,7 +169,7 @@ EXPORT_SYMBOL(notifier_chain_unregister);
  *	of the last notifier function called.
  */
  
-int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
+int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
 {
 	int ret=NOTIFY_DONE;
 	struct notifier_block *nb = *n;
-- 
cgit v1.2.3


From bf8d5c52c3b6b27061e3b7d779057fd9a6cac164 Mon Sep 17 00:00:00 2001
From: Keshavamurthy Anil S <anil.s.keshavamurthy@intel.com>
Date: Mon, 12 Dec 2005 00:37:34 -0800
Subject: [PATCH] kprobes: increment kprobe missed count for multiprobes

When multiple probes are registered at the same address and if due to some
recursion (probe getting triggered within a probe handler), we skip calling
pre_handlers and just increment nmissed field.

The below patch make sure it walks the list for multiple probes case.
Without the below patch we get incorrect results of nmissed count for
multiple probe case.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e4f0fc62bd3e..3bb71e63a37e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -246,6 +246,19 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return ret;
 }
 
+/* Walks the list and increments nmissed count for multiprobe case */
+void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
+{
+	struct kprobe *kp;
+	if (p->pre_handler != aggr_pre_handler) {
+		p->nmissed++;
+	} else {
+		list_for_each_entry_rcu(kp, &p->list, list)
+			kp->nmissed++;
+	}
+	return;
+}
+
 /* Called with kretprobe_lock held */
 struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
 {
-- 
cgit v1.2.3


From 9e28393998d3d0733097306762f6d1c083fc92c6 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Tue, 20 Dec 2005 15:21:24 +0100
Subject: [PATCH] kernel/params.c: fix sysfs access with CONFIG_MODULES=n

All the work was done to setup the file and maintain the file handles but
the access functions were zeroed out due to the #ifdef.  Removing the
#ifdef allows full access to all the parameters when CONFIG_MODULES=n.

akpm: put it back again, but use CONFIG_SYSFS instead.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 47ba69547945..c76ad25e6a21 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -619,7 +619,7 @@ static void __init param_sysfs_builtin(void)
 
 
 /* module-related sysfs stuff */
-#ifdef CONFIG_MODULES
+#ifdef CONFIG_SYSFS
 
 #define to_module_attr(n) container_of(n, struct module_attribute, attr);
 #define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
-- 
cgit v1.2.3


From 8e31108b9f41069d55cb9b019ac8262c55fd2616 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 23 Dec 2005 19:54:46 -0800
Subject: [PATCH] Fix memory ordering problem in wake_futex()

Fix a memory ordering problem that occurs on IA64. The "store" to q->lock_ptr
in wake_futex() can become visible before wake_up_all() clears the lock in the
futex_q.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 5872e3507f35..5e71a6bf6f6b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -270,7 +270,13 @@ static void wake_futex(struct futex_q *q)
 	/*
 	 * The waiting task can free the futex_q as soon as this is written,
 	 * without taking any locks.  This must come last.
+	 *
+	 * A memory barrier is required here to prevent the following store
+	 * to lock_ptr from getting ahead of the wakeup. Clearing the lock
+	 * at the end of wake_up_all() does not prevent this store from
+	 * moving.
 	 */
+	wmb();
 	q->lock_ptr = NULL;
 }
 
-- 
cgit v1.2.3


From 8febdd85adaa41fa1fc1cb31286210fc2cd3ed0c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Fri, 30 Dec 2005 17:18:53 -0800
Subject: sysctl: don't overflow the user-supplied buffer with '\0'

If the string was too long to fit in the user-supplied buffer,
the sysctl layer would zero-terminate it by writing past the
end of the buffer. Don't do that.

Noticed by Yi Yang <yang.y.yi@gmail.com>

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9990e10192e8..ad0425a8f709 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2201,14 +2201,12 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
 		if (get_user(len, oldlenp))
 			return -EFAULT;
 		if (len) {
-			l = strlen(table->data);
+			l = strlen(table->data)+1;
 			if (len > l) len = l;
 			if (len >= table->maxlen)
 				len = table->maxlen;
 			if(copy_to_user(oldval, table->data, len))
 				return -EFAULT;
-			if(put_user(0, ((char __user *) oldval) + len))
-				return -EFAULT;
 			if(put_user(len, oldlenp))
 				return -EFAULT;
 		}
-- 
cgit v1.2.3


From 82c9df820112c6286a8e8fbe482e94b65b49062c Mon Sep 17 00:00:00 2001
From: Yi Yang <yang.y.yi@gmail.com>
Date: Fri, 30 Dec 2005 16:37:10 +0800
Subject: [PATCH] Fix false old value return of sysctl

For the sysctl syscall, if the user wants to get the old value of a
sysctl entry and set a new value for it in the same syscall, the old
value is always overwritten by the new value if the sysctl entry is of
string type and if the user sets its strategy to sysctl_string.  This
issue lies in the strategy being run twice if the strategy is set to
sysctl_string, the general strategy sysctl_string always returns 0 if
success.

Such strategy routines as sysctl_jiffies and sysctl_jiffies_ms return 1
because they do read and write for the sysctl entry.

The strategy routine sysctl_string return 0 although it actually read
and write the sysctl entry.

According to my analysis, if a strategy routine do read and write, it
should return 1, if it just does some necessary check but not read and
write, it should return 0, for example sysctl_intvec.

Signed-off-by: Yi Yang <yang.y.yi@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ad0425a8f709..e5102ea6e104 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2221,7 +2221,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
 			len--;
 		((char *) table->data)[len] = 0;
 	}
-	return 0;
+	return 1;
 }
 
 /*
-- 
cgit v1.2.3


From de9e007d9105bf8fa613a89810feff32a43add03 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sat, 31 Dec 2005 17:00:29 -0800
Subject: sysctl: make sure to terminate strings with a NUL

This is a slightly more complete fix for the previous minimal sysctl
string fix.  It always terminates the returned string with a NUL, even
if the full result wouldn't fit in the user-supplied buffer.

The returned length is the full untruncated length, so that you can
tell when truncation has occurred.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e5102ea6e104..b53115b882e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2192,27 +2192,32 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
 		  void __user *newval, size_t newlen, void **context)
 {
-	size_t l, len;
-	
 	if (!table->data || !table->maxlen) 
 		return -ENOTDIR;
 	
 	if (oldval && oldlenp) {
-		if (get_user(len, oldlenp))
+		size_t bufsize;
+		if (get_user(bufsize, oldlenp))
 			return -EFAULT;
-		if (len) {
-			l = strlen(table->data)+1;
-			if (len > l) len = l;
-			if (len >= table->maxlen)
+		if (bufsize) {
+			size_t len = strlen(table->data), copied;
+
+			/* This shouldn't trigger for a well-formed sysctl */
+			if (len > table->maxlen)
 				len = table->maxlen;
-			if(copy_to_user(oldval, table->data, len))
+
+			/* Copy up to a max of bufsize-1 bytes of the string */
+			copied = (len >= bufsize) ? bufsize - 1 : len;
+
+			if (copy_to_user(oldval, table->data, copied) ||
+			    put_user(0, (char __user *)(oldval + copied)))
 				return -EFAULT;
-			if(put_user(len, oldlenp))
+			if (put_user(len, oldlenp))
 				return -EFAULT;
 		}
 	}
 	if (newval && newlen) {
-		len = newlen;
+		size_t len = newlen;
 		if (len > table->maxlen)
 			len = table->maxlen;
 		if(copy_from_user(table->data, newval, len))
-- 
cgit v1.2.3


From febf7ea4bedcd36fba0843db726bba28d22bf89a Mon Sep 17 00:00:00 2001
From:  <sam@mars.ravnborg.org>
Date: Tue, 3 Jan 2006 11:35:26 +0100
Subject: gitignore: ignore more generated files

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 kernel/.gitignore | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 kernel/.gitignore

(limited to 'kernel')

diff --git a/kernel/.gitignore b/kernel/.gitignore
new file mode 100644
index 000000000000..f2ab70073bd4
--- /dev/null
+++ b/kernel/.gitignore
@@ -0,0 +1,5 @@
+#
+# Generated files
+#
+config_data.h
+config_data.gz
-- 
cgit v1.2.3


From f4b09ebc8baa51ec8394c4173e3de9d62b2cc97a Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 3 Jan 2006 13:37:51 +0100
Subject: update the email address of Randy Dunlap

This patch removes all references to the bouncing address
rddunlap@osdl.org and one dead web page from the kernel.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Randy Dunlap <rdunlap@xenotime.net>
---
 kernel/configs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/configs.c b/kernel/configs.c
index 986f7af31e0a..009e1ebdcb88 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -3,7 +3,7 @@
  * Echo the kernel .config file used to build the kernel
  *
  * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
- * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org>
+ * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
  * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
  * Copyright (C) 2002 Hewlett-Packard Company
  *
-- 
cgit v1.2.3


From 0296b2281352e4794e174b393c37f131502e09f0 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@suse.de>
Date: Fri, 11 Nov 2005 05:33:52 +0100
Subject: [PATCH] remove CONFIG_KOBJECT_UEVENT option

It makes zero sense to have hotplug, but not the netlink
events enabled today. Remove this option and merge the
kobject_uevent.h header into the kobject.h header file.

Signed-off-by: Kay Sievers <kay.sievers@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/sysctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b53115b882e1..6a51e25d4466 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -31,6 +31,7 @@
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/kobject.h>
 #include <linux/net.h>
 #include <linux/sysrq.h>
 #include <linux/highuid.h>
@@ -83,9 +84,6 @@ static int ngroups_max = NGROUPS_MAX;
 #ifdef CONFIG_KMOD
 extern char modprobe_path[];
 #endif
-#ifdef CONFIG_HOTPLUG
-extern char hotplug_path[];
-#endif
 #ifdef CONFIG_CHR_DEV_SG
 extern int sg_big_buff;
 #endif
-- 
cgit v1.2.3


From 0f76e5acf9dc788e664056dda1e461f0bec93948 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@suse.de>
Date: Fri, 11 Nov 2005 04:58:04 +0100
Subject: [PATCH] add uevent_helper control in /sys/kernel/

This deprecates the /proc/sys/kernel/hotplug file, as all
this stuff should be in /sys some day, right? :)
In /sys/kernel/ we have now uevent_seqnum and uevent_helper.
The seqnum is no longer used by udev, as the version for this
kernel depends on netlink which events will never get
out-of-order.

Recent udev versions disable the /sbin/hotplug helper with
an init script, cause it leads to OOM on big boxes by running
hundreds of shells in parallel. It should be done now by:
  echo "" > /sys/kernel/uevent_helper

(Note that "-n" does not work, cause neighter proc nor sysfs
support truncate().)

Signed-off-by: Kay Sievers <kay.sievers@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 015fb69ad94d..e975a76a9d5b 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -23,11 +23,29 @@ static struct subsys_attribute _name##_attr = \
 	__ATTR(_name, 0644, _name##_show, _name##_store)
 
 #ifdef CONFIG_HOTPLUG
-static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
+/* current uevent sequence number */
+static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
 {
 	return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum);
 }
-KERNEL_ATTR_RO(hotplug_seqnum);
+KERNEL_ATTR_RO(uevent_seqnum);
+
+/* uevent helper program, used during early boo */
+static ssize_t uevent_helper_show(struct subsystem *subsys, char *page)
+{
+	return sprintf(page, "%s\n", hotplug_path);
+}
+static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count)
+{
+	if (count+1 > HOTPLUG_PATH_LEN)
+		return -ENOENT;
+	memcpy(hotplug_path, page, count);
+	hotplug_path[count] = '\0';
+	if (count && hotplug_path[count-1] == '\n')
+		hotplug_path[count-1] = '\0';
+	return count;
+}
+KERNEL_ATTR_RW(uevent_helper);
 #endif
 
 #ifdef CONFIG_KEXEC
@@ -45,7 +63,8 @@ EXPORT_SYMBOL_GPL(kernel_subsys);
 
 static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_HOTPLUG
-	&hotplug_seqnum_attr.attr,
+	&uevent_seqnum_attr.attr,
+	&uevent_helper_attr.attr,
 #endif
 #ifdef CONFIG_KEXEC
 	&crash_notes_attr.attr,
-- 
cgit v1.2.3


From 312c004d36ce6c739512bac83b452f4c20ab1f62 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@suse.de>
Date: Wed, 16 Nov 2005 09:00:00 +0100
Subject: [PATCH] driver core: replace "hotplug" by "uevent"

Leave the overloaded "hotplug" word to susbsystems which are handling
real devices. The driver core does not "plug" anything, it just exports
the state to userspace and generates events.

Signed-off-by: Kay Sievers <kay.sievers@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c | 14 +++++++-------
 kernel/sysctl.c |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e975a76a9d5b..bfb4a7a54e22 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -26,23 +26,23 @@ static struct subsys_attribute _name##_attr = \
 /* current uevent sequence number */
 static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum);
+	return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
 }
 KERNEL_ATTR_RO(uevent_seqnum);
 
 /* uevent helper program, used during early boo */
 static ssize_t uevent_helper_show(struct subsystem *subsys, char *page)
 {
-	return sprintf(page, "%s\n", hotplug_path);
+	return sprintf(page, "%s\n", uevent_helper);
 }
 static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count)
 {
-	if (count+1 > HOTPLUG_PATH_LEN)
+	if (count+1 > UEVENT_HELPER_PATH_LEN)
 		return -ENOENT;
-	memcpy(hotplug_path, page, count);
-	hotplug_path[count] = '\0';
-	if (count && hotplug_path[count-1] == '\n')
-		hotplug_path[count-1] = '\0';
+	memcpy(uevent_helper, page, count);
+	uevent_helper[count] = '\0';
+	if (count && uevent_helper[count-1] == '\n')
+		uevent_helper[count-1] = '\0';
 	return count;
 }
 KERNEL_ATTR_RW(uevent_helper);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6a51e25d4466..345f4a1d533f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -395,8 +395,8 @@ static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_HOTPLUG,
 		.procname	= "hotplug",
-		.data		= &hotplug_path,
-		.maxlen		= HOTPLUG_PATH_LEN,
+		.data		= &uevent_helper,
+		.maxlen		= UEVENT_HELPER_PATH_LEN,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
-- 
cgit v1.2.3


From f743ca5e10f4145e0b3e6d11b9b46171e16af7ce Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org" <akpm@osdl.org>
Date: Tue, 22 Nov 2005 23:36:13 -0800
Subject: [PATCH] kobject_uevent CONFIG_NET=n fix

lib/lib.a(kobject_uevent.o)(.text+0x25f): In function `kobject_uevent':
: undefined reference to `__alloc_skb'
lib/lib.a(kobject_uevent.o)(.text+0x2a1): In function `kobject_uevent':
: undefined reference to `skb_over_panic'
lib/lib.a(kobject_uevent.o)(.text+0x31d): In function `kobject_uevent':
: undefined reference to `skb_over_panic'
lib/lib.a(kobject_uevent.o)(.text+0x356): In function `kobject_uevent':
: undefined reference to `netlink_broadcast'
lib/lib.a(kobject_uevent.o)(.init.text+0x9): In function `kobject_uevent_init':
: undefined reference to `netlink_kernel_create'
make: *** [.tmp_vmlinux1] Error 1

Netlink is unconditionally enabled if CONFIG_NET, so that's OK.

kobject_uevent.o is compiled even if !CONFIG_HOTPLUG, which is lazy.

Let's compound the sin.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index bfb4a7a54e22..99af8b05eeaa 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -15,6 +15,9 @@
 #include <linux/module.h>
 #include <linux/init.h>
 
+u64 uevent_seqnum;
+char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
+
 #define KERNEL_ATTR_RO(_name) \
 static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
 
-- 
cgit v1.2.3


From a576219aca70e6700705a9836e098dbecd25fb56 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 6 Jan 2006 00:09:50 -0800
Subject: [PATCH] swsusp: resume_store() retval fix

- This function returns -EINVAL all the time.  Fix.

- Decruftify it a bit too.

- Writing to it doesn't seem to do what it's suppoed to do.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 027322a564f4..4d944b281b28 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -363,30 +363,28 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf)
 		       MINOR(swsusp_resume_device));
 }
 
-static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n)
+static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
 {
-	int len;
-	char *p;
 	unsigned int maj, min;
-	int error = -EINVAL;
 	dev_t res;
+	int ret = -EINVAL;
 
-	p = memchr(buf, '\n', n);
-	len = p ? p - buf : n;
+	if (sscanf(buf, "%u:%u", &maj, &min) != 2)
+		goto out;
 
-	if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
-		res = MKDEV(maj,min);
-		if (maj == MAJOR(res) && min == MINOR(res)) {
-			down(&pm_sem);
-			swsusp_resume_device = res;
-			up(&pm_sem);
-			printk("Attempting manual resume\n");
-			noresume = 0;
-			software_resume();
-		}
-	}
+	res = MKDEV(maj,min);
+	if (maj != MAJOR(res) || min != MINOR(res))
+		goto out;
 
-	return error >= 0 ? n : error;
+	down(&pm_sem);
+	swsusp_resume_device = res;
+	up(&pm_sem);
+	printk("Attempting manual resume\n");
+	noresume = 0;
+	software_resume();
+	ret = n;
+out:
+	return ret;
 }
 
 power_attr(resume);
-- 
cgit v1.2.3


From 7ee1dd3fee22f15728f545d266403fc977e1eb99 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:44 -0800
Subject: [PATCH] FRV: Make futex code compilable on nommu [try #2]

Make the futex code compilable and usable on NOMMU by making the attempt to
handle page faults conditional on CONFIG_MMU.  If this is not enabled, then
we can assume that EFAULT returned from futex_atomic_op_inuser() is not
recoverable, and that the address lies outside of valid memory.

handle_mm_fault() is made to BUG if called on NOMMU without attempting to
invoke the actual handler (__handle_mm_fault).

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 5e71a6bf6f6b..5efa2f978032 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -356,6 +356,13 @@ retry:
 		if (bh1 != bh2)
 			spin_unlock(&bh2->lock);
 
+#ifndef CONFIG_MMU
+		/* we don't get EFAULT from MMU faults if we don't have an MMU,
+		 * but we might get them from range checking */
+		ret = op_ret;
+		goto out;
+#endif
+
 		if (unlikely(op_ret != -EFAULT)) {
 			ret = op_ret;
 			goto out;
-- 
cgit v1.2.3


From eee45269b0f5979c70bc151c6c2f4e5f4f5ababe Mon Sep 17 00:00:00 2001
From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Date: Fri, 6 Jan 2006 00:12:21 -0800
Subject: [PATCH] Alpha: convert to generic irq framework (generic part)

Thanks to Christoph for doing most of the work.

This allows automatic SMP IRQ affinity assignment other than default "all
interrupts on all CPUs" which is rather expensive.  This might be useful if
the hardware can be programmed to distribute interrupts among different
CPUs, like Alpha does.

Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Richard Henderson <rth@twiddle.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/manage.c | 2 ++
 kernel/irq/proc.c   | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 81c49a4d679e..97d5559997d2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -366,6 +366,8 @@ int request_irq(unsigned int irq,
 	action->next = NULL;
 	action->dev_id = dev_id;
 
+	select_smp_affinity(irq);
+
 	retval = setup_irq(irq, action);
 	if (retval)
 		kfree(action);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f26e534c6585..8a64a4844cde 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -68,7 +68,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	 */
 	cpus_and(tmp, new_value, cpu_online_map);
 	if (cpus_empty(tmp))
-		return -EINVAL;
+		/* Special case for empty set - allow the architecture
+		   code to set default SMP affinity. */
+		return select_smp_affinity(irq) ? -EINVAL : full_count;
 
 	proc_set_irq_affinity(irq, new_value);
 
-- 
cgit v1.2.3


From f2d97f02961e8b1f8a24befb88ab0e5c886586ff Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:12:24 -0800
Subject: [PATCH] swsusp: remove encryption

This patch removes the image encryption that is only used by swsusp instead of
zeroing the image after resume in order to prevent someone from reading some
confidential data from it in the future and it does not protect the image from
being read by an unauthorized person before resume.  The functionality it
provides should really belong to the user space and will possibly be
reimplemented after the swap-handling functionality of swsusp is moved to the
user space.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 163 ++------------------------------------------------
 1 file changed, 4 insertions(+), 159 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c05f46e7348f..bd3097c583bf 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -30,9 +30,6 @@
  * Alex Badea <vampire@go.ro>:
  * Fixed runaway init
  *
- * Andreas Steinmetz <ast@domdv.de>:
- * Added encrypted suspend option
- *
  * More state savers are welcome. Especially for the scsi layer...
  *
  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -67,10 +64,6 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
-#include <linux/random.h>
-#include <linux/crypto.h>
-#include <asm/scatterlist.h>
-
 #include "power.h"
 
 #ifdef CONFIG_HIGHMEM
@@ -81,10 +74,6 @@ static int save_highmem(void) { return 0; }
 static int restore_highmem(void) { return 0; }
 #endif
 
-#define CIPHER "aes"
-#define MAXKEY 32
-#define MAXIV  32
-
 extern char resume_file[];
 
 /* Local variables that should not be affected by save */
@@ -102,8 +91,7 @@ suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
 #define SWSUSP_SIG	"S1SUSPEND"
 
 static struct swsusp_header {
-	char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)];
-	u8 key_iv[MAXKEY+MAXIV];
+	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
 	swp_entry_t swsusp_info;
 	char	orig_sig[10];
 	char	sig[10];
@@ -123,131 +111,6 @@ static struct swsusp_info swsusp_info;
 static unsigned short swapfile_used[MAX_SWAPFILES];
 static unsigned short root_swap;
 
-static int write_page(unsigned long addr, swp_entry_t *loc);
-static int bio_read_page(pgoff_t page_off, void *page);
-
-static u8 key_iv[MAXKEY+MAXIV];
-
-#ifdef CONFIG_SWSUSP_ENCRYPT
-
-static int crypto_init(int mode, void **mem)
-{
-	int error = 0;
-	int len;
-	char *modemsg;
-	struct crypto_tfm *tfm;
-
-	modemsg = mode ? "suspend not possible" : "resume not possible";
-
-	tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
-	if(!tfm) {
-		printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
-		error = -EINVAL;
-		goto out;
-	}
-
-	if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
-		printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
-		error = -ENOKEY;
-		goto fail;
-	}
-
-	if (mode)
-		get_random_bytes(key_iv, MAXKEY+MAXIV);
-
-	len = crypto_tfm_alg_max_keysize(tfm);
-	if (len > MAXKEY)
-		len = MAXKEY;
-
-	if (crypto_cipher_setkey(tfm, key_iv, len)) {
-		printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
-		error = -EKEYREJECTED;
-		goto fail;
-	}
-
-	len = crypto_tfm_alg_ivsize(tfm);
-
-	if (MAXIV < len) {
-		printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
-		error = -EOVERFLOW;
-		goto fail;
-	}
-
-	crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
-
-	*mem=(void *)tfm;
-
-	goto out;
-
-fail:	crypto_free_tfm(tfm);
-out:	return error;
-}
-
-static __inline__ void crypto_exit(void *mem)
-{
-	crypto_free_tfm((struct crypto_tfm *)mem);
-}
-
-static __inline__ int crypto_write(struct pbe *p, void *mem)
-{
-	int error = 0;
-	struct scatterlist src, dst;
-
-	src.page   = virt_to_page(p->address);
-	src.offset = 0;
-	src.length = PAGE_SIZE;
-	dst.page   = virt_to_page((void *)&swsusp_header);
-	dst.offset = 0;
-	dst.length = PAGE_SIZE;
-
-	error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
-					PAGE_SIZE);
-
-	if (!error)
-		error = write_page((unsigned long)&swsusp_header,
-				&(p->swap_address));
-	return error;
-}
-
-static __inline__ int crypto_read(struct pbe *p, void *mem)
-{
-	int error = 0;
-	struct scatterlist src, dst;
-
-	error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
-	if (!error) {
-		src.offset = 0;
-		src.length = PAGE_SIZE;
-		dst.offset = 0;
-		dst.length = PAGE_SIZE;
-		src.page = dst.page = virt_to_page((void *)p->address);
-
-		error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
-						&src, PAGE_SIZE);
-	}
-	return error;
-}
-#else
-static __inline__ int crypto_init(int mode, void *mem)
-{
-	return 0;
-}
-
-static __inline__ void crypto_exit(void *mem)
-{
-}
-
-static __inline__ int crypto_write(struct pbe *p, void *mem)
-{
-	return write_page(p->address, &(p->swap_address));
-}
-
-static __inline__ int crypto_read(struct pbe *p, void *mem)
-{
-	return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
-}
-#endif
-
 static int mark_swapfiles(swp_entry_t prev)
 {
 	int error;
@@ -259,7 +122,6 @@ static int mark_swapfiles(swp_entry_t prev)
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
-		memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV);
 		swsusp_header.swsusp_info = prev;
 		error = rw_swap_page_sync(WRITE,
 					  swp_entry(root_swap, 0),
@@ -405,10 +267,6 @@ static int data_write(void)
 	int error = 0, i = 0;
 	unsigned int mod = nr_copy_pages / 100;
 	struct pbe *p;
-	void *tfm;
-
-	if ((error = crypto_init(1, &tfm)))
-		return error;
 
 	if (!mod)
 		mod = 1;
@@ -417,14 +275,11 @@ static int data_write(void)
 	for_each_pbe (p, pagedir_nosave) {
 		if (!(i%mod))
 			printk( "\b\b\b\b%3d%%", i / mod );
-		if ((error = crypto_write(p, tfm))) {
-			crypto_exit(tfm);
+		if ((error = write_page(p->address, &p->swap_address)))
 			return error;
-		}
 		i++;
 	}
 	printk("\b\b\b\bdone\n");
-	crypto_exit(tfm);
 	return error;
 }
 
@@ -550,7 +405,6 @@ static int write_suspend_image(void)
 	if ((error = close_swap()))
 		goto FreePagedir;
  Done:
-	memset(key_iv, 0, MAXKEY+MAXIV);
 	return error;
  FreePagedir:
 	free_pagedir_entries();
@@ -812,8 +666,6 @@ static int check_sig(void)
 		return error;
 	if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
-		memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
-		memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
 
 		/*
 		 * Reset swap signature now.
@@ -840,10 +692,6 @@ static int data_read(struct pbe *pblist)
 	int error = 0;
 	int i = 0;
 	int mod = swsusp_info.image_pages / 100;
-	void *tfm;
-
-	if ((error = crypto_init(0, &tfm)))
-		return error;
 
 	if (!mod)
 		mod = 1;
@@ -855,15 +703,13 @@ static int data_read(struct pbe *pblist)
 		if (!(i % mod))
 			printk("\b\b\b\b%3d%%", i / mod);
 
-		if ((error = crypto_read(p, tfm))) {
-			crypto_exit(tfm);
+		if ((error = bio_read_page(swp_offset(p->swap_address),
+						(void *)p->address)))
 			return error;
-		}
 
 		i++;
 	}
 	printk("\b\b\b\bdone\n");
-	crypto_exit(tfm);
 	return error;
 }
 
@@ -986,7 +832,6 @@ int swsusp_read(void)
 
 	error = read_suspend_image();
 	blkdev_put(resume_bdev);
-	memset(key_iv, 0, MAXKEY+MAXIV);
 
 	if (!error)
 		pr_debug("swsusp: Reading resume file was successful\n");
-- 
cgit v1.2.3


From 7088a5c00103ef48782d6c359cd12b13a10666e6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:13:05 -0800
Subject: [PATCH] swsusp: introduce the swap map structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces the swap map structure that can be used by swsusp for
keeping tracks of data pages written to the swap.   The structure itself is
described in a comment within the patch.

The overall idea is to reduce the amount of metadata written to the swap and
to write and read the image pages sequentially, in a file-alike way.  This
makes the swap-handling part of swsusp fairly independent of its
snapshot-handling part and will hopefully allow us to completely separate
these two parts in the future.

This patch is needed to remove the suspend image size limit imposed by the
limited size of the swsusp_info structure, which is essential for x86-64
systems with more than 512 MB of RAM.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c     |   8 +-
 kernel/power/power.h    |  13 +-
 kernel/power/snapshot.c |  14 +-
 kernel/power/swsusp.c   | 558 ++++++++++++++++++++++++++++++++++--------------
 4 files changed, 417 insertions(+), 176 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4d944b281b28..76a5131b0e80 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -25,9 +25,9 @@
 extern suspend_disk_method_t pm_disk_mode;
 
 extern int swsusp_suspend(void);
-extern int swsusp_write(void);
+extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
 extern int swsusp_check(void);
-extern int swsusp_read(void);
+extern int swsusp_read(struct pbe **pblist_ptr);
 extern void swsusp_close(void);
 extern int swsusp_resume(void);
 
@@ -176,7 +176,7 @@ int pm_suspend_disk(void)
 	if (in_suspend) {
 		device_resume();
 		pr_debug("PM: writing image.\n");
-		error = swsusp_write();
+		error = swsusp_write(pagedir_nosave, nr_copy_pages);
 		if (!error)
 			power_down(pm_disk_mode);
 		else {
@@ -247,7 +247,7 @@ static int software_resume(void)
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	if ((error = swsusp_read())) {
+	if ((error = swsusp_read(&pagedir_nosave))) {
 		swsusp_free();
 		goto Thaw;
 	}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 6c042b5ee14b..977877c6dcfc 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -9,19 +9,14 @@
 #define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
 #endif
 
-#define MAX_PBES	((PAGE_SIZE - sizeof(struct new_utsname) \
-			- 4 - 3*sizeof(unsigned long) - sizeof(int) \
-			- sizeof(void *)) / sizeof(swp_entry_t))
-
 struct swsusp_info {
 	struct new_utsname	uts;
 	u32			version_code;
 	unsigned long		num_physpages;
 	int			cpus;
 	unsigned long		image_pages;
-	unsigned long		pagedir_pages;
-	suspend_pagedir_t	* suspend_pagedir;
-	swp_entry_t		pagedir[MAX_PBES];
+	unsigned long		pages;
+	swp_entry_t		start;
 } __attribute__((aligned(PAGE_SIZE)));
 
 
@@ -67,6 +62,8 @@ extern asmlinkage int swsusp_arch_resume(void);
 
 extern void free_pagedir(struct pbe *pblist);
 extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
-extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
 extern void swsusp_free(void);
 extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
+extern unsigned int snapshot_nr_pages(void);
+extern struct pbe *snapshot_pblist(void);
+extern void snapshot_pblist_set(struct pbe *pblist);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4a6dbcefd378..152d56cdf017 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -33,6 +33,9 @@
 
 #include "power.h"
 
+struct pbe *pagedir_nosave;
+unsigned int nr_copy_pages;
+
 #ifdef CONFIG_HIGHMEM
 struct highmem_page {
 	char *data;
@@ -244,7 +247,7 @@ static inline void fill_pb_page(struct pbe *pbpage)
  *	of memory pages allocated with alloc_pagedir()
  */
 
-void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
+static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 {
 	struct pbe *pbpage, *p;
 	unsigned int num = PBES_PER_PAGE;
@@ -261,7 +264,6 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 			p->next = p + 1;
 		p->next = NULL;
 	}
-	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
 }
 
 /**
@@ -332,7 +334,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 	if (!pbe) { /* get_zeroed_page() failed */
 		free_pagedir(pblist);
 		pblist = NULL;
-        }
+        } else
+        	create_pbe_list(pblist, nr_pages);
 	return pblist;
 }
 
@@ -395,7 +398,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages)
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
 		return NULL;
 	}
-	create_pbe_list(pblist, nr_pages);
 
 	if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
 		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
@@ -421,10 +423,6 @@ asmlinkage int swsusp_save(void)
 		 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
 		 PAGES_FOR_IO, nr_free_pages());
 
-	/* This is needed because of the fixed size of swsusp_info */
-	if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE)
-		return -ENOSPC;
-
 	if (!enough_free_mem(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free memory\n");
 		return -ENOMEM;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index bd3097c583bf..b09bd7c0998d 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -30,6 +30,9 @@
  * Alex Badea <vampire@go.ro>:
  * Fixed runaway init
  *
+ * Rafael J. Wysocki <rjw@sisk.pl>
+ * Added the swap map data structure and reworked the handling of swap
+ *
  * More state savers are welcome. Especially for the scsi layer...
  *
  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -76,18 +79,6 @@ static int restore_highmem(void) { return 0; }
 
 extern char resume_file[];
 
-/* Local variables that should not be affected by save */
-unsigned int nr_copy_pages __nosavedata = 0;
-
-/* Suspend pagedir is allocated before final copy, therefore it
-   must be freed after resume
-
-   Warning: this is even more evil than it seems. Pagedirs this file
-   talks about are completely different from page directories used by
-   MMU hardware.
- */
-suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
-
 #define SWSUSP_SIG	"S1SUSPEND"
 
 static struct swsusp_header {
@@ -238,48 +229,205 @@ static int write_page(unsigned long addr, swp_entry_t *loc)
 }
 
 /**
- *	data_free - Free the swap entries used by the saved image.
+ *	Swap map-handling functions
+ *
+ *	The swap map is a data structure used for keeping track of each page
+ *	written to the swap.  It consists of many swap_map_page structures
+ *	that contain each an array of MAP_PAGE_SIZE swap entries.
+ *	These structures are linked together with the help of either the
+ *	.next (in memory) or the .next_swap (in swap) member.
  *
- *	Walk the list of used swap entries and free each one.
- *	This is only used for cleanup when suspend fails.
+ *	The swap map is created during suspend.  At that time we need to keep
+ *	it in memory, because we have to free all of the allocated swap
+ *	entries if an error occurs.  The memory needed is preallocated
+ *	so that we know in advance if there's enough of it.
+ *
+ *	The first swap_map_page structure is filled with the swap entries that
+ *	correspond to the first MAP_PAGE_SIZE data pages written to swap and
+ *	so on.  After the all of the data pages have been written, the order
+ *	of the swap_map_page structures in the map is reversed so that they
+ *	can be read from swap in the original order.  This causes the data
+ *	pages to be loaded in exactly the same order in which they have been
+ *	saved.
+ *
+ *	During resume we only need to use one swap_map_page structure
+ *	at a time, which means that we only need to use two memory pages for
+ *	reading the image - one for reading the swap_map_page structures
+ *	and the second for reading the data pages from swap.
  */
-static void data_free(void)
+
+#define MAP_PAGE_SIZE	((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
+			/ sizeof(swp_entry_t))
+
+struct swap_map_page {
+	swp_entry_t		entries[MAP_PAGE_SIZE];
+	swp_entry_t		next_swap;
+	struct swap_map_page	*next;
+};
+
+static inline void free_swap_map(struct swap_map_page *swap_map)
 {
-	swp_entry_t entry;
-	struct pbe *p;
+	struct swap_map_page *swp;
 
-	for_each_pbe (p, pagedir_nosave) {
-		entry = p->swap_address;
-		if (entry.val)
-			swap_free(entry);
-		else
-			break;
+	while (swap_map) {
+		swp = swap_map->next;
+		free_page((unsigned long)swap_map);
+		swap_map = swp;
+	}
+}
+
+static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
+{
+	struct swap_map_page *swap_map, *swp;
+	unsigned n = 0;
+
+	if (!nr_pages)
+		return NULL;
+
+	pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
+	swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+	swp = swap_map;
+	for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
+		swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+		swp = swp->next;
+		if (!swp) {
+			free_swap_map(swap_map);
+			return NULL;
+		}
 	}
+	return swap_map;
 }
 
 /**
- *	data_write - Write saved image to swap.
- *
- *	Walk the list of pages in the image and sync each one to swap.
+ *	reverse_swap_map - reverse the order of pages in the swap map
+ *	@swap_map
  */
-static int data_write(void)
+
+static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
 {
-	int error = 0, i = 0;
-	unsigned int mod = nr_copy_pages / 100;
-	struct pbe *p;
+	struct swap_map_page *prev, *next;
+
+	prev = NULL;
+	while (swap_map) {
+		next = swap_map->next;
+		swap_map->next = prev;
+		prev = swap_map;
+		swap_map = next;
+	}
+	return prev;
+}
 
-	if (!mod)
-		mod = 1;
+/**
+ *	free_swap_map_entries - free the swap entries allocated to store
+ *	the swap map @swap_map (this is only called in case of an error)
+ */
+static inline void free_swap_map_entries(struct swap_map_page *swap_map)
+{
+	while (swap_map) {
+		if (swap_map->next_swap.val)
+			swap_free(swap_map->next_swap);
+		swap_map = swap_map->next;
+	}
+}
 
-	printk( "Writing data to swap (%d pages)...     ", nr_copy_pages );
-	for_each_pbe (p, pagedir_nosave) {
-		if (!(i%mod))
-			printk( "\b\b\b\b%3d%%", i / mod );
-		if ((error = write_page(p->address, &p->swap_address)))
+/**
+ *	save_swap_map - save the swap map used for tracing the data pages
+ *	stored in the swap
+ */
+
+static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
+{
+	swp_entry_t entry = (swp_entry_t){0};
+	int error;
+
+	while (swap_map) {
+		swap_map->next_swap = entry;
+		if ((error = write_page((unsigned long)swap_map, &entry)))
 			return error;
-		i++;
+		swap_map = swap_map->next;
 	}
-	printk("\b\b\b\bdone\n");
+	*start = entry;
+	return 0;
+}
+
+/**
+ *	free_image_entries - free the swap entries allocated to store
+ *	the image data pages (this is only called in case of an error)
+ */
+
+static inline void free_image_entries(struct swap_map_page *swp)
+{
+	unsigned k;
+
+	while (swp) {
+		for (k = 0; k < MAP_PAGE_SIZE; k++)
+			if (swp->entries[k].val)
+				swap_free(swp->entries[k]);
+		swp = swp->next;
+	}
+}
+
+/**
+ *	The swap_map_handle structure is used for handling the swap map in
+ *	a file-alike way
+ */
+
+struct swap_map_handle {
+	struct swap_map_page *cur;
+	unsigned int k;
+};
+
+static inline void init_swap_map_handle(struct swap_map_handle *handle,
+                                        struct swap_map_page *map)
+{
+	handle->cur = map;
+	handle->k = 0;
+}
+
+static inline int swap_map_write_page(struct swap_map_handle *handle,
+                                      unsigned long addr)
+{
+	int error;
+
+	error = write_page(addr, handle->cur->entries + handle->k);
+	if (error)
+		return error;
+	if (++handle->k >= MAP_PAGE_SIZE) {
+		handle->cur = handle->cur->next;
+		handle->k = 0;
+	}
+	return 0;
+}
+
+/**
+ *	save_image_data - save the data pages pointed to by the PBEs
+ *	from the list @pblist using the swap map handle @handle
+ *	(assume there are @nr_pages data pages to save)
+ */
+
+static int save_image_data(struct pbe *pblist,
+                           struct swap_map_handle *handle,
+                           unsigned int nr_pages)
+{
+	unsigned int m;
+	struct pbe *p;
+	int error = 0;
+
+	printk("Saving image data pages (%u pages) ...     ", nr_pages);
+	m = nr_pages / 100;
+	if (!m)
+		m = 1;
+	nr_pages = 0;
+	for_each_pbe (p, pblist) {
+		error = swap_map_write_page(handle, p->address);
+		if (error)
+			break;
+		if (!(nr_pages % m))
+			printk("\b\b\b\b%3d%%", nr_pages / m);
+		nr_pages++;
+	}
+	if (!error)
+		printk("\b\b\b\bdone\n");
 	return error;
 }
 
@@ -295,19 +443,20 @@ static void dump_info(void)
 	pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
 	pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
 	pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
-	pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages);
+	pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
 }
 
-static void init_header(void)
+static void init_header(unsigned int nr_pages)
 {
 	memset(&swsusp_info, 0, sizeof(swsusp_info));
 	swsusp_info.version_code = LINUX_VERSION_CODE;
 	swsusp_info.num_physpages = num_physpages;
 	memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
 
-	swsusp_info.suspend_pagedir = pagedir_nosave;
 	swsusp_info.cpus = num_online_cpus();
-	swsusp_info.image_pages = nr_copy_pages;
+	swsusp_info.image_pages = nr_pages;
+	swsusp_info.pages = nr_pages +
+		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT);
 }
 
 static int close_swap(void)
@@ -326,39 +475,53 @@ static int close_swap(void)
 }
 
 /**
- *	free_pagedir_entries - Free pages used by the page directory.
- *
- *	This is used during suspend for error recovery.
+ *	pack_orig_addresses - the .orig_address fields of the PBEs from the
+ *	list starting at @pbe are stored in the array @buf[] (1 page)
  */
 
-static void free_pagedir_entries(void)
+static inline struct pbe *pack_orig_addresses(unsigned long *buf,
+                                              struct pbe *pbe)
 {
-	int i;
+	int j;
 
-	for (i = 0; i < swsusp_info.pagedir_pages; i++)
-		swap_free(swsusp_info.pagedir[i]);
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		buf[j] = pbe->orig_address;
+		pbe = pbe->next;
+	}
+	if (!pbe)
+		for (; j < PAGE_SIZE / sizeof(long); j++)
+			buf[j] = 0;
+	return pbe;
 }
 
-
 /**
- *	write_pagedir - Write the array of pages holding the page directory.
- *	@last:	Last swap entry we write (needed for header).
+ *	save_image_metadata - save the .orig_address fields of the PBEs
+ *	from the list @pblist using the swap map handle @handle
  */
 
-static int write_pagedir(void)
+static int save_image_metadata(struct pbe *pblist,
+                               struct swap_map_handle *handle)
 {
-	int error = 0;
+	unsigned long *buf;
 	unsigned int n = 0;
-	struct pbe *pbe;
+	struct pbe *p;
+	int error = 0;
 
-	printk( "Writing pagedir...");
-	for_each_pb_page (pbe, pagedir_nosave) {
-		if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
-			return error;
+	printk("Saving image metadata ... ");
+	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
+	if (!buf)
+		return -ENOMEM;
+	p = pblist;
+	while (p) {
+		p = pack_orig_addresses(buf, p);
+		error = swap_map_write_page(handle, (unsigned long)buf);
+		if (error)
+			break;
+		n++;
 	}
-
-	swsusp_info.pagedir_pages = n;
-	printk("done (%u pages)\n", n);
+	free_page((unsigned long)buf);
+	if (!error)
+		printk("done (%u pages saved)\n", n);
 	return error;
 }
 
@@ -384,33 +547,48 @@ static int enough_swap(unsigned int nr_pages)
 
 /**
  *	write_suspend_image - Write entire image and metadata.
- *
  */
-static int write_suspend_image(void)
+static int write_suspend_image(struct pbe *pblist, unsigned int nr_pages)
 {
+	struct swap_map_page *swap_map;
+	struct swap_map_handle handle;
 	int error;
 
-	if (!enough_swap(nr_copy_pages)) {
+	if (!enough_swap(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
 		return -ENOSPC;
 	}
 
-	init_header();
-	if ((error = data_write()))
-		goto FreeData;
+	init_header(nr_pages);
+	swap_map = alloc_swap_map(swsusp_info.pages);
+	if (!swap_map)
+		return -ENOMEM;
+	init_swap_map_handle(&handle, swap_map);
 
-	if ((error = write_pagedir()))
-		goto FreePagedir;
+	error = save_image_metadata(pblist, &handle);
+	if (!error)
+		error = save_image_data(pblist, &handle, nr_pages);
+	if (error)
+		goto Free_image_entries;
 
-	if ((error = close_swap()))
-		goto FreePagedir;
- Done:
+	swap_map = reverse_swap_map(swap_map);
+	error = save_swap_map(swap_map, &swsusp_info.start);
+	if (error)
+		goto Free_map_entries;
+
+	error = close_swap();
+	if (error)
+		goto Free_map_entries;
+
+Free_swap_map:
+	free_swap_map(swap_map);
 	return error;
- FreePagedir:
-	free_pagedir_entries();
- FreeData:
-	data_free();
-	goto Done;
+
+Free_map_entries:
+	free_swap_map_entries(swap_map);
+Free_image_entries:
+	free_image_entries(swap_map);
+	goto Free_swap_map;
 }
 
 /* It is important _NOT_ to umount filesystems at this point. We want
@@ -418,7 +596,7 @@ static int write_suspend_image(void)
  * filesystem clean: it is not. (And it does not matter, if we resume
  * correctly, we'll mark system clean, anyway.)
  */
-int swsusp_write(void)
+int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 {
 	int error;
 
@@ -427,14 +605,12 @@ int swsusp_write(void)
 		return error;
 	}
 	lock_swapdevices();
-	error = write_suspend_image();
+	error = write_suspend_image(pblist, nr_pages);
 	/* This will unlock ignored swap devices since writing is finished */
 	lock_swapdevices();
 	return error;
 }
 
-
-
 int swsusp_suspend(void)
 {
 	int error;
@@ -531,7 +707,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
 	/* We assume both lists contain the same number of elements */
 	while (src) {
 		dst->orig_address = src->orig_address;
-		dst->swap_address = src->swap_address;
 		dst = dst->next;
 		src = src->next;
 	}
@@ -611,6 +786,61 @@ static int bio_write_page(pgoff_t page_off, void *page)
 	return submit(WRITE, page_off, page);
 }
 
+/**
+ *	The following functions allow us to read data using a swap map
+ *	in a file-alike way
+ */
+
+static inline void release_swap_map_reader(struct swap_map_handle *handle)
+{
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+}
+
+static inline int get_swap_map_reader(struct swap_map_handle *handle,
+                                      swp_entry_t start)
+{
+	int error;
+
+	if (!swp_offset(start))
+		return -EINVAL;
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+	if (!handle->cur)
+		return -ENOMEM;
+	error = bio_read_page(swp_offset(start), handle->cur);
+	if (error) {
+		release_swap_map_reader(handle);
+		return error;
+	}
+	handle->k = 0;
+	return 0;
+}
+
+static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
+{
+	unsigned long offset;
+	int error;
+
+	if (!handle->cur)
+		return -EINVAL;
+	offset = swp_offset(handle->cur->entries[handle->k]);
+	if (!offset)
+		return -EINVAL;
+	error = bio_read_page(offset, buf);
+	if (error)
+		return error;
+	if (++handle->k >= MAP_PAGE_SIZE) {
+		handle->k = 0;
+		offset = swp_offset(handle->cur->next_swap);
+		if (!offset)
+			release_swap_map_reader(handle);
+		else
+			error = bio_read_page(offset, handle->cur);
+	}
+	return error;
+}
+
 /*
  * Sanity check if this image makes sense with this kernel/swap context
  * I really don't think that it's foolproof but more than nothing..
@@ -639,7 +869,6 @@ static const char *sanity_check(void)
 	return NULL;
 }
 
-
 static int check_header(void)
 {
 	const char *reason = NULL;
@@ -653,7 +882,6 @@ static int check_header(void)
 		printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
 		return -EPERM;
 	}
-	nr_copy_pages = swsusp_info.image_pages;
 	return error;
 }
 
@@ -680,75 +908,88 @@ static int check_sig(void)
 }
 
 /**
- *	data_read - Read image pages from swap.
- *
- *	You do not need to check for overlaps, check_pagedir()
- *	already did that.
+ *	load_image_data - load the image data using the swap map handle
+ *	@handle and store them using the page backup list @pblist
+ *	(assume there are @nr_pages pages to load)
  */
 
-static int data_read(struct pbe *pblist)
+static int load_image_data(struct pbe *pblist,
+                           struct swap_map_handle *handle,
+                           unsigned int nr_pages)
 {
+	int error;
+	unsigned int m;
 	struct pbe *p;
-	int error = 0;
-	int i = 0;
-	int mod = swsusp_info.image_pages / 100;
-
-	if (!mod)
-		mod = 1;
-
-	printk("swsusp: Reading image data (%lu pages):     ",
-			swsusp_info.image_pages);
-
-	for_each_pbe (p, pblist) {
-		if (!(i % mod))
-			printk("\b\b\b\b%3d%%", i / mod);
 
-		if ((error = bio_read_page(swp_offset(p->swap_address),
-						(void *)p->address)))
-			return error;
-
-		i++;
+	if (!pblist)
+		return -EINVAL;
+	printk("Loading image data pages (%u pages) ...     ", nr_pages);
+	m = nr_pages / 100;
+	if (!m)
+		m = 1;
+	nr_pages = 0;
+	p = pblist;
+	while (p) {
+		error = swap_map_read_page(handle, (void *)p->address);
+		if (error)
+			break;
+		p = p->next;
+		if (!(nr_pages % m))
+			printk("\b\b\b\b%3d%%", nr_pages / m);
+		nr_pages++;
 	}
-	printk("\b\b\b\bdone\n");
+	if (!error)
+		printk("\b\b\b\bdone\n");
 	return error;
 }
 
 /**
- *	read_pagedir - Read page backup list pages from swap
+ *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
+ *	the PBEs in the list starting at @pbe
  */
 
-static int read_pagedir(struct pbe *pblist)
+static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
+                                                struct pbe *pbe)
 {
-	struct pbe *pbpage, *p;
-	unsigned int i = 0;
-	int error;
+	int j;
 
-	if (!pblist)
-		return -EFAULT;
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		pbe->orig_address = buf[j];
+		pbe = pbe->next;
+	}
+	return pbe;
+}
 
-	printk("swsusp: Reading pagedir (%lu pages)\n",
-			swsusp_info.pagedir_pages);
+/**
+ *	load_image_metadata - load the image metadata using the swap map
+ *	handle @handle and put them into the PBEs in the list @pblist
+ */
 
-	for_each_pb_page (pbpage, pblist) {
-		unsigned long offset = swp_offset(swsusp_info.pagedir[i++]);
+static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
+{
+	struct pbe *p;
+	unsigned long *buf;
+	unsigned int n = 0;
+	int error = 0;
 
-		error = -EFAULT;
-		if (offset) {
-			p = (pbpage + PB_PAGE_SKIP)->next;
-			error = bio_read_page(offset, (void *)pbpage);
-			(pbpage + PB_PAGE_SKIP)->next = p;
-		}
+	printk("Loading image metadata ... ");
+	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
+	if (!buf)
+		return -ENOMEM;
+	p = pblist;
+	while (p) {
+		error = swap_map_read_page(handle, buf);
 		if (error)
 			break;
+		p = unpack_orig_addresses(buf, p);
+		n++;
 	}
-
+	free_page((unsigned long)buf);
 	if (!error)
-		BUG_ON(i != swsusp_info.pagedir_pages);
-
+		printk("done (%u pages loaded)\n", n);
 	return error;
 }
 
-
 static int check_suspend_image(void)
 {
 	int error = 0;
@@ -762,34 +1003,39 @@ static int check_suspend_image(void)
 	return 0;
 }
 
-static int read_suspend_image(void)
+static int read_suspend_image(struct pbe **pblist_ptr)
 {
 	int error = 0;
-	struct pbe *p;
+	struct pbe *p, *pblist;
+	struct swap_map_handle handle;
+	unsigned int nr_pages = swsusp_info.image_pages;
 
-	if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0)))
+	p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
+	if (!p)
 		return -ENOMEM;
-
-	if ((error = read_pagedir(p)))
+	error = get_swap_map_reader(&handle, swsusp_info.start);
+	if (error)
+		/* The PBE list at p will be released by swsusp_free() */
 		return error;
-	create_pbe_list(p, nr_copy_pages);
-	mark_unsafe_pages(p);
-	pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
-	if (pagedir_nosave) {
-		create_pbe_list(pagedir_nosave, nr_copy_pages);
-		copy_page_backup_list(pagedir_nosave, p);
+	error = load_image_metadata(p, &handle);
+	if (!error) {
+		mark_unsafe_pages(p);
+		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
+		if (pblist)
+			copy_page_backup_list(pblist, p);
+		free_pagedir(p);
+		if (!pblist)
+			error = -ENOMEM;
+
+		/* Allocate memory for the image and read the data from swap */
+		if (!error)
+			error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
+		if (!error)
+			error = load_image_data(pblist, &handle, nr_pages);
+		if (!error)
+			*pblist_ptr = pblist;
 	}
-	free_pagedir(p);
-	if (!pagedir_nosave)
-		return -ENOMEM;
-
-	/* Allocate memory for the image and read the data from swap */
-
-	error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1);
-
-	if (!error)
-		error = data_read(pagedir_nosave);
-
+	release_swap_map_reader(&handle);
 	return error;
 }
 
@@ -821,7 +1067,7 @@ int swsusp_check(void)
  *	swsusp_read - Read saved image from swap.
  */
 
-int swsusp_read(void)
+int swsusp_read(struct pbe **pblist_ptr)
 {
 	int error;
 
@@ -830,7 +1076,7 @@ int swsusp_read(void)
 		return PTR_ERR(resume_bdev);
 	}
 
-	error = read_suspend_image();
+	error = read_suspend_image(pblist_ptr);
 	blkdev_put(resume_bdev);
 
 	if (!error)
-- 
cgit v1.2.3


From 72a97e08394a3b2e75481ff680ec2a0591e3cba4 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:13:46 -0800
Subject: [PATCH] swsusp: improve freeing of memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch makes swsusp free only as much memory as needed to complete the
suspend and not as much as possible.   In the most of cases this should speed
up the suspend and make the system much more responsive after resume,
especially if a GUI (eg.  X Windows) is used.

If needed, the old behavior (ie to free as much memory as possible during
suspend) can be restored by unsetting FAST_FREE in power.h

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c     | 30 +++--------------------
 kernel/power/power.h    | 14 ++++++++---
 kernel/power/snapshot.c | 65 +++++++++++++++++++++++++++++++++++++++++++++----
 kernel/power/swsusp.c   | 52 ++++++++++++++++++++++++++++++++++++++-
 4 files changed, 125 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 76a5131b0e80..9e51cdf7b78d 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -24,6 +24,7 @@
 
 extern suspend_disk_method_t pm_disk_mode;
 
+extern int swsusp_shrink_memory(void);
 extern int swsusp_suspend(void);
 extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
 extern int swsusp_check(void);
@@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode)
 static int in_suspend __nosavedata = 0;
 
 
-/**
- *	free_some_memory -  Try to free as much memory as possible
- *
- *	... but do not OOM-kill anyone
- *
- *	Notice: all userland should be stopped at this point, or
- *	livelock is possible.
- */
-
-static void free_some_memory(void)
-{
-	unsigned int i = 0;
-	unsigned int tmp;
-	unsigned long pages = 0;
-	char *p = "-\\|/";
-
-	printk("Freeing memory...  ");
-	while ((tmp = shrink_all_memory(10000))) {
-		pages += tmp;
-		printk("\b%c", p[i++ % 4]);
-	}
-	printk("\bdone (%li pages freed)\n", pages);
-}
-
-
 static inline void platform_finish(void)
 {
 	if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -127,8 +103,8 @@ static int prepare_processes(void)
 	}
 
 	/* Free memory before shutting down devices. */
-	free_some_memory();
-	return 0;
+	if (!(error = swsusp_shrink_memory()))
+		return 0;
 thaw:
 	thaw_processes();
 	enable_nonboot_cpus();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 977877c6dcfc..acdc83b3d890 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -49,18 +49,26 @@ extern void thaw_processes(void);
 extern int pm_prepare_console(void);
 extern void pm_restore_console(void);
 
-
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
 extern unsigned int nr_copy_pages;
-extern suspend_pagedir_t *pagedir_nosave;
-extern suspend_pagedir_t *pagedir_save;
+extern struct pbe *pagedir_nosave;
+
+/*
+ * This compilation switch determines the way in which memory will be freed
+ * during suspend.  If defined, only as much memory will be freed as needed
+ * to complete the suspend, which will make it go faster.  Otherwise, the
+ * largest possible amount of memory will be freed.
+ */
+#define FAST_FREE	1
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
+extern unsigned int count_data_pages(void);
 extern void free_pagedir(struct pbe *pblist);
+extern void release_eaten_pages(void);
 extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void swsusp_free(void);
 extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 152d56cdf017..e80d282dbf58 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -37,6 +37,31 @@ struct pbe *pagedir_nosave;
 unsigned int nr_copy_pages;
 
 #ifdef CONFIG_HIGHMEM
+unsigned int count_highmem_pages(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	unsigned int n = 0;
+
+	for_each_zone (zone)
+		if (is_highmem(zone)) {
+			mark_free_pages(zone);
+			for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
+				struct page *page;
+				unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+				if (!pfn_valid(pfn))
+					continue;
+				page = pfn_to_page(pfn);
+				if (PageReserved(page))
+					continue;
+				if (PageNosaveFree(page))
+					continue;
+				n++;
+			}
+		}
+	return n;
+}
+
 struct highmem_page {
 	char *data;
 	struct page *page;
@@ -152,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn)
 	BUG_ON(PageReserved(page) && PageNosave(page));
 	if (PageNosave(page))
 		return 0;
-	if (PageReserved(page) && pfn_is_nosave(pfn)) {
-		pr_debug("[nosave pfn 0x%lx]", pfn);
+	if (PageReserved(page) && pfn_is_nosave(pfn))
 		return 0;
-	}
 	if (PageNosaveFree(page))
 		return 0;
 
 	return 1;
 }
 
-static unsigned count_data_pages(void)
+unsigned int count_data_pages(void)
 {
 	struct zone *zone;
 	unsigned long zone_pfn;
@@ -266,6 +289,35 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 	}
 }
 
+/**
+ *	On resume it is necessary to trace and eventually free the unsafe
+ *	pages that have been allocated, because they are needed for I/O
+ *	(on x86-64 we likely will "eat" these pages once again while
+ *	creating the temporary page translation tables)
+ */
+
+struct eaten_page {
+	struct eaten_page *next;
+	char padding[PAGE_SIZE - sizeof(void *)];
+};
+
+static struct eaten_page *eaten_pages = NULL;
+
+void release_eaten_pages(void)
+{
+	struct eaten_page *p, *q;
+
+	p = eaten_pages;
+	while (p) {
+		q = p->next;
+		/* We don't want swsusp_free() to free this page again */
+		ClearPageNosave(virt_to_page(p));
+		free_page((unsigned long)p);
+		p = q;
+	}
+	eaten_pages = NULL;
+}
+
 /**
  *	@safe_needed - on resume, for storing the PBE list and the image,
  *	we can only use memory pages that do not conflict with the pages
@@ -284,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 	if (safe_needed)
 		do {
 			res = (void *)get_zeroed_page(gfp_mask);
-			if (res && PageNosaveFree(virt_to_page(res)))
+			if (res && PageNosaveFree(virt_to_page(res))) {
 				/* This is for swsusp_free() */
 				SetPageNosave(virt_to_page(res));
+				((struct eaten_page *)res)->next = eaten_pages;
+				eaten_pages = res;
+			}
 		} while (res && PageNosaveFree(virt_to_page(res)));
 	else
 		res = (void *)get_zeroed_page(gfp_mask);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index b09bd7c0998d..f77f9397a364 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -70,11 +70,13 @@
 #include "power.h"
 
 #ifdef CONFIG_HIGHMEM
+unsigned int count_highmem_pages(void);
 int save_highmem(void);
 int restore_highmem(void);
 #else
 static int save_highmem(void) { return 0; }
 static int restore_highmem(void) { return 0; }
+static unsigned int count_highmem_pages(void) { return 0; }
 #endif
 
 extern char resume_file[];
@@ -611,6 +613,52 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 	return error;
 }
 
+/**
+ *	swsusp_shrink_memory -  Try to free as much memory as needed
+ *
+ *	... but do not OOM-kill anyone
+ *
+ *	Notice: all userland should be stopped before it is called, or
+ *	livelock is possible.
+ */
+
+#define SHRINK_BITE	10000
+
+int swsusp_shrink_memory(void)
+{
+	long tmp;
+	struct zone *zone;
+	unsigned long pages = 0;
+	unsigned int i = 0;
+	char *p = "-\\|/";
+
+	printk("Shrinking memory...  ");
+	do {
+#ifdef FAST_FREE
+		tmp = 2 * count_highmem_pages();
+		tmp += tmp / 50 + count_data_pages();
+		tmp += (tmp + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
+			PAGES_FOR_IO;
+		for_each_zone (zone)
+			if (!is_highmem(zone))
+				tmp -= zone->free_pages;
+		if (tmp > 0) {
+			tmp = shrink_all_memory(SHRINK_BITE);
+			if (!tmp)
+				return -ENOMEM;
+			pages += tmp;
+		}
+#else
+		tmp = shrink_all_memory(SHRINK_BITE);
+		pages += tmp;
+#endif
+		printk("\b%c", p[i++%4]);
+	} while (tmp > 0);
+	printk("\bdone (%lu pages freed)\n", pages);
+
+	return 0;
+}
+
 int swsusp_suspend(void)
 {
 	int error;
@@ -1030,8 +1078,10 @@ static int read_suspend_image(struct pbe **pblist_ptr)
 		/* Allocate memory for the image and read the data from swap */
 		if (!error)
 			error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
-		if (!error)
+		if (!error) {
+			release_eaten_pages();
 			error = load_image_data(pblist, &handle, nr_pages);
+		}
 		if (!error)
 			*pblist_ptr = pblist;
 	}
-- 
cgit v1.2.3


From e5e2fa7857f6bf46605c77d949fa6698b9b0bc28 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:14:20 -0800
Subject: [PATCH] swsusp: fix enough_free_mem

This patch fixes a problem with the function enough_free_mem() used by
swsusp to verify if there is a sufficient number of memory pages available
to it to create and save the suspend image.

Namely, enough_free_mem() uses nr_free_pages() to obtain the number of free
memory pages, which is incorrect, because this function returns the total
number of free pages, including free highmem pages, and the highmem pages
cannot be used by swsusp for storing the image data.

The patch makes enough_free_mem() avoid counting the free highmem
pages as available to swsusp.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index e80d282dbf58..41f66365f0d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -428,8 +428,14 @@ void swsusp_free(void)
 
 static int enough_free_mem(unsigned int nr_pages)
 {
-	pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
-	return nr_free_pages() > (nr_pages + PAGES_FOR_IO +
+	struct zone *zone;
+	unsigned int n = 0;
+
+	for_each_zone (zone)
+		if (!is_highmem(zone))
+			n += zone->free_pages;
+	pr_debug("swsusp: available memory: %u pages\n", n);
+	return n > (nr_pages + PAGES_FOR_IO +
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
-- 
cgit v1.2.3


From c050ca78705592d440c22055865bf4de40fe2a4c Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Fri, 6 Jan 2006 00:15:21 -0800
Subject: [PATCH] swsusp: Drop duplicate prototypes

These two prototypes are already present in sched.h, remove duplicate
version.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index acdc83b3d890..e521e61e0d95 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -43,9 +43,6 @@ static struct subsys_attribute _name##_attr = {	\
 
 extern struct subsystem power_subsys;
 
-extern int freeze_processes(void);
-extern void thaw_processes(void);
-
 extern int pm_prepare_console(void);
 extern void pm_restore_console(void);
 
-- 
cgit v1.2.3


From b3a93a255ec33a04776ec50efb30b7a99168dda2 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:15:22 -0800
Subject: [PATCH] swsusp: limit image size

Limit the size of the suspend image to approx.  500 MB, which should
improve the overall performance of swsusp on systems with more than 1 GB of
RAM.

It introduces the constant IMAGE_SIZE that can be set to the preferred size
of the image (in MB) and modifies the memory-shrinking part of swsusp to
take this constant into account (500 is the default value of IMAGE_SIZE).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h  |  8 +++-----
 kernel/power/swsusp.c | 17 ++++++++---------
 2 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index e521e61e0d95..9b0459903613 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,12 +53,10 @@ extern unsigned int nr_copy_pages;
 extern struct pbe *pagedir_nosave;
 
 /*
- * This compilation switch determines the way in which memory will be freed
- * during suspend.  If defined, only as much memory will be freed as needed
- * to complete the suspend, which will make it go faster.  Otherwise, the
- * largest possible amount of memory will be freed.
+ * Preferred image size in MB (set it to zero to get the smallest
+ * image possible)
  */
-#define FAST_FREE	1
+#define IMAGE_SIZE	500
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f77f9397a364..6d5ceaf4c364 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -626,7 +626,7 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 
 int swsusp_shrink_memory(void)
 {
-	long tmp;
+	long size, tmp;
 	struct zone *zone;
 	unsigned long pages = 0;
 	unsigned int i = 0;
@@ -634,11 +634,11 @@ int swsusp_shrink_memory(void)
 
 	printk("Shrinking memory...  ");
 	do {
-#ifdef FAST_FREE
-		tmp = 2 * count_highmem_pages();
-		tmp += tmp / 50 + count_data_pages();
-		tmp += (tmp + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
+		size = 2 * count_highmem_pages();
+		size += size / 50 + count_data_pages();
+		size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
 			PAGES_FOR_IO;
+		tmp = size;
 		for_each_zone (zone)
 			if (!is_highmem(zone))
 				tmp -= zone->free_pages;
@@ -647,11 +647,10 @@ int swsusp_shrink_memory(void)
 			if (!tmp)
 				return -ENOMEM;
 			pages += tmp;
+		} else if (size > (IMAGE_SIZE * 1024 * 1024) / PAGE_SIZE) {
+			tmp = shrink_all_memory(SHRINK_BITE);
+			pages += tmp;
 		}
-#else
-		tmp = shrink_all_memory(SHRINK_BITE);
-		pages += tmp;
-#endif
 		printk("\b%c", p[i++%4]);
 	} while (tmp > 0);
 	printk("\bdone (%lu pages freed)\n", pages);
-- 
cgit v1.2.3


From ca0aec0f7a94bf9f07fefa8bfd23282d4e8ceb8a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:15:56 -0800
Subject: [PATCH] swsusp: make image size limit tunable

Make the suspend image size limit tunable via /sys/power/image_size.

It is necessary for systems on which there is a limited amount of swap
available for suspend.  It can also be useful for optimizing performance of
swsusp on systems with 1 GB of RAM or more.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c   | 20 ++++++++++++++++++++
 kernel/power/power.h  |  7 ++-----
 kernel/power/swsusp.c | 10 +++++++++-
 3 files changed, 31 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 9e51cdf7b78d..e24446f8d8cd 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -365,9 +365,29 @@ out:
 
 power_attr(resume);
 
+static ssize_t image_size_show(struct subsystem * subsys, char *buf)
+{
+	return sprintf(buf, "%u\n", image_size);
+}
+
+static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+	unsigned int size;
+
+	if (sscanf(buf, "%u", &size) == 1) {
+		image_size = size;
+		return n;
+	}
+
+	return -EINVAL;
+}
+
+power_attr(image_size);
+
 static struct attribute * g[] = {
 	&disk_attr.attr,
 	&resume_attr.attr,
+	&image_size_attr.attr,
 	NULL,
 };
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9b0459903613..273a5b1d70be 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -52,11 +52,8 @@ extern const void __nosave_begin, __nosave_end;
 extern unsigned int nr_copy_pages;
 extern struct pbe *pagedir_nosave;
 
-/*
- * Preferred image size in MB (set it to zero to get the smallest
- * image possible)
- */
-#define IMAGE_SIZE	500
+/* Preferred image size in MB (default 500) */
+extern unsigned int image_size;
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 6d5ceaf4c364..d760a6a719f0 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -69,6 +69,14 @@
 
 #include "power.h"
 
+/*
+ * Preferred image size in MB (tunable via /sys/power/image_size).
+ * When it is set to N, swsusp will do its best to ensure the image
+ * size will not exceed N MB, but if that is impossible, it will
+ * try to create the smallest image possible.
+ */
+unsigned int image_size = 500;
+
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void);
 int save_highmem(void);
@@ -647,7 +655,7 @@ int swsusp_shrink_memory(void)
 			if (!tmp)
 				return -ENOMEM;
 			pages += tmp;
-		} else if (size > (IMAGE_SIZE * 1024 * 1024) / PAGE_SIZE) {
+		} else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) {
 			tmp = shrink_all_memory(SHRINK_BITE);
 			pages += tmp;
 		}
-- 
cgit v1.2.3


From 1adf6c8ea916bc4a2587a881ec7715fece63fb5e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:17:16 -0800
Subject: [PATCH] swsusp: improve handling of swap partitions

This changes the handling of swap partitions by swsusp to avoid locking of the
swap devices that are not used for suspend and, consequently, simplifies the
code.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 128 ++++++++++++++------------------------------------
 1 file changed, 36 insertions(+), 92 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index d760a6a719f0..0479c9be7d71 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -104,13 +104,7 @@ static struct swsusp_info swsusp_info;
  * Saving part...
  */
 
-/* We memorize in swapfile_used what swap devices are used for suspension */
-#define SWAPFILE_UNUSED    0
-#define SWAPFILE_SUSPEND   1	/* This is the suspending device */
-#define SWAPFILE_IGNORED   2	/* Those are other swap devices ignored for suspension */
-
-static unsigned short swapfile_used[MAX_SWAPFILES];
-static unsigned short root_swap;
+static unsigned short root_swap = 0xffff;
 
 static int mark_swapfiles(swp_entry_t prev)
 {
@@ -146,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
  * devfs, since the resume code can only recognize the form /dev/hda4,
  * but the suspend code would see the long name.)
  */
-static int is_resume_device(const struct swap_info_struct *swap_info)
+static inline int is_resume_device(const struct swap_info_struct *swap_info)
 {
 	struct file *file = swap_info->swap_file;
 	struct inode *inode = file->f_dentry->d_inode;
@@ -156,55 +150,23 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
 }
 
 static int swsusp_swap_check(void) /* This is called before saving image */
-{
-	int i, len;
-
-	len=strlen(resume_file);
-	root_swap = 0xFFFF;
-
-	spin_lock(&swap_lock);
-	for (i=0; i<MAX_SWAPFILES; i++) {
-		if (!(swap_info[i].flags & SWP_WRITEOK)) {
-			swapfile_used[i]=SWAPFILE_UNUSED;
-		} else {
-			if (!len) {
-	    			printk(KERN_WARNING "resume= option should be used to set suspend device" );
-				if (root_swap == 0xFFFF) {
-					swapfile_used[i] = SWAPFILE_SUSPEND;
-					root_swap = i;
-				} else
-					swapfile_used[i] = SWAPFILE_IGNORED;
-			} else {
-	  			/* we ignore all swap devices that are not the resume_file */
-				if (is_resume_device(&swap_info[i])) {
-					swapfile_used[i] = SWAPFILE_SUSPEND;
-					root_swap = i;
-				} else {
-				  	swapfile_used[i] = SWAPFILE_IGNORED;
-				}
-			}
-		}
-	}
-	spin_unlock(&swap_lock);
-	return (root_swap != 0xffff) ? 0 : -ENODEV;
-}
-
-/**
- * This is called after saving image so modification
- * will be lost after resume... and that's what we want.
- * we make the device unusable. A new call to
- * lock_swapdevices can unlock the devices.
- */
-static void lock_swapdevices(void)
 {
 	int i;
 
+	if (!swsusp_resume_device)
+		return -ENODEV;
 	spin_lock(&swap_lock);
-	for (i = 0; i< MAX_SWAPFILES; i++)
-		if (swapfile_used[i] == SWAPFILE_IGNORED) {
-			swap_info[i].flags ^= SWP_WRITEOK;
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		if (!(swap_info[i].flags & SWP_WRITEOK))
+			continue;
+		if (is_resume_device(swap_info + i)) {
+			spin_unlock(&swap_lock);
+			root_swap = i;
+			return 0;
 		}
+	}
 	spin_unlock(&swap_lock);
+	return -ENODEV;
 }
 
 /**
@@ -222,19 +184,14 @@ static void lock_swapdevices(void)
 static int write_page(unsigned long addr, swp_entry_t *loc)
 {
 	swp_entry_t entry;
-	int error = 0;
+	int error = -ENOSPC;
 
-	entry = get_swap_page();
-	if (swp_offset(entry) &&
-	    swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
-		error = rw_swap_page_sync(WRITE, entry,
-					  virt_to_page(addr));
-		if (error == -EIO)
-			error = 0;
-		if (!error)
+	entry = get_swap_page_of_type(root_swap);
+	if (swp_offset(entry)) {
+		error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
+		if (!error || error == -EIO)
 			*loc = entry;
-	} else
-		error = -ENOSPC;
+	}
 	return error;
 }
 
@@ -539,31 +496,38 @@ static int save_image_metadata(struct pbe *pblist,
  *	enough_swap - Make sure we have enough swap to save the image.
  *
  *	Returns TRUE or FALSE after checking the total amount of swap
- *	space avaiable.
- *
- *	FIXME: si_swapinfo(&i) returns all swap devices information.
- *	We should only consider resume_device.
+ *	space avaiable from the resume partition.
  */
 
 static int enough_swap(unsigned int nr_pages)
 {
-	struct sysinfo i;
+	unsigned int free_swap = swap_info[root_swap].pages -
+		swap_info[root_swap].inuse_pages;
 
-	si_swapinfo(&i);
-	pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
-	return i.freeswap > (nr_pages + PAGES_FOR_IO +
+	pr_debug("swsusp: free swap pages: %u\n", free_swap);
+	return free_swap > (nr_pages + PAGES_FOR_IO +
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
 /**
- *	write_suspend_image - Write entire image and metadata.
+ *	swsusp_write - Write entire image and metadata.
+ *
+ *	It is important _NOT_ to umount filesystems at this point. We want
+ *	them synced (in case something goes wrong) but we DO not want to mark
+ *	filesystem clean: it is not. (And it does not matter, if we resume
+ *	correctly, we'll mark system clean, anyway.)
  */
-static int write_suspend_image(struct pbe *pblist, unsigned int nr_pages)
+
+int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 {
 	struct swap_map_page *swap_map;
 	struct swap_map_handle handle;
 	int error;
 
+	if ((error = swsusp_swap_check())) {
+		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
+		return error;
+	}
 	if (!enough_swap(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
 		return -ENOSPC;
@@ -601,26 +565,6 @@ Free_image_entries:
 	goto Free_swap_map;
 }
 
-/* It is important _NOT_ to umount filesystems at this point. We want
- * them synced (in case something goes wrong) but we DO not want to mark
- * filesystem clean: it is not. (And it does not matter, if we resume
- * correctly, we'll mark system clean, anyway.)
- */
-int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
-{
-	int error;
-
-	if ((error = swsusp_swap_check())) {
-		printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
-		return error;
-	}
-	lock_swapdevices();
-	error = write_suspend_image(pblist, nr_pages);
-	/* This will unlock ignored swap devices since writing is finished */
-	lock_swapdevices();
-	return error;
-}
-
 /**
  *	swsusp_shrink_memory -  Try to free as much memory as needed
  *
-- 
cgit v1.2.3


From 277c6e2ad7369558dbd7ffbcc6dcbe16458bf723 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:17:58 -0800
Subject: [PATCH] swsusp: save image header first
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This makes the swsusp_info structure become the header of the image in the
literal sense (ie.  it is saved to the swap and read before any other image
data with the help of the swsusp's swap map structure, so generally it is
treated in the same way as the rest of the image).

The main thing it does is to make swsusp_header contain the offset of the swap
map used to track the image data pages rather than the offset of swsusp_info.
 Simultaneously, swsusp_info becomes the first image page written to the swap.

The other changes are generally consequences of the above with a few
exceptions (there's some consolidation in the image reading part as a few
functions turn into trivial wrappers around something else).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h  |   1 -
 kernel/power/swsusp.c | 190 +++++++++++++++++---------------------------------
 2 files changed, 65 insertions(+), 126 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 273a5b1d70be..7e8492fd1423 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -16,7 +16,6 @@ struct swsusp_info {
 	int			cpus;
 	unsigned long		image_pages;
 	unsigned long		pages;
-	swp_entry_t		start;
 } __attribute__((aligned(PAGE_SIZE)));
 
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0479c9be7d71..55a18d26abed 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -93,7 +93,7 @@ extern char resume_file[];
 
 static struct swsusp_header {
 	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
-	swp_entry_t swsusp_info;
+	swp_entry_t image;
 	char	orig_sig[10];
 	char	sig[10];
 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
@@ -106,7 +106,7 @@ static struct swsusp_info swsusp_info;
 
 static unsigned short root_swap = 0xffff;
 
-static int mark_swapfiles(swp_entry_t prev)
+static int mark_swapfiles(swp_entry_t start)
 {
 	int error;
 
@@ -117,7 +117,7 @@ static int mark_swapfiles(swp_entry_t prev)
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
-		swsusp_header.swsusp_info = prev;
+		swsusp_header.image = start;
 		error = rw_swap_page_sync(WRITE,
 					  swp_entry(root_swap, 0),
 					  virt_to_page((unsigned long)
@@ -423,22 +423,7 @@ static void init_header(unsigned int nr_pages)
 	swsusp_info.cpus = num_online_cpus();
 	swsusp_info.image_pages = nr_pages;
 	swsusp_info.pages = nr_pages +
-		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT);
-}
-
-static int close_swap(void)
-{
-	swp_entry_t entry;
-	int error;
-
-	dump_info();
-	error = write_page((unsigned long)&swsusp_info, &entry);
-	if (!error) {
-		printk( "S" );
-		error = mark_swapfiles(entry);
-		printk( "|\n" );
-	}
-	return error;
+		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
 }
 
 /**
@@ -522,6 +507,7 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 {
 	struct swap_map_page *swap_map;
 	struct swap_map_handle handle;
+	swp_entry_t start;
 	int error;
 
 	if ((error = swsusp_swap_check())) {
@@ -539,18 +525,23 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 		return -ENOMEM;
 	init_swap_map_handle(&handle, swap_map);
 
-	error = save_image_metadata(pblist, &handle);
+	error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
+	if (!error)
+		error = save_image_metadata(pblist, &handle);
 	if (!error)
 		error = save_image_data(pblist, &handle, nr_pages);
 	if (error)
 		goto Free_image_entries;
 
 	swap_map = reverse_swap_map(swap_map);
-	error = save_swap_map(swap_map, &swsusp_info.start);
+	error = save_swap_map(swap_map, &start);
 	if (error)
 		goto Free_map_entries;
 
-	error = close_swap();
+	dump_info();
+	printk( "S" );
+	error = mark_swapfiles(start);
+	printk( "|\n" );
 	if (error)
 		goto Free_map_entries;
 
@@ -840,70 +831,28 @@ static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
 	return error;
 }
 
-/*
- * Sanity check if this image makes sense with this kernel/swap context
- * I really don't think that it's foolproof but more than nothing..
- */
-
-static const char *sanity_check(void)
+static int check_header(void)
 {
+	char *reason = NULL;
+
 	dump_info();
 	if (swsusp_info.version_code != LINUX_VERSION_CODE)
-		return "kernel version";
+		reason = "kernel version";
 	if (swsusp_info.num_physpages != num_physpages)
-		return "memory size";
+		reason = "memory size";
 	if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
-		return "system type";
+		reason = "system type";
 	if (strcmp(swsusp_info.uts.release,system_utsname.release))
-		return "kernel release";
+		reason = "kernel release";
 	if (strcmp(swsusp_info.uts.version,system_utsname.version))
-		return "version";
+		reason = "version";
 	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
-		return "machine";
-#if 0
-	/* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
-	if (swsusp_info.cpus != num_possible_cpus())
-		return "number of cpus";
-#endif
-	return NULL;
-}
-
-static int check_header(void)
-{
-	const char *reason = NULL;
-	int error;
-
-	if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
-		return error;
-
- 	/* Is this same machine? */
-	if ((reason = sanity_check())) {
-		printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
+		reason = "machine";
+	if (reason) {
+		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
 		return -EPERM;
 	}
-	return error;
-}
-
-static int check_sig(void)
-{
-	int error;
-
-	memset(&swsusp_header, 0, sizeof(swsusp_header));
-	if ((error = bio_read_page(0, &swsusp_header)))
-		return error;
-	if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
-		memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
-
-		/*
-		 * Reset swap signature now.
-		 */
-		error = bio_write_page(0, &swsusp_header);
-	} else {
-		return -EINVAL;
-	}
-	if (!error)
-		pr_debug("swsusp: Signature found, resuming\n");
-	return error;
+	return 0;
 }
 
 /**
@@ -989,33 +938,29 @@ static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handl
 	return error;
 }
 
-static int check_suspend_image(void)
-{
-	int error = 0;
-
-	if ((error = check_sig()))
-		return error;
-
-	if ((error = check_header()))
-		return error;
-
-	return 0;
-}
-
-static int read_suspend_image(struct pbe **pblist_ptr)
+int swsusp_read(struct pbe **pblist_ptr)
 {
-	int error = 0;
+	int error;
 	struct pbe *p, *pblist;
 	struct swap_map_handle handle;
-	unsigned int nr_pages = swsusp_info.image_pages;
+	unsigned int nr_pages;
 
+	if (IS_ERR(resume_bdev)) {
+		pr_debug("swsusp: block device not initialised\n");
+		return PTR_ERR(resume_bdev);
+	}
+
+	error = get_swap_map_reader(&handle, swsusp_header.image);
+	if (!error)
+		error = swap_map_read_page(&handle, &swsusp_info);
+	if (!error)
+		error = check_header();
+	if (error)
+		return error;
+	nr_pages = swsusp_info.image_pages;
 	p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
 	if (!p)
 		return -ENOMEM;
-	error = get_swap_map_reader(&handle, swsusp_info.start);
-	if (error)
-		/* The PBE list at p will be released by swsusp_free() */
-		return error;
 	error = load_image_metadata(p, &handle);
 	if (!error) {
 		mark_unsafe_pages(p);
@@ -1037,11 +982,18 @@ static int read_suspend_image(struct pbe **pblist_ptr)
 			*pblist_ptr = pblist;
 	}
 	release_swap_map_reader(&handle);
+
+	blkdev_put(resume_bdev);
+
+	if (!error)
+		pr_debug("swsusp: Reading resume file was successful\n");
+	else
+		pr_debug("swsusp: Error %d resuming\n", error);
 	return error;
 }
 
 /**
- *      swsusp_check - Check for saved image in swap
+ *      swsusp_check - Check for swsusp signature in the resume device
  */
 
 int swsusp_check(void)
@@ -1051,39 +1003,27 @@ int swsusp_check(void)
 	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
 	if (!IS_ERR(resume_bdev)) {
 		set_blocksize(resume_bdev, PAGE_SIZE);
-		error = check_suspend_image();
+		memset(&swsusp_header, 0, sizeof(swsusp_header));
+		if ((error = bio_read_page(0, &swsusp_header)))
+			return error;
+		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
+			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+			/* Reset swap signature now */
+			error = bio_write_page(0, &swsusp_header);
+		} else {
+			return -EINVAL;
+		}
 		if (error)
-		    blkdev_put(resume_bdev);
-	} else
+			blkdev_put(resume_bdev);
+		else
+			pr_debug("swsusp: Signature found, resuming\n");
+	} else {
 		error = PTR_ERR(resume_bdev);
-
-	if (!error)
-		pr_debug("swsusp: resume file found\n");
-	else
-		pr_debug("swsusp: Error %d check for resume file\n", error);
-	return error;
-}
-
-/**
- *	swsusp_read - Read saved image from swap.
- */
-
-int swsusp_read(struct pbe **pblist_ptr)
-{
-	int error;
-
-	if (IS_ERR(resume_bdev)) {
-		pr_debug("swsusp: block device not initialised\n");
-		return PTR_ERR(resume_bdev);
 	}
 
-	error = read_suspend_image(pblist_ptr);
-	blkdev_put(resume_bdev);
+	if (error)
+		pr_debug("swsusp: Error %d check for resume file\n", error);
 
-	if (!error)
-		pr_debug("swsusp: Reading resume file was successful\n");
-	else
-		pr_debug("swsusp: Error %d resuming\n", error);
 	return error;
 }
 
-- 
cgit v1.2.3


From 089545f0c71bab6511395c2a060d7f81a99bad58 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:12 -0800
Subject: [PATCH] s390: cputime_t fixes

There are some more places where the use of cputime_t instead of an integer
type and the associated macros is necessary for the virtual cputime accounting
on s390.  Affected are the s390 specific appldata code and BSD process
accounting.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 6312d6bd43e3..38d57fa6b78f 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -427,6 +427,7 @@ static void do_acct_process(long exitcode, struct file *file)
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
+	unsigned long jiffies;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -467,12 +468,12 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(
-					    current->signal->utime +
-					    current->group_leader->utime));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(
-					    current->signal->stime +
-					    current->group_leader->stime));
+	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
+						 current->signal->utime));
+	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
+	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
+						 current->signal->stime));
+	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
@@ -580,7 +581,8 @@ void acct_process(long exitcode)
 void acct_update_integrals(struct task_struct *tsk)
 {
 	if (likely(tsk->mm)) {
-		long delta = tsk->stime - tsk->acct_stimexpd;
+		long delta =
+			cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
 
 		if (delta == 0)
 			return;
-- 
cgit v1.2.3


From 347a8dc3b815f0c0fa62a1df075184ffe4cbdcf1 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:28 -0800
Subject: [PATCH] s390: cleanup Kconfig

Sanitize some s390 Kconfig options.  We have ARCH_S390, ARCH_S390X,
ARCH_S390_31, 64BIT, S390_SUPPORT and COMPAT.  Replace these 6 options by
S390, 64BIT and COMPAT.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/panic.c  | 4 ++--
 kernel/sysctl.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index aabc5f86fa3f..c5c4ab255834 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 	long i;
 	static char buf[1024];
 	va_list args;
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
         unsigned long caller = (unsigned long) __builtin_return_address(0);
 #endif
 
@@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 		printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
 	}
 #endif
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
         disabled_wait(caller);
 #endif
 	local_irq_enable();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 345f4a1d533f..a85047bb5739 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -108,7 +108,7 @@ extern int pwrsw_enabled;
 extern int unaligned_enabled;
 #endif
 
-#ifdef CONFIG_ARCH_S390
+#ifdef CONFIG_S390
 #ifdef CONFIG_MATHEMU
 extern int sysctl_ieee_emulation_warnings;
 #endif
@@ -542,7 +542,7 @@ static ctl_table kern_table[] = {
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
-#ifdef CONFIG_ARCH_S390
+#ifdef CONFIG_S390
 #ifdef CONFIG_MATHEMU
 	{
 		.ctl_name	= KERN_IEEE_EMULATION_WARNINGS,
@@ -644,7 +644,7 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
 	{
 		.ctl_name	= KERN_SPIN_RETRY,
 		.procname	= "spin_retry",
-- 
cgit v1.2.3


From 6fe2e70bbed3995d930f39452fb6ce3be7dc47dc Mon Sep 17 00:00:00 2001
From: Jayachandran C <c.jayachandran@gmail.com>
Date: Fri, 6 Jan 2006 00:19:54 -0800
Subject: [PATCH] kernel/module.c: removed dead code

This patch fixes an issue reported by Coverity in kernel/module.c

Error reported: Cannot reach this line of code "else return ptr;"

Patch description:
  This is the error path, so 'err' will be negative, the else case
  is not required, this patch removes it.

Signed-off-by: Jayachandran C. <c.jayachandran@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 2ea929d51ad0..4b06bbad49c2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1854,8 +1854,7 @@ static struct module *load_module(void __user *umod,
 	kfree(args);
  free_hdr:
 	vfree(hdr);
-	if (err < 0) return ERR_PTR(err);
-	else return ptr;
+	return ERR_PTR(err);
 
  truncated:
 	printk(KERN_ERR "Module len %lu truncated\n", len);
-- 
cgit v1.2.3


From 0aec63e67c69545ca757a73a66f5dcf05fa484bf Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 6 Jan 2006 15:36:48 -0800
Subject: [PATCH] Fix posix-cpu-timers sched_time accumulation

I've spent the past 3 days digging into a glibc testsuite failure in
current CVS, specifically libc/rt/tst-cputimer1.c The thr1 and thr2
timers fire too early in the second pass of this test.  The second
pass is noteworthy because it makes use of intervals, whereas the
first pass does not.

All throughout the posix-cpu-timers.c code, the calculation of the
process sched_time sum is implemented roughly as:

	unsigned long long sum;

	sum = tsk->signal->sched_time;
	t = tsk;
	do {
		sum += t->sched_time;
		t = next_thread(t);
	} while (t != tsk);

In fact this is the exact scheme used by check_process_timers().

In the case of check_process_timers(), current->sched_time has just
been updated (via scheduler_tick(), which is invoked by
update_process_times(), which subsequently invokes
run_posix_cpu_timers()) So there is no special processing necessary
wrt. that.

In other contexts, we have to allot for the fact that tsk->sched_time
might be a bit out of date if we are current.  And the
posix-cpu-timers.c code uses current_sched_time() to deal with that.

Unfortunately it does so in an erroneous and inconsistent manner in
one spot which is what results in the early timer firing.

In cpu_clock_sample_group_locked(), it does this:

		cpu->sched = p->signal->sched_time;
		/* Add in each other live thread.  */
		while ((t = next_thread(t)) != p) {
			cpu->sched += t->sched_time;
		}
		if (p->tgid == current->tgid) {
			/*
			 * We're sampling ourselves, so include the
			 * cycles not yet banked.  We still omit
			 * other threads running on other CPUs,
			 * so the total can always be behind as
			 * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
			 */
			cpu->sched += current_sched_time(current);
		} else {
			cpu->sched += p->sched_time;
		}

The problem is the "p->tgid == current->tgid" test.  If "p" is
not current, and the tgids are the same, we will add the process
t->sched_time twice into cpu->sched and omit "p"'s sched_time
which is very very very wrong.

posix-cpu-timers.c has a helper function, sched_ns(p) which takes care
of this, so my fix is to use that here instead of this special tgid
test.

The fact that current can be one of the sub-threads of "p" points out
that we could make things a little bit more accurate, perhaps by using
sched_ns() on every thread we process in these loops.  It also points
out that we don't use the most accurate value for threads in the group
actively running other cpus (and this is mentioned in the comment).

But that is a future enhancement, and this fix here definitely makes
sense.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index cae4f5728997..4c68edff900b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -238,18 +238,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
 		while ((t = next_thread(t)) != p) {
 			cpu->sched += t->sched_time;
 		}
-		if (p->tgid == current->tgid) {
-			/*
-			 * We're sampling ourselves, so include the
-			 * cycles not yet banked.  We still omit
-			 * other threads running on other CPUs,
-			 * so the total can always be behind as
-			 * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
-			 */
-			cpu->sched += current_sched_time(current);
-		} else {
-			cpu->sched += p->sched_time;
-		}
+		cpu->sched += sched_ns(p);
 		break;
 	}
 	return 0;
-- 
cgit v1.2.3