From b3e55c727ff7349c5db722fbdb8d99a151e8e0bf Mon Sep 17 00:00:00 2001 From: "Mao, Bibo" Date: Mon, 12 Dec 2005 00:37:00 -0800 Subject: [PATCH] Kprobes: Reference count the modules when probed on it When a Kprobes are inserted/removed on a modules, the modules must be ref counted so as not to allow to unload while probes are registered on that module. Without this patch, the probed module is free to unload, and when the probing module unregister the probe, the kpobes code while trying to replace the original instruction might crash. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Mao Bibo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5beda378cc75..fde5a16a2913 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -462,9 +462,16 @@ int __kprobes register_kprobe(struct kprobe *p) int ret = 0; unsigned long flags = 0; struct kprobe *old_p; + struct module *mod; + + if ((!kernel_text_address((unsigned long) p->addr)) || + in_kprobes_functions((unsigned long) p->addr)) + return -EINVAL; + + if ((mod = module_text_address((unsigned long) p->addr)) && + (unlikely(!try_module_get(mod)))) + return -EINVAL; - if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) - return ret; if ((ret = arch_prepare_kprobe(p)) != 0) goto rm_kprobe; @@ -488,6 +495,8 @@ out: rm_kprobe: if (ret == -EEXIST) arch_remove_kprobe(p); + if (ret && mod) + module_put(mod); return ret; } @@ -495,6 +504,7 @@ void __kprobes unregister_kprobe(struct kprobe *p) { unsigned long flags; struct kprobe *old_p; + struct module *mod; spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); @@ -506,6 +516,10 @@ void __kprobes unregister_kprobe(struct kprobe *p) cleanup_kprobe(p, flags); synchronize_sched(); + + if ((mod = module_text_address((unsigned long)p->addr))) + module_put(mod); + if (old_p->pre_handler == aggr_pre_handler && list_empty(&old_p->list)) kfree(old_p); -- cgit v1.2.3 From ab4720ec76b756e1f8705e207a7b392b0453afd6 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Mon, 12 Dec 2005 00:37:05 -0800 Subject: [PATCH] add rcu_barrier() synchronization point This introduces a new interface - rcu_barrier() which waits until all the RCUs queued until this call have been completed. Reiser4 needs this, because we do more than just freeing memory object in our RCU callback: we also remove it from the list hanging off super-block. This means, that before freeing reiser4-specific portion of super-block (during umount) we have to wait until all pending RCU callbacks are executed. The only change of reiser4 made to the original patch, is exporting of rcu_barrier(). Cc: Hans Reiser Cc: Vladimir V. Saveliev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c4d159a21e04..f45b91723dc6 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -116,6 +116,10 @@ void fastcall call_rcu(struct rcu_head *head, local_irq_restore(flags); } +static atomic_t rcu_barrier_cpu_count; +static struct semaphore rcu_barrier_sema; +static struct completion rcu_barrier_completion; + /** * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. @@ -162,6 +166,42 @@ long rcu_batches_completed(void) return rcu_ctrlblk.completed; } +static void rcu_barrier_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *notused) +{ + int cpu = smp_processor_id(); + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_head *head; + + head = &rdp->barrier; + atomic_inc(&rcu_barrier_cpu_count); + call_rcu(head, rcu_barrier_callback); +} + +/** + * rcu_barrier - Wait until all the in-flight RCUs are complete. + */ +void rcu_barrier(void) +{ + BUG_ON(in_interrupt()); + /* Take cpucontrol semaphore to protect against CPU hotplug */ + down(&rcu_barrier_sema); + init_completion(&rcu_barrier_completion); + atomic_set(&rcu_barrier_cpu_count, 0); + on_each_cpu(rcu_barrier_func, NULL, 0, 1); + wait_for_completion(&rcu_barrier_completion); + up(&rcu_barrier_sema); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + /* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. @@ -457,6 +497,7 @@ static struct notifier_block __devinitdata rcu_nb = { */ void __init rcu_init(void) { + sema_init(&rcu_barrier_sema, 1); rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ -- cgit v1.2.3 From 89d46b8778f65223f732d82c0166e0abba20fb1e Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 12 Dec 2005 00:37:06 -0800 Subject: [PATCH] Fix bug in RCU torture test While doing some test of RCU torture module, I hit a OOPS in rcu_do_batch, which was trying to processes callback of a module that was just removed. This is because we weren't waiting long enough for all callbacks to fire. Signed-off-by: Srivatsa Vaddagiri Cc: Dipankar Sarma Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 88c28d476550..49fbbeff201c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -409,9 +409,8 @@ rcu_torture_cleanup(void) stats_task = NULL; /* Wait for all RCU callbacks to fire. */ + rcu_barrier(); - for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) - synchronize_rcu(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ printk(KERN_ALERT TORTURE_FLAG "--- End of test: %s\n", -- cgit v1.2.3 From c3f5902325d3053986e7359f706581d8f032e72f Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 12 Dec 2005 00:37:07 -0800 Subject: [PATCH] Fix RCU race in access of nohz_cpu_mask Accessing nohz_cpu_mask before incrementing rcp->cur is racy. It can cause tickless idle CPUs to be included in rsp->cpumask, which will extend graceperiods unnecessarily. Fix this race. It has been tested using extensions to RCU torture module that forces various CPUs to become idle. Signed-off-by: Srivatsa Vaddagiri Cc: Dipankar Sarma Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f45b91723dc6..48d3bce465b8 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -257,15 +257,23 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, if (rcp->next_pending && rcp->completed == rcp->cur) { - /* Can't change, since spin lock held. */ - cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); - rcp->next_pending = 0; - /* next_pending == 0 must be visible in __rcu_process_callbacks() - * before it can see new value of cur. + /* + * next_pending == 0 must be visible in + * __rcu_process_callbacks() before it can see new value of cur. */ smp_wmb(); rcp->cur++; + + /* + * Accessing nohz_cpu_mask before incrementing rcp->cur needs a + * Barrier Otherwise it can cause tickless idle CPUs to be + * included in rsp->cpumask, which will extend graceperiods + * unnecessarily. + */ + smp_mb(); + cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); + } } -- cgit v1.2.3 From 64123fd42c7a1e4ebf6acd2399c98caddc7e0c26 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Mon, 12 Dec 2005 00:37:09 -0800 Subject: [PATCH] Add getnstimestamp function There are several functions that might seem appropriate for a timestamp: get_cycles() current_kernel_time() do_gettimeofday() Each has problems with combinations of SMP-safety, low resolution, and monotonicity. This patch adds a new function that returns a monotonic SMP-safe timestamp with nanosecond resolution where available. Changes: Split timestamp into separate patch Moved to kernel/time.c Renamed to getnstimestamp Fixed unintended-pointer-arithmetic bug Signed-off-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 245d595a13cb..b94bfa8c03e0 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -561,6 +561,28 @@ void getnstimeofday(struct timespec *tv) EXPORT_SYMBOL_GPL(getnstimeofday); #endif +void getnstimestamp(struct timespec *ts) +{ + unsigned int seq; + struct timespec wall2mono; + + /* synchronize with settimeofday() changes */ + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + wall2mono = wall_to_monotonic; + } while(unlikely(read_seqretry(&xtime_lock, seq))); + + /* adjust to monotonicaly-increasing values */ + ts->tv_sec += wall2mono.tv_sec; + ts->tv_nsec += wall2mono.tv_nsec; + while (unlikely(ts->tv_nsec >= NSEC_PER_SEC)) { + ts->tv_nsec -= NSEC_PER_SEC; + ts->tv_sec++; + } +} +EXPORT_SYMBOL_GPL(getnstimestamp); + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { -- cgit v1.2.3 From adad0f331f9c693129e81e233c5461e2e7c3e443 Mon Sep 17 00:00:00 2001 From: Keshavamurthy Anil S Date: Mon, 12 Dec 2005 00:37:12 -0800 Subject: [PATCH] kprobes: fix race in aggregate kprobe registration When registering multiple kprobes at the same address, we leave a small window where the kprobe hlist will not contain a reference to the registered kprobe, leading to potentially, a system crash if the breakpoint is hit on another processor. Patch below now automically relpace the old kprobe with the new kprobe from the hash list. Signed-off-by: Anil S Keshavamurthy Acked-by: Ananth N Mavinakayanahalli Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fde5a16a2913..e4f0fc62bd3e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -399,10 +399,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) INIT_LIST_HEAD(&ap->list); list_add_rcu(&p->list, &ap->list); - INIT_HLIST_NODE(&ap->hlist); - hlist_del_rcu(&p->hlist); - hlist_add_head_rcu(&ap->hlist, - &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); + hlist_replace_rcu(&p->hlist, &ap->hlist); } /* -- cgit v1.2.3 From 7a4ae749a478f8bca73d4b5b8c1b8cbb178b2db5 Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Mon, 12 Dec 2005 00:37:22 -0800 Subject: [PATCH] Add try_to_freeze to kauditd kauditd was causing suspends to fail because it refused to freeze. Adding a try_to_freeze() to its sleep loop solves the issue. Signed-off-by: Pierre Ossman Acked-by: Pavel Machek Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0c56320d38dc..32fa03ad1984 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -291,8 +291,10 @@ int kauditd_thread(void *dummy) set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&kauditd_wait, &wait); - if (!skb_queue_len(&audit_skb_queue)) + if (!skb_queue_len(&audit_skb_queue)) { + try_to_freeze(); schedule(); + } __set_current_state(TASK_RUNNING); remove_wait_queue(&kauditd_wait, &wait); -- cgit v1.2.3 From 00d7c05ab168c10f9b520e07400923267bc04419 Mon Sep 17 00:00:00 2001 From: Keshavamurthy Anil S Date: Mon, 12 Dec 2005 00:37:33 -0800 Subject: [PATCH] kprobes: no probes on critical path For Kprobes critical path is the path from debug break exception handler till the control reaches kprobes exception code. No probes can be supported in this path as we will end up in recursion. This patch prevents this by moving the below function to safe __kprobes section onto which no probes can be inserted. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index bce933ebb29f..eecf84526afe 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -168,7 +169,7 @@ EXPORT_SYMBOL(notifier_chain_unregister); * of the last notifier function called. */ -int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) +int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) { int ret=NOTIFY_DONE; struct notifier_block *nb = *n; -- cgit v1.2.3 From bf8d5c52c3b6b27061e3b7d779057fd9a6cac164 Mon Sep 17 00:00:00 2001 From: Keshavamurthy Anil S Date: Mon, 12 Dec 2005 00:37:34 -0800 Subject: [PATCH] kprobes: increment kprobe missed count for multiprobes When multiple probes are registered at the same address and if due to some recursion (probe getting triggered within a probe handler), we skip calling pre_handlers and just increment nmissed field. The below patch make sure it walks the list for multiple probes case. Without the below patch we get incorrect results of nmissed count for multiple probe case. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e4f0fc62bd3e..3bb71e63a37e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -246,6 +246,19 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) return ret; } +/* Walks the list and increments nmissed count for multiprobe case */ +void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) +{ + struct kprobe *kp; + if (p->pre_handler != aggr_pre_handler) { + p->nmissed++; + } else { + list_for_each_entry_rcu(kp, &p->list, list) + kp->nmissed++; + } + return; +} + /* Called with kretprobe_lock held */ struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) { -- cgit v1.2.3 From 9e28393998d3d0733097306762f6d1c083fc92c6 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 20 Dec 2005 15:21:24 +0100 Subject: [PATCH] kernel/params.c: fix sysfs access with CONFIG_MODULES=n All the work was done to setup the file and maintain the file handles but the access functions were zeroed out due to the #ifdef. Removing the #ifdef allows full access to all the parameters when CONFIG_MODULES=n. akpm: put it back again, but use CONFIG_SYSFS instead. Signed-off-by: Jason Wessel Signed-off-by: Andrew Morton Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 47ba69547945..c76ad25e6a21 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -619,7 +619,7 @@ static void __init param_sysfs_builtin(void) /* module-related sysfs stuff */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_SYSFS #define to_module_attr(n) container_of(n, struct module_attribute, attr); #define to_module_kobject(n) container_of(n, struct module_kobject, kobj); -- cgit v1.2.3 From 8e31108b9f41069d55cb9b019ac8262c55fd2616 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Dec 2005 19:54:46 -0800 Subject: [PATCH] Fix memory ordering problem in wake_futex() Fix a memory ordering problem that occurs on IA64. The "store" to q->lock_ptr in wake_futex() can become visible before wake_up_all() clears the lock in the futex_q. Signed-off-by: Jack Steiner Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/futex.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5872e3507f35..5e71a6bf6f6b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -270,7 +270,13 @@ static void wake_futex(struct futex_q *q) /* * The waiting task can free the futex_q as soon as this is written, * without taking any locks. This must come last. + * + * A memory barrier is required here to prevent the following store + * to lock_ptr from getting ahead of the wakeup. Clearing the lock + * at the end of wake_up_all() does not prevent this store from + * moving. */ + wmb(); q->lock_ptr = NULL; } -- cgit v1.2.3 From 8febdd85adaa41fa1fc1cb31286210fc2cd3ed0c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 30 Dec 2005 17:18:53 -0800 Subject: sysctl: don't overflow the user-supplied buffer with '\0' If the string was too long to fit in the user-supplied buffer, the sysctl layer would zero-terminate it by writing past the end of the buffer. Don't do that. Noticed by Yi Yang Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9990e10192e8..ad0425a8f709 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2201,14 +2201,12 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, if (get_user(len, oldlenp)) return -EFAULT; if (len) { - l = strlen(table->data); + l = strlen(table->data)+1; if (len > l) len = l; if (len >= table->maxlen) len = table->maxlen; if(copy_to_user(oldval, table->data, len)) return -EFAULT; - if(put_user(0, ((char __user *) oldval) + len)) - return -EFAULT; if(put_user(len, oldlenp)) return -EFAULT; } -- cgit v1.2.3 From 82c9df820112c6286a8e8fbe482e94b65b49062c Mon Sep 17 00:00:00 2001 From: Yi Yang Date: Fri, 30 Dec 2005 16:37:10 +0800 Subject: [PATCH] Fix false old value return of sysctl For the sysctl syscall, if the user wants to get the old value of a sysctl entry and set a new value for it in the same syscall, the old value is always overwritten by the new value if the sysctl entry is of string type and if the user sets its strategy to sysctl_string. This issue lies in the strategy being run twice if the strategy is set to sysctl_string, the general strategy sysctl_string always returns 0 if success. Such strategy routines as sysctl_jiffies and sysctl_jiffies_ms return 1 because they do read and write for the sysctl entry. The strategy routine sysctl_string return 0 although it actually read and write the sysctl entry. According to my analysis, if a strategy routine do read and write, it should return 1, if it just does some necessary check but not read and write, it should return 0, for example sysctl_intvec. Signed-off-by: Yi Yang Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ad0425a8f709..e5102ea6e104 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2221,7 +2221,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, len--; ((char *) table->data)[len] = 0; } - return 0; + return 1; } /* -- cgit v1.2.3 From de9e007d9105bf8fa613a89810feff32a43add03 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 31 Dec 2005 17:00:29 -0800 Subject: sysctl: make sure to terminate strings with a NUL This is a slightly more complete fix for the previous minimal sysctl string fix. It always terminates the returned string with a NUL, even if the full result wouldn't fit in the user-supplied buffer. The returned length is the full untruncated length, so that you can tell when truncation has occurred. Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e5102ea6e104..b53115b882e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2192,27 +2192,32 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, void **context) { - size_t l, len; - if (!table->data || !table->maxlen) return -ENOTDIR; if (oldval && oldlenp) { - if (get_user(len, oldlenp)) + size_t bufsize; + if (get_user(bufsize, oldlenp)) return -EFAULT; - if (len) { - l = strlen(table->data)+1; - if (len > l) len = l; - if (len >= table->maxlen) + if (bufsize) { + size_t len = strlen(table->data), copied; + + /* This shouldn't trigger for a well-formed sysctl */ + if (len > table->maxlen) len = table->maxlen; - if(copy_to_user(oldval, table->data, len)) + + /* Copy up to a max of bufsize-1 bytes of the string */ + copied = (len >= bufsize) ? bufsize - 1 : len; + + if (copy_to_user(oldval, table->data, copied) || + put_user(0, (char __user *)(oldval + copied))) return -EFAULT; - if(put_user(len, oldlenp)) + if (put_user(len, oldlenp)) return -EFAULT; } } if (newval && newlen) { - len = newlen; + size_t len = newlen; if (len > table->maxlen) len = table->maxlen; if(copy_from_user(table->data, newval, len)) -- cgit v1.2.3 From febf7ea4bedcd36fba0843db726bba28d22bf89a Mon Sep 17 00:00:00 2001 From: Date: Tue, 3 Jan 2006 11:35:26 +0100 Subject: gitignore: ignore more generated files Signed-off-by: Sam Ravnborg --- kernel/.gitignore | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 kernel/.gitignore (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore new file mode 100644 index 000000000000..f2ab70073bd4 --- /dev/null +++ b/kernel/.gitignore @@ -0,0 +1,5 @@ +# +# Generated files +# +config_data.h +config_data.gz -- cgit v1.2.3 From f4b09ebc8baa51ec8394c4173e3de9d62b2cc97a Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 3 Jan 2006 13:37:51 +0100 Subject: update the email address of Randy Dunlap This patch removes all references to the bouncing address rddunlap@osdl.org and one dead web page from the kernel. Signed-off-by: Adrian Bunk Acked-by: Randy Dunlap --- kernel/configs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index 986f7af31e0a..009e1ebdcb88 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -3,7 +3,7 @@ * Echo the kernel .config file used to build the kernel * * Copyright (C) 2002 Khalid Aziz - * Copyright (C) 2002 Randy Dunlap + * Copyright (C) 2002 Randy Dunlap * Copyright (C) 2002 Al Stone * Copyright (C) 2002 Hewlett-Packard Company * -- cgit v1.2.3 From 0296b2281352e4794e174b393c37f131502e09f0 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 11 Nov 2005 05:33:52 +0100 Subject: [PATCH] remove CONFIG_KOBJECT_UEVENT option It makes zero sense to have hotplug, but not the netlink events enabled today. Remove this option and merge the kobject_uevent.h header into the kobject.h header file. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b53115b882e1..6a51e25d4466 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -83,9 +84,6 @@ static int ngroups_max = NGROUPS_MAX; #ifdef CONFIG_KMOD extern char modprobe_path[]; #endif -#ifdef CONFIG_HOTPLUG -extern char hotplug_path[]; -#endif #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; #endif -- cgit v1.2.3 From 0f76e5acf9dc788e664056dda1e461f0bec93948 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 11 Nov 2005 04:58:04 +0100 Subject: [PATCH] add uevent_helper control in /sys/kernel/ This deprecates the /proc/sys/kernel/hotplug file, as all this stuff should be in /sys some day, right? :) In /sys/kernel/ we have now uevent_seqnum and uevent_helper. The seqnum is no longer used by udev, as the version for this kernel depends on netlink which events will never get out-of-order. Recent udev versions disable the /sbin/hotplug helper with an init script, cause it leads to OOM on big boxes by running hundreds of shells in parallel. It should be done now by: echo "" > /sys/kernel/uevent_helper (Note that "-n" does not work, cause neighter proc nor sysfs support truncate().) Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 015fb69ad94d..e975a76a9d5b 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -23,11 +23,29 @@ static struct subsys_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) #ifdef CONFIG_HOTPLUG -static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) +/* current uevent sequence number */ +static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) { return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); } -KERNEL_ATTR_RO(hotplug_seqnum); +KERNEL_ATTR_RO(uevent_seqnum); + +/* uevent helper program, used during early boo */ +static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%s\n", hotplug_path); +} +static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) +{ + if (count+1 > HOTPLUG_PATH_LEN) + return -ENOENT; + memcpy(hotplug_path, page, count); + hotplug_path[count] = '\0'; + if (count && hotplug_path[count-1] == '\n') + hotplug_path[count-1] = '\0'; + return count; +} +KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_KEXEC @@ -45,7 +63,8 @@ EXPORT_SYMBOL_GPL(kernel_subsys); static struct attribute * kernel_attrs[] = { #ifdef CONFIG_HOTPLUG - &hotplug_seqnum_attr.attr, + &uevent_seqnum_attr.attr, + &uevent_helper_attr.attr, #endif #ifdef CONFIG_KEXEC &crash_notes_attr.attr, -- cgit v1.2.3 From 312c004d36ce6c739512bac83b452f4c20ab1f62 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 16 Nov 2005 09:00:00 +0100 Subject: [PATCH] driver core: replace "hotplug" by "uevent" Leave the overloaded "hotplug" word to susbsystems which are handling real devices. The driver core does not "plug" anything, it just exports the state to userspace and generates events. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 14 +++++++------- kernel/sysctl.c | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e975a76a9d5b..bfb4a7a54e22 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -26,23 +26,23 @@ static struct subsys_attribute _name##_attr = \ /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); + return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); } KERNEL_ATTR_RO(uevent_seqnum); /* uevent helper program, used during early boo */ static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) { - return sprintf(page, "%s\n", hotplug_path); + return sprintf(page, "%s\n", uevent_helper); } static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) { - if (count+1 > HOTPLUG_PATH_LEN) + if (count+1 > UEVENT_HELPER_PATH_LEN) return -ENOENT; - memcpy(hotplug_path, page, count); - hotplug_path[count] = '\0'; - if (count && hotplug_path[count-1] == '\n') - hotplug_path[count-1] = '\0'; + memcpy(uevent_helper, page, count); + uevent_helper[count] = '\0'; + if (count && uevent_helper[count-1] == '\n') + uevent_helper[count-1] = '\0'; return count; } KERNEL_ATTR_RW(uevent_helper); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6a51e25d4466..345f4a1d533f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -395,8 +395,8 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", - .data = &hotplug_path, - .maxlen = HOTPLUG_PATH_LEN, + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, -- cgit v1.2.3 From f743ca5e10f4145e0b3e6d11b9b46171e16af7ce Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Tue, 22 Nov 2005 23:36:13 -0800 Subject: [PATCH] kobject_uevent CONFIG_NET=n fix lib/lib.a(kobject_uevent.o)(.text+0x25f): In function `kobject_uevent': : undefined reference to `__alloc_skb' lib/lib.a(kobject_uevent.o)(.text+0x2a1): In function `kobject_uevent': : undefined reference to `skb_over_panic' lib/lib.a(kobject_uevent.o)(.text+0x31d): In function `kobject_uevent': : undefined reference to `skb_over_panic' lib/lib.a(kobject_uevent.o)(.text+0x356): In function `kobject_uevent': : undefined reference to `netlink_broadcast' lib/lib.a(kobject_uevent.o)(.init.text+0x9): In function `kobject_uevent_init': : undefined reference to `netlink_kernel_create' make: *** [.tmp_vmlinux1] Error 1 Netlink is unconditionally enabled if CONFIG_NET, so that's OK. kobject_uevent.o is compiled even if !CONFIG_HOTPLUG, which is lazy. Let's compound the sin. Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index bfb4a7a54e22..99af8b05eeaa 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -15,6 +15,9 @@ #include #include +u64 uevent_seqnum; +char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug"; + #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) -- cgit v1.2.3 From a576219aca70e6700705a9836e098dbecd25fb56 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Jan 2006 00:09:50 -0800 Subject: [PATCH] swsusp: resume_store() retval fix - This function returns -EINVAL all the time. Fix. - Decruftify it a bit too. - Writing to it doesn't seem to do what it's suppoed to do. Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 027322a564f4..4d944b281b28 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -363,30 +363,28 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf) MINOR(swsusp_resume_device)); } -static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) { - int len; - char *p; unsigned int maj, min; - int error = -EINVAL; dev_t res; + int ret = -EINVAL; - p = memchr(buf, '\n', n); - len = p ? p - buf : n; + if (sscanf(buf, "%u:%u", &maj, &min) != 2) + goto out; - if (sscanf(buf, "%u:%u", &maj, &min) == 2) { - res = MKDEV(maj,min); - if (maj == MAJOR(res) && min == MINOR(res)) { - down(&pm_sem); - swsusp_resume_device = res; - up(&pm_sem); - printk("Attempting manual resume\n"); - noresume = 0; - software_resume(); - } - } + res = MKDEV(maj,min); + if (maj != MAJOR(res) || min != MINOR(res)) + goto out; - return error >= 0 ? n : error; + down(&pm_sem); + swsusp_resume_device = res; + up(&pm_sem); + printk("Attempting manual resume\n"); + noresume = 0; + software_resume(); + ret = n; +out: + return ret; } power_attr(resume); -- cgit v1.2.3 From 7ee1dd3fee22f15728f545d266403fc977e1eb99 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Jan 2006 00:11:44 -0800 Subject: [PATCH] FRV: Make futex code compilable on nommu [try #2] Make the futex code compilable and usable on NOMMU by making the attempt to handle page faults conditional on CONFIG_MMU. If this is not enabled, then we can assume that EFAULT returned from futex_atomic_op_inuser() is not recoverable, and that the address lies outside of valid memory. handle_mm_fault() is made to BUG if called on NOMMU without attempting to invoke the actual handler (__handle_mm_fault). Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5e71a6bf6f6b..5efa2f978032 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -356,6 +356,13 @@ retry: if (bh1 != bh2) spin_unlock(&bh2->lock); +#ifndef CONFIG_MMU + /* we don't get EFAULT from MMU faults if we don't have an MMU, + * but we might get them from range checking */ + ret = op_ret; + goto out; +#endif + if (unlikely(op_ret != -EFAULT)) { ret = op_ret; goto out; -- cgit v1.2.3 From eee45269b0f5979c70bc151c6c2f4e5f4f5ababe Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Fri, 6 Jan 2006 00:12:21 -0800 Subject: [PATCH] Alpha: convert to generic irq framework (generic part) Thanks to Christoph for doing most of the work. This allows automatic SMP IRQ affinity assignment other than default "all interrupts on all CPUs" which is rather expensive. This might be useful if the hardware can be programmed to distribute interrupts among different CPUs, like Alpha does. Signed-off-by: Ivan Kokshaysky Cc: Christoph Hellwig Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 2 ++ kernel/irq/proc.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 81c49a4d679e..97d5559997d2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -366,6 +366,8 @@ int request_irq(unsigned int irq, action->next = NULL; action->dev_id = dev_id; + select_smp_affinity(irq); + retval = setup_irq(irq, action); if (retval) kfree(action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f26e534c6585..8a64a4844cde 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -68,7 +68,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, */ cpus_and(tmp, new_value, cpu_online_map); if (cpus_empty(tmp)) - return -EINVAL; + /* Special case for empty set - allow the architecture + code to set default SMP affinity. */ + return select_smp_affinity(irq) ? -EINVAL : full_count; proc_set_irq_affinity(irq, new_value); -- cgit v1.2.3 From f2d97f02961e8b1f8a24befb88ab0e5c886586ff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:12:24 -0800 Subject: [PATCH] swsusp: remove encryption This patch removes the image encryption that is only used by swsusp instead of zeroing the image after resume in order to prevent someone from reading some confidential data from it in the future and it does not protect the image from being read by an unauthorized person before resume. The functionality it provides should really belong to the user space and will possibly be reimplemented after the swap-handling functionality of swsusp is moved to the user space. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 163 ++------------------------------------------------ 1 file changed, 4 insertions(+), 159 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c05f46e7348f..bd3097c583bf 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -30,9 +30,6 @@ * Alex Badea : * Fixed runaway init * - * Andreas Steinmetz : - * Added encrypted suspend option - * * More state savers are welcome. Especially for the scsi layer... * * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt @@ -67,10 +64,6 @@ #include #include -#include -#include -#include - #include "power.h" #ifdef CONFIG_HIGHMEM @@ -81,10 +74,6 @@ static int save_highmem(void) { return 0; } static int restore_highmem(void) { return 0; } #endif -#define CIPHER "aes" -#define MAXKEY 32 -#define MAXIV 32 - extern char resume_file[]; /* Local variables that should not be affected by save */ @@ -102,8 +91,7 @@ suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; - u8 key_iv[MAXKEY+MAXIV]; + char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; swp_entry_t swsusp_info; char orig_sig[10]; char sig[10]; @@ -123,131 +111,6 @@ static struct swsusp_info swsusp_info; static unsigned short swapfile_used[MAX_SWAPFILES]; static unsigned short root_swap; -static int write_page(unsigned long addr, swp_entry_t *loc); -static int bio_read_page(pgoff_t page_off, void *page); - -static u8 key_iv[MAXKEY+MAXIV]; - -#ifdef CONFIG_SWSUSP_ENCRYPT - -static int crypto_init(int mode, void **mem) -{ - int error = 0; - int len; - char *modemsg; - struct crypto_tfm *tfm; - - modemsg = mode ? "suspend not possible" : "resume not possible"; - - tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); - if(!tfm) { - printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); - error = -EINVAL; - goto out; - } - - if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { - printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); - error = -ENOKEY; - goto fail; - } - - if (mode) - get_random_bytes(key_iv, MAXKEY+MAXIV); - - len = crypto_tfm_alg_max_keysize(tfm); - if (len > MAXKEY) - len = MAXKEY; - - if (crypto_cipher_setkey(tfm, key_iv, len)) { - printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); - error = -EKEYREJECTED; - goto fail; - } - - len = crypto_tfm_alg_ivsize(tfm); - - if (MAXIV < len) { - printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); - error = -EOVERFLOW; - goto fail; - } - - crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); - - *mem=(void *)tfm; - - goto out; - -fail: crypto_free_tfm(tfm); -out: return error; -} - -static __inline__ void crypto_exit(void *mem) -{ - crypto_free_tfm((struct crypto_tfm *)mem); -} - -static __inline__ int crypto_write(struct pbe *p, void *mem) -{ - int error = 0; - struct scatterlist src, dst; - - src.page = virt_to_page(p->address); - src.offset = 0; - src.length = PAGE_SIZE; - dst.page = virt_to_page((void *)&swsusp_header); - dst.offset = 0; - dst.length = PAGE_SIZE; - - error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, - PAGE_SIZE); - - if (!error) - error = write_page((unsigned long)&swsusp_header, - &(p->swap_address)); - return error; -} - -static __inline__ int crypto_read(struct pbe *p, void *mem) -{ - int error = 0; - struct scatterlist src, dst; - - error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); - if (!error) { - src.offset = 0; - src.length = PAGE_SIZE; - dst.offset = 0; - dst.length = PAGE_SIZE; - src.page = dst.page = virt_to_page((void *)p->address); - - error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, - &src, PAGE_SIZE); - } - return error; -} -#else -static __inline__ int crypto_init(int mode, void *mem) -{ - return 0; -} - -static __inline__ void crypto_exit(void *mem) -{ -} - -static __inline__ int crypto_write(struct pbe *p, void *mem) -{ - return write_page(p->address, &(p->swap_address)); -} - -static __inline__ int crypto_read(struct pbe *p, void *mem) -{ - return bio_read_page(swp_offset(p->swap_address), (void *)p->address); -} -#endif - static int mark_swapfiles(swp_entry_t prev) { int error; @@ -259,7 +122,6 @@ static int mark_swapfiles(swp_entry_t prev) !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); swsusp_header.swsusp_info = prev; error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), @@ -405,10 +267,6 @@ static int data_write(void) int error = 0, i = 0; unsigned int mod = nr_copy_pages / 100; struct pbe *p; - void *tfm; - - if ((error = crypto_init(1, &tfm))) - return error; if (!mod) mod = 1; @@ -417,14 +275,11 @@ static int data_write(void) for_each_pbe (p, pagedir_nosave) { if (!(i%mod)) printk( "\b\b\b\b%3d%%", i / mod ); - if ((error = crypto_write(p, tfm))) { - crypto_exit(tfm); + if ((error = write_page(p->address, &p->swap_address))) return error; - } i++; } printk("\b\b\b\bdone\n"); - crypto_exit(tfm); return error; } @@ -550,7 +405,6 @@ static int write_suspend_image(void) if ((error = close_swap())) goto FreePagedir; Done: - memset(key_iv, 0, MAXKEY+MAXIV); return error; FreePagedir: free_pagedir_entries(); @@ -812,8 +666,6 @@ static int check_sig(void) return error; if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); - memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); - memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); /* * Reset swap signature now. @@ -840,10 +692,6 @@ static int data_read(struct pbe *pblist) int error = 0; int i = 0; int mod = swsusp_info.image_pages / 100; - void *tfm; - - if ((error = crypto_init(0, &tfm))) - return error; if (!mod) mod = 1; @@ -855,15 +703,13 @@ static int data_read(struct pbe *pblist) if (!(i % mod)) printk("\b\b\b\b%3d%%", i / mod); - if ((error = crypto_read(p, tfm))) { - crypto_exit(tfm); + if ((error = bio_read_page(swp_offset(p->swap_address), + (void *)p->address))) return error; - } i++; } printk("\b\b\b\bdone\n"); - crypto_exit(tfm); return error; } @@ -986,7 +832,6 @@ int swsusp_read(void) error = read_suspend_image(); blkdev_put(resume_bdev); - memset(key_iv, 0, MAXKEY+MAXIV); if (!error) pr_debug("swsusp: Reading resume file was successful\n"); -- cgit v1.2.3 From 7088a5c00103ef48782d6c359cd12b13a10666e6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:13:05 -0800 Subject: [PATCH] swsusp: introduce the swap map structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces the swap map structure that can be used by swsusp for keeping tracks of data pages written to the swap.  The structure itself is described in a comment within the patch. The overall idea is to reduce the amount of metadata written to the swap and to write and read the image pages sequentially, in a file-alike way. This makes the swap-handling part of swsusp fairly independent of its snapshot-handling part and will hopefully allow us to completely separate these two parts in the future. This patch is needed to remove the suspend image size limit imposed by the limited size of the swsusp_info structure, which is essential for x86-64 systems with more than 512 MB of RAM. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 8 +- kernel/power/power.h | 13 +- kernel/power/snapshot.c | 14 +- kernel/power/swsusp.c | 558 ++++++++++++++++++++++++++++++++++-------------- 4 files changed, 417 insertions(+), 176 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 4d944b281b28..76a5131b0e80 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -25,9 +25,9 @@ extern suspend_disk_method_t pm_disk_mode; extern int swsusp_suspend(void); -extern int swsusp_write(void); +extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); extern int swsusp_check(void); -extern int swsusp_read(void); +extern int swsusp_read(struct pbe **pblist_ptr); extern void swsusp_close(void); extern int swsusp_resume(void); @@ -176,7 +176,7 @@ int pm_suspend_disk(void) if (in_suspend) { device_resume(); pr_debug("PM: writing image.\n"); - error = swsusp_write(); + error = swsusp_write(pagedir_nosave, nr_copy_pages); if (!error) power_down(pm_disk_mode); else { @@ -247,7 +247,7 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) { + if ((error = swsusp_read(&pagedir_nosave))) { swsusp_free(); goto Thaw; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 6c042b5ee14b..977877c6dcfc 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -9,19 +9,14 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) #endif -#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ - - 4 - 3*sizeof(unsigned long) - sizeof(int) \ - - sizeof(void *)) / sizeof(swp_entry_t)) - struct swsusp_info { struct new_utsname uts; u32 version_code; unsigned long num_physpages; int cpus; unsigned long image_pages; - unsigned long pagedir_pages; - suspend_pagedir_t * suspend_pagedir; - swp_entry_t pagedir[MAX_PBES]; + unsigned long pages; + swp_entry_t start; } __attribute__((aligned(PAGE_SIZE))); @@ -67,6 +62,8 @@ extern asmlinkage int swsusp_arch_resume(void); extern void free_pagedir(struct pbe *pblist); extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); -extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); extern void swsusp_free(void); extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); +extern unsigned int snapshot_nr_pages(void); +extern struct pbe *snapshot_pblist(void); +extern void snapshot_pblist_set(struct pbe *pblist); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4a6dbcefd378..152d56cdf017 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -33,6 +33,9 @@ #include "power.h" +struct pbe *pagedir_nosave; +unsigned int nr_copy_pages; + #ifdef CONFIG_HIGHMEM struct highmem_page { char *data; @@ -244,7 +247,7 @@ static inline void fill_pb_page(struct pbe *pbpage) * of memory pages allocated with alloc_pagedir() */ -void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) +static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) { struct pbe *pbpage, *p; unsigned int num = PBES_PER_PAGE; @@ -261,7 +264,6 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) p->next = p + 1; p->next = NULL; } - pr_debug("create_pbe_list(): initialized %d PBEs\n", num); } /** @@ -332,7 +334,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed if (!pbe) { /* get_zeroed_page() failed */ free_pagedir(pblist); pblist = NULL; - } + } else + create_pbe_list(pblist, nr_pages); return pblist; } @@ -395,7 +398,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages) printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); return NULL; } - create_pbe_list(pblist, nr_pages); if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { printk(KERN_ERR "suspend: Allocating image pages failed.\n"); @@ -421,10 +423,6 @@ asmlinkage int swsusp_save(void) (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, PAGES_FOR_IO, nr_free_pages()); - /* This is needed because of the fixed size of swsusp_info */ - if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) - return -ENOSPC; - if (!enough_free_mem(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free memory\n"); return -ENOMEM; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index bd3097c583bf..b09bd7c0998d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -30,6 +30,9 @@ * Alex Badea : * Fixed runaway init * + * Rafael J. Wysocki + * Added the swap map data structure and reworked the handling of swap + * * More state savers are welcome. Especially for the scsi layer... * * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt @@ -76,18 +79,6 @@ static int restore_highmem(void) { return 0; } extern char resume_file[]; -/* Local variables that should not be affected by save */ -unsigned int nr_copy_pages __nosavedata = 0; - -/* Suspend pagedir is allocated before final copy, therefore it - must be freed after resume - - Warning: this is even more evil than it seems. Pagedirs this file - talks about are completely different from page directories used by - MMU hardware. - */ -suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; - #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { @@ -238,48 +229,205 @@ static int write_page(unsigned long addr, swp_entry_t *loc) } /** - * data_free - Free the swap entries used by the saved image. + * Swap map-handling functions + * + * The swap map is a data structure used for keeping track of each page + * written to the swap. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_SIZE swap entries. + * These structures are linked together with the help of either the + * .next (in memory) or the .next_swap (in swap) member. * - * Walk the list of used swap entries and free each one. - * This is only used for cleanup when suspend fails. + * The swap map is created during suspend. At that time we need to keep + * it in memory, because we have to free all of the allocated swap + * entries if an error occurs. The memory needed is preallocated + * so that we know in advance if there's enough of it. + * + * The first swap_map_page structure is filled with the swap entries that + * correspond to the first MAP_PAGE_SIZE data pages written to swap and + * so on. After the all of the data pages have been written, the order + * of the swap_map_page structures in the map is reversed so that they + * can be read from swap in the original order. This causes the data + * pages to be loaded in exactly the same order in which they have been + * saved. + * + * During resume we only need to use one swap_map_page structure + * at a time, which means that we only need to use two memory pages for + * reading the image - one for reading the swap_map_page structures + * and the second for reading the data pages from swap. */ -static void data_free(void) + +#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ + / sizeof(swp_entry_t)) + +struct swap_map_page { + swp_entry_t entries[MAP_PAGE_SIZE]; + swp_entry_t next_swap; + struct swap_map_page *next; +}; + +static inline void free_swap_map(struct swap_map_page *swap_map) { - swp_entry_t entry; - struct pbe *p; + struct swap_map_page *swp; - for_each_pbe (p, pagedir_nosave) { - entry = p->swap_address; - if (entry.val) - swap_free(entry); - else - break; + while (swap_map) { + swp = swap_map->next; + free_page((unsigned long)swap_map); + swap_map = swp; + } +} + +static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) +{ + struct swap_map_page *swap_map, *swp; + unsigned n = 0; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); + swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + swp = swap_map; + for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { + swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + swp = swp->next; + if (!swp) { + free_swap_map(swap_map); + return NULL; + } } + return swap_map; } /** - * data_write - Write saved image to swap. - * - * Walk the list of pages in the image and sync each one to swap. + * reverse_swap_map - reverse the order of pages in the swap map + * @swap_map */ -static int data_write(void) + +static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) { - int error = 0, i = 0; - unsigned int mod = nr_copy_pages / 100; - struct pbe *p; + struct swap_map_page *prev, *next; + + prev = NULL; + while (swap_map) { + next = swap_map->next; + swap_map->next = prev; + prev = swap_map; + swap_map = next; + } + return prev; +} - if (!mod) - mod = 1; +/** + * free_swap_map_entries - free the swap entries allocated to store + * the swap map @swap_map (this is only called in case of an error) + */ +static inline void free_swap_map_entries(struct swap_map_page *swap_map) +{ + while (swap_map) { + if (swap_map->next_swap.val) + swap_free(swap_map->next_swap); + swap_map = swap_map->next; + } +} - printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); - for_each_pbe (p, pagedir_nosave) { - if (!(i%mod)) - printk( "\b\b\b\b%3d%%", i / mod ); - if ((error = write_page(p->address, &p->swap_address))) +/** + * save_swap_map - save the swap map used for tracing the data pages + * stored in the swap + */ + +static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) +{ + swp_entry_t entry = (swp_entry_t){0}; + int error; + + while (swap_map) { + swap_map->next_swap = entry; + if ((error = write_page((unsigned long)swap_map, &entry))) return error; - i++; + swap_map = swap_map->next; } - printk("\b\b\b\bdone\n"); + *start = entry; + return 0; +} + +/** + * free_image_entries - free the swap entries allocated to store + * the image data pages (this is only called in case of an error) + */ + +static inline void free_image_entries(struct swap_map_page *swp) +{ + unsigned k; + + while (swp) { + for (k = 0; k < MAP_PAGE_SIZE; k++) + if (swp->entries[k].val) + swap_free(swp->entries[k]); + swp = swp->next; + } +} + +/** + * The swap_map_handle structure is used for handling the swap map in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + unsigned int k; +}; + +static inline void init_swap_map_handle(struct swap_map_handle *handle, + struct swap_map_page *map) +{ + handle->cur = map; + handle->k = 0; +} + +static inline int swap_map_write_page(struct swap_map_handle *handle, + unsigned long addr) +{ + int error; + + error = write_page(addr, handle->cur->entries + handle->k); + if (error) + return error; + if (++handle->k >= MAP_PAGE_SIZE) { + handle->cur = handle->cur->next; + handle->k = 0; + } + return 0; +} + +/** + * save_image_data - save the data pages pointed to by the PBEs + * from the list @pblist using the swap map handle @handle + * (assume there are @nr_pages data pages to save) + */ + +static int save_image_data(struct pbe *pblist, + struct swap_map_handle *handle, + unsigned int nr_pages) +{ + unsigned int m; + struct pbe *p; + int error = 0; + + printk("Saving image data pages (%u pages) ... ", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + for_each_pbe (p, pblist) { + error = swap_map_write_page(handle, p->address); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + if (!error) + printk("\b\b\b\bdone\n"); return error; } @@ -295,19 +443,20 @@ static void dump_info(void) pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); - pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); + pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); } -static void init_header(void) +static void init_header(unsigned int nr_pages) { memset(&swsusp_info, 0, sizeof(swsusp_info)); swsusp_info.version_code = LINUX_VERSION_CODE; swsusp_info.num_physpages = num_physpages; memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); - swsusp_info.suspend_pagedir = pagedir_nosave; swsusp_info.cpus = num_online_cpus(); - swsusp_info.image_pages = nr_copy_pages; + swsusp_info.image_pages = nr_pages; + swsusp_info.pages = nr_pages + + ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT); } static int close_swap(void) @@ -326,39 +475,53 @@ static int close_swap(void) } /** - * free_pagedir_entries - Free pages used by the page directory. - * - * This is used during suspend for error recovery. + * pack_orig_addresses - the .orig_address fields of the PBEs from the + * list starting at @pbe are stored in the array @buf[] (1 page) */ -static void free_pagedir_entries(void) +static inline struct pbe *pack_orig_addresses(unsigned long *buf, + struct pbe *pbe) { - int i; + int j; - for (i = 0; i < swsusp_info.pagedir_pages; i++) - swap_free(swsusp_info.pagedir[i]); + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + buf[j] = pbe->orig_address; + pbe = pbe->next; + } + if (!pbe) + for (; j < PAGE_SIZE / sizeof(long); j++) + buf[j] = 0; + return pbe; } - /** - * write_pagedir - Write the array of pages holding the page directory. - * @last: Last swap entry we write (needed for header). + * save_image_metadata - save the .orig_address fields of the PBEs + * from the list @pblist using the swap map handle @handle */ -static int write_pagedir(void) +static int save_image_metadata(struct pbe *pblist, + struct swap_map_handle *handle) { - int error = 0; + unsigned long *buf; unsigned int n = 0; - struct pbe *pbe; + struct pbe *p; + int error = 0; - printk( "Writing pagedir..."); - for_each_pb_page (pbe, pagedir_nosave) { - if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) - return error; + printk("Saving image metadata ... "); + buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); + if (!buf) + return -ENOMEM; + p = pblist; + while (p) { + p = pack_orig_addresses(buf, p); + error = swap_map_write_page(handle, (unsigned long)buf); + if (error) + break; + n++; } - - swsusp_info.pagedir_pages = n; - printk("done (%u pages)\n", n); + free_page((unsigned long)buf); + if (!error) + printk("done (%u pages saved)\n", n); return error; } @@ -384,33 +547,48 @@ static int enough_swap(unsigned int nr_pages) /** * write_suspend_image - Write entire image and metadata. - * */ -static int write_suspend_image(void) +static int write_suspend_image(struct pbe *pblist, unsigned int nr_pages) { + struct swap_map_page *swap_map; + struct swap_map_handle handle; int error; - if (!enough_swap(nr_copy_pages)) { + if (!enough_swap(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free swap\n"); return -ENOSPC; } - init_header(); - if ((error = data_write())) - goto FreeData; + init_header(nr_pages); + swap_map = alloc_swap_map(swsusp_info.pages); + if (!swap_map) + return -ENOMEM; + init_swap_map_handle(&handle, swap_map); - if ((error = write_pagedir())) - goto FreePagedir; + error = save_image_metadata(pblist, &handle); + if (!error) + error = save_image_data(pblist, &handle, nr_pages); + if (error) + goto Free_image_entries; - if ((error = close_swap())) - goto FreePagedir; - Done: + swap_map = reverse_swap_map(swap_map); + error = save_swap_map(swap_map, &swsusp_info.start); + if (error) + goto Free_map_entries; + + error = close_swap(); + if (error) + goto Free_map_entries; + +Free_swap_map: + free_swap_map(swap_map); return error; - FreePagedir: - free_pagedir_entries(); - FreeData: - data_free(); - goto Done; + +Free_map_entries: + free_swap_map_entries(swap_map); +Free_image_entries: + free_image_entries(swap_map); + goto Free_swap_map; } /* It is important _NOT_ to umount filesystems at this point. We want @@ -418,7 +596,7 @@ static int write_suspend_image(void) * filesystem clean: it is not. (And it does not matter, if we resume * correctly, we'll mark system clean, anyway.) */ -int swsusp_write(void) +int swsusp_write(struct pbe *pblist, unsigned int nr_pages) { int error; @@ -427,14 +605,12 @@ int swsusp_write(void) return error; } lock_swapdevices(); - error = write_suspend_image(); + error = write_suspend_image(pblist, nr_pages); /* This will unlock ignored swap devices since writing is finished */ lock_swapdevices(); return error; } - - int swsusp_suspend(void) { int error; @@ -531,7 +707,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src) /* We assume both lists contain the same number of elements */ while (src) { dst->orig_address = src->orig_address; - dst->swap_address = src->swap_address; dst = dst->next; src = src->next; } @@ -611,6 +786,61 @@ static int bio_write_page(pgoff_t page_off, void *page) return submit(WRITE, page_off, page); } +/** + * The following functions allow us to read data using a swap map + * in a file-alike way + */ + +static inline void release_swap_map_reader(struct swap_map_handle *handle) +{ + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; +} + +static inline int get_swap_map_reader(struct swap_map_handle *handle, + swp_entry_t start) +{ + int error; + + if (!swp_offset(start)) + return -EINVAL; + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + if (!handle->cur) + return -ENOMEM; + error = bio_read_page(swp_offset(start), handle->cur); + if (error) { + release_swap_map_reader(handle); + return error; + } + handle->k = 0; + return 0; +} + +static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) +{ + unsigned long offset; + int error; + + if (!handle->cur) + return -EINVAL; + offset = swp_offset(handle->cur->entries[handle->k]); + if (!offset) + return -EINVAL; + error = bio_read_page(offset, buf); + if (error) + return error; + if (++handle->k >= MAP_PAGE_SIZE) { + handle->k = 0; + offset = swp_offset(handle->cur->next_swap); + if (!offset) + release_swap_map_reader(handle); + else + error = bio_read_page(offset, handle->cur); + } + return error; +} + /* * Sanity check if this image makes sense with this kernel/swap context * I really don't think that it's foolproof but more than nothing.. @@ -639,7 +869,6 @@ static const char *sanity_check(void) return NULL; } - static int check_header(void) { const char *reason = NULL; @@ -653,7 +882,6 @@ static int check_header(void) printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason); return -EPERM; } - nr_copy_pages = swsusp_info.image_pages; return error; } @@ -680,75 +908,88 @@ static int check_sig(void) } /** - * data_read - Read image pages from swap. - * - * You do not need to check for overlaps, check_pagedir() - * already did that. + * load_image_data - load the image data using the swap map handle + * @handle and store them using the page backup list @pblist + * (assume there are @nr_pages pages to load) */ -static int data_read(struct pbe *pblist) +static int load_image_data(struct pbe *pblist, + struct swap_map_handle *handle, + unsigned int nr_pages) { + int error; + unsigned int m; struct pbe *p; - int error = 0; - int i = 0; - int mod = swsusp_info.image_pages / 100; - - if (!mod) - mod = 1; - - printk("swsusp: Reading image data (%lu pages): ", - swsusp_info.image_pages); - - for_each_pbe (p, pblist) { - if (!(i % mod)) - printk("\b\b\b\b%3d%%", i / mod); - if ((error = bio_read_page(swp_offset(p->swap_address), - (void *)p->address))) - return error; - - i++; + if (!pblist) + return -EINVAL; + printk("Loading image data pages (%u pages) ... ", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + p = pblist; + while (p) { + error = swap_map_read_page(handle, (void *)p->address); + if (error) + break; + p = p->next; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; } - printk("\b\b\b\bdone\n"); + if (!error) + printk("\b\b\b\bdone\n"); return error; } /** - * read_pagedir - Read page backup list pages from swap + * unpack_orig_addresses - copy the elements of @buf[] (1 page) to + * the PBEs in the list starting at @pbe */ -static int read_pagedir(struct pbe *pblist) +static inline struct pbe *unpack_orig_addresses(unsigned long *buf, + struct pbe *pbe) { - struct pbe *pbpage, *p; - unsigned int i = 0; - int error; + int j; - if (!pblist) - return -EFAULT; + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + pbe->orig_address = buf[j]; + pbe = pbe->next; + } + return pbe; +} - printk("swsusp: Reading pagedir (%lu pages)\n", - swsusp_info.pagedir_pages); +/** + * load_image_metadata - load the image metadata using the swap map + * handle @handle and put them into the PBEs in the list @pblist + */ - for_each_pb_page (pbpage, pblist) { - unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); +static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) +{ + struct pbe *p; + unsigned long *buf; + unsigned int n = 0; + int error = 0; - error = -EFAULT; - if (offset) { - p = (pbpage + PB_PAGE_SKIP)->next; - error = bio_read_page(offset, (void *)pbpage); - (pbpage + PB_PAGE_SKIP)->next = p; - } + printk("Loading image metadata ... "); + buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); + if (!buf) + return -ENOMEM; + p = pblist; + while (p) { + error = swap_map_read_page(handle, buf); if (error) break; + p = unpack_orig_addresses(buf, p); + n++; } - + free_page((unsigned long)buf); if (!error) - BUG_ON(i != swsusp_info.pagedir_pages); - + printk("done (%u pages loaded)\n", n); return error; } - static int check_suspend_image(void) { int error = 0; @@ -762,34 +1003,39 @@ static int check_suspend_image(void) return 0; } -static int read_suspend_image(void) +static int read_suspend_image(struct pbe **pblist_ptr) { int error = 0; - struct pbe *p; + struct pbe *p, *pblist; + struct swap_map_handle handle; + unsigned int nr_pages = swsusp_info.image_pages; - if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) + p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); + if (!p) return -ENOMEM; - - if ((error = read_pagedir(p))) + error = get_swap_map_reader(&handle, swsusp_info.start); + if (error) + /* The PBE list at p will be released by swsusp_free() */ return error; - create_pbe_list(p, nr_copy_pages); - mark_unsafe_pages(p); - pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); - if (pagedir_nosave) { - create_pbe_list(pagedir_nosave, nr_copy_pages); - copy_page_backup_list(pagedir_nosave, p); + error = load_image_metadata(p, &handle); + if (!error) { + mark_unsafe_pages(p); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); + if (pblist) + copy_page_backup_list(pblist, p); + free_pagedir(p); + if (!pblist) + error = -ENOMEM; + + /* Allocate memory for the image and read the data from swap */ + if (!error) + error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + if (!error) + error = load_image_data(pblist, &handle, nr_pages); + if (!error) + *pblist_ptr = pblist; } - free_pagedir(p); - if (!pagedir_nosave) - return -ENOMEM; - - /* Allocate memory for the image and read the data from swap */ - - error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); - - if (!error) - error = data_read(pagedir_nosave); - + release_swap_map_reader(&handle); return error; } @@ -821,7 +1067,7 @@ int swsusp_check(void) * swsusp_read - Read saved image from swap. */ -int swsusp_read(void) +int swsusp_read(struct pbe **pblist_ptr) { int error; @@ -830,7 +1076,7 @@ int swsusp_read(void) return PTR_ERR(resume_bdev); } - error = read_suspend_image(); + error = read_suspend_image(pblist_ptr); blkdev_put(resume_bdev); if (!error) -- cgit v1.2.3 From 72a97e08394a3b2e75481ff680ec2a0591e3cba4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:13:46 -0800 Subject: [PATCH] swsusp: improve freeing of memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes swsusp free only as much memory as needed to complete the suspend and not as much as possible.  In the most of cases this should speed up the suspend and make the system much more responsive after resume, especially if a GUI (eg. X Windows) is used. If needed, the old behavior (ie to free as much memory as possible during suspend) can be restored by unsetting FAST_FREE in power.h Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 30 +++-------------------- kernel/power/power.h | 14 ++++++++--- kernel/power/snapshot.c | 65 +++++++++++++++++++++++++++++++++++++++++++++---- kernel/power/swsusp.c | 52 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 76a5131b0e80..9e51cdf7b78d 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -24,6 +24,7 @@ extern suspend_disk_method_t pm_disk_mode; +extern int swsusp_shrink_memory(void); extern int swsusp_suspend(void); extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); extern int swsusp_check(void); @@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode) static int in_suspend __nosavedata = 0; -/** - * free_some_memory - Try to free as much memory as possible - * - * ... but do not OOM-kill anyone - * - * Notice: all userland should be stopped at this point, or - * livelock is possible. - */ - -static void free_some_memory(void) -{ - unsigned int i = 0; - unsigned int tmp; - unsigned long pages = 0; - char *p = "-\\|/"; - - printk("Freeing memory... "); - while ((tmp = shrink_all_memory(10000))) { - pages += tmp; - printk("\b%c", p[i++ % 4]); - } - printk("\bdone (%li pages freed)\n", pages); -} - - static inline void platform_finish(void) { if (pm_disk_mode == PM_DISK_PLATFORM) { @@ -127,8 +103,8 @@ static int prepare_processes(void) } /* Free memory before shutting down devices. */ - free_some_memory(); - return 0; + if (!(error = swsusp_shrink_memory())) + return 0; thaw: thaw_processes(); enable_nonboot_cpus(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 977877c6dcfc..acdc83b3d890 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -49,18 +49,26 @@ extern void thaw_processes(void); extern int pm_prepare_console(void); extern void pm_restore_console(void); - /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; extern unsigned int nr_copy_pages; -extern suspend_pagedir_t *pagedir_nosave; -extern suspend_pagedir_t *pagedir_save; +extern struct pbe *pagedir_nosave; + +/* + * This compilation switch determines the way in which memory will be freed + * during suspend. If defined, only as much memory will be freed as needed + * to complete the suspend, which will make it go faster. Otherwise, the + * largest possible amount of memory will be freed. + */ +#define FAST_FREE 1 extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); +extern unsigned int count_data_pages(void); extern void free_pagedir(struct pbe *pblist); +extern void release_eaten_pages(void); extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); extern void swsusp_free(void); extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 152d56cdf017..e80d282dbf58 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -37,6 +37,31 @@ struct pbe *pagedir_nosave; unsigned int nr_copy_pages; #ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + unsigned int n = 0; + + for_each_zone (zone) + if (is_highmem(zone)) { + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { + struct page *page; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageReserved(page)) + continue; + if (PageNosaveFree(page)) + continue; + n++; + } + } + return n; +} + struct highmem_page { char *data; struct page *page; @@ -152,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; - if (PageReserved(page) && pfn_is_nosave(pfn)) { - pr_debug("[nosave pfn 0x%lx]", pfn); + if (PageReserved(page) && pfn_is_nosave(pfn)) return 0; - } if (PageNosaveFree(page)) return 0; return 1; } -static unsigned count_data_pages(void) +unsigned int count_data_pages(void) { struct zone *zone; unsigned long zone_pfn; @@ -266,6 +289,35 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) } } +/** + * On resume it is necessary to trace and eventually free the unsafe + * pages that have been allocated, because they are needed for I/O + * (on x86-64 we likely will "eat" these pages once again while + * creating the temporary page translation tables) + */ + +struct eaten_page { + struct eaten_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct eaten_page *eaten_pages = NULL; + +void release_eaten_pages(void) +{ + struct eaten_page *p, *q; + + p = eaten_pages; + while (p) { + q = p->next; + /* We don't want swsusp_free() to free this page again */ + ClearPageNosave(virt_to_page(p)); + free_page((unsigned long)p); + p = q; + } + eaten_pages = NULL; +} + /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages @@ -284,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) if (safe_needed) do { res = (void *)get_zeroed_page(gfp_mask); - if (res && PageNosaveFree(virt_to_page(res))) + if (res && PageNosaveFree(virt_to_page(res))) { /* This is for swsusp_free() */ SetPageNosave(virt_to_page(res)); + ((struct eaten_page *)res)->next = eaten_pages; + eaten_pages = res; + } } while (res && PageNosaveFree(virt_to_page(res))); else res = (void *)get_zeroed_page(gfp_mask); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index b09bd7c0998d..f77f9397a364 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -70,11 +70,13 @@ #include "power.h" #ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void); int save_highmem(void); int restore_highmem(void); #else static int save_highmem(void) { return 0; } static int restore_highmem(void) { return 0; } +static unsigned int count_highmem_pages(void) { return 0; } #endif extern char resume_file[]; @@ -611,6 +613,52 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages) return error; } +/** + * swsusp_shrink_memory - Try to free as much memory as needed + * + * ... but do not OOM-kill anyone + * + * Notice: all userland should be stopped before it is called, or + * livelock is possible. + */ + +#define SHRINK_BITE 10000 + +int swsusp_shrink_memory(void) +{ + long tmp; + struct zone *zone; + unsigned long pages = 0; + unsigned int i = 0; + char *p = "-\\|/"; + + printk("Shrinking memory... "); + do { +#ifdef FAST_FREE + tmp = 2 * count_highmem_pages(); + tmp += tmp / 50 + count_data_pages(); + tmp += (tmp + PBES_PER_PAGE - 1) / PBES_PER_PAGE + + PAGES_FOR_IO; + for_each_zone (zone) + if (!is_highmem(zone)) + tmp -= zone->free_pages; + if (tmp > 0) { + tmp = shrink_all_memory(SHRINK_BITE); + if (!tmp) + return -ENOMEM; + pages += tmp; + } +#else + tmp = shrink_all_memory(SHRINK_BITE); + pages += tmp; +#endif + printk("\b%c", p[i++%4]); + } while (tmp > 0); + printk("\bdone (%lu pages freed)\n", pages); + + return 0; +} + int swsusp_suspend(void) { int error; @@ -1030,8 +1078,10 @@ static int read_suspend_image(struct pbe **pblist_ptr) /* Allocate memory for the image and read the data from swap */ if (!error) error = alloc_data_pages(pblist, GFP_ATOMIC, 1); - if (!error) + if (!error) { + release_eaten_pages(); error = load_image_data(pblist, &handle, nr_pages); + } if (!error) *pblist_ptr = pblist; } -- cgit v1.2.3 From e5e2fa7857f6bf46605c77d949fa6698b9b0bc28 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:14:20 -0800 Subject: [PATCH] swsusp: fix enough_free_mem This patch fixes a problem with the function enough_free_mem() used by swsusp to verify if there is a sufficient number of memory pages available to it to create and save the suspend image. Namely, enough_free_mem() uses nr_free_pages() to obtain the number of free memory pages, which is incorrect, because this function returns the total number of free pages, including free highmem pages, and the highmem pages cannot be used by swsusp for storing the image data. The patch makes enough_free_mem() avoid counting the free highmem pages as available to swsusp. Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index e80d282dbf58..41f66365f0d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -428,8 +428,14 @@ void swsusp_free(void) static int enough_free_mem(unsigned int nr_pages) { - pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); - return nr_free_pages() > (nr_pages + PAGES_FOR_IO + + struct zone *zone; + unsigned int n = 0; + + for_each_zone (zone) + if (!is_highmem(zone)) + n += zone->free_pages; + pr_debug("swsusp: available memory: %u pages\n", n); + return n > (nr_pages + PAGES_FOR_IO + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } -- cgit v1.2.3 From c050ca78705592d440c22055865bf4de40fe2a4c Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Fri, 6 Jan 2006 00:15:21 -0800 Subject: [PATCH] swsusp: Drop duplicate prototypes These two prototypes are already present in sched.h, remove duplicate version. Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index acdc83b3d890..e521e61e0d95 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -43,9 +43,6 @@ static struct subsys_attribute _name##_attr = { \ extern struct subsystem power_subsys; -extern int freeze_processes(void); -extern void thaw_processes(void); - extern int pm_prepare_console(void); extern void pm_restore_console(void); -- cgit v1.2.3 From b3a93a255ec33a04776ec50efb30b7a99168dda2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:15:22 -0800 Subject: [PATCH] swsusp: limit image size Limit the size of the suspend image to approx. 500 MB, which should improve the overall performance of swsusp on systems with more than 1 GB of RAM. It introduces the constant IMAGE_SIZE that can be set to the preferred size of the image (in MB) and modifies the memory-shrinking part of swsusp to take this constant into account (500 is the default value of IMAGE_SIZE). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 8 +++----- kernel/power/swsusp.c | 17 ++++++++--------- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index e521e61e0d95..9b0459903613 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -53,12 +53,10 @@ extern unsigned int nr_copy_pages; extern struct pbe *pagedir_nosave; /* - * This compilation switch determines the way in which memory will be freed - * during suspend. If defined, only as much memory will be freed as needed - * to complete the suspend, which will make it go faster. Otherwise, the - * largest possible amount of memory will be freed. + * Preferred image size in MB (set it to zero to get the smallest + * image possible) */ -#define FAST_FREE 1 +#define IMAGE_SIZE 500 extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f77f9397a364..6d5ceaf4c364 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -626,7 +626,7 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages) int swsusp_shrink_memory(void) { - long tmp; + long size, tmp; struct zone *zone; unsigned long pages = 0; unsigned int i = 0; @@ -634,11 +634,11 @@ int swsusp_shrink_memory(void) printk("Shrinking memory... "); do { -#ifdef FAST_FREE - tmp = 2 * count_highmem_pages(); - tmp += tmp / 50 + count_data_pages(); - tmp += (tmp + PBES_PER_PAGE - 1) / PBES_PER_PAGE + + size = 2 * count_highmem_pages(); + size += size / 50 + count_data_pages(); + size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + PAGES_FOR_IO; + tmp = size; for_each_zone (zone) if (!is_highmem(zone)) tmp -= zone->free_pages; @@ -647,11 +647,10 @@ int swsusp_shrink_memory(void) if (!tmp) return -ENOMEM; pages += tmp; + } else if (size > (IMAGE_SIZE * 1024 * 1024) / PAGE_SIZE) { + tmp = shrink_all_memory(SHRINK_BITE); + pages += tmp; } -#else - tmp = shrink_all_memory(SHRINK_BITE); - pages += tmp; -#endif printk("\b%c", p[i++%4]); } while (tmp > 0); printk("\bdone (%lu pages freed)\n", pages); -- cgit v1.2.3 From ca0aec0f7a94bf9f07fefa8bfd23282d4e8ceb8a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:15:56 -0800 Subject: [PATCH] swsusp: make image size limit tunable Make the suspend image size limit tunable via /sys/power/image_size. It is necessary for systems on which there is a limited amount of swap available for suspend. It can also be useful for optimizing performance of swsusp on systems with 1 GB of RAM or more. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 20 ++++++++++++++++++++ kernel/power/power.h | 7 ++----- kernel/power/swsusp.c | 10 +++++++++- 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 9e51cdf7b78d..e24446f8d8cd 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -365,9 +365,29 @@ out: power_attr(resume); +static ssize_t image_size_show(struct subsystem * subsys, char *buf) +{ + return sprintf(buf, "%u\n", image_size); +} + +static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) +{ + unsigned int size; + + if (sscanf(buf, "%u", &size) == 1) { + image_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(image_size); + static struct attribute * g[] = { &disk_attr.attr, &resume_attr.attr, + &image_size_attr.attr, NULL, }; diff --git a/kernel/power/power.h b/kernel/power/power.h index 9b0459903613..273a5b1d70be 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -52,11 +52,8 @@ extern const void __nosave_begin, __nosave_end; extern unsigned int nr_copy_pages; extern struct pbe *pagedir_nosave; -/* - * Preferred image size in MB (set it to zero to get the smallest - * image possible) - */ -#define IMAGE_SIZE 500 +/* Preferred image size in MB (default 500) */ +extern unsigned int image_size; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 6d5ceaf4c364..d760a6a719f0 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -69,6 +69,14 @@ #include "power.h" +/* + * Preferred image size in MB (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N MB, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned int image_size = 500; + #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void); int save_highmem(void); @@ -647,7 +655,7 @@ int swsusp_shrink_memory(void) if (!tmp) return -ENOMEM; pages += tmp; - } else if (size > (IMAGE_SIZE * 1024 * 1024) / PAGE_SIZE) { + } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) { tmp = shrink_all_memory(SHRINK_BITE); pages += tmp; } -- cgit v1.2.3 From 1adf6c8ea916bc4a2587a881ec7715fece63fb5e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:17:16 -0800 Subject: [PATCH] swsusp: improve handling of swap partitions This changes the handling of swap partitions by swsusp to avoid locking of the swap devices that are not used for suspend and, consequently, simplifies the code. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 128 ++++++++++++++------------------------------------ 1 file changed, 36 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index d760a6a719f0..0479c9be7d71 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -104,13 +104,7 @@ static struct swsusp_info swsusp_info; * Saving part... */ -/* We memorize in swapfile_used what swap devices are used for suspension */ -#define SWAPFILE_UNUSED 0 -#define SWAPFILE_SUSPEND 1 /* This is the suspending device */ -#define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */ - -static unsigned short swapfile_used[MAX_SWAPFILES]; -static unsigned short root_swap; +static unsigned short root_swap = 0xffff; static int mark_swapfiles(swp_entry_t prev) { @@ -146,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev) * devfs, since the resume code can only recognize the form /dev/hda4, * but the suspend code would see the long name.) */ -static int is_resume_device(const struct swap_info_struct *swap_info) +static inline int is_resume_device(const struct swap_info_struct *swap_info) { struct file *file = swap_info->swap_file; struct inode *inode = file->f_dentry->d_inode; @@ -156,55 +150,23 @@ static int is_resume_device(const struct swap_info_struct *swap_info) } static int swsusp_swap_check(void) /* This is called before saving image */ -{ - int i, len; - - len=strlen(resume_file); - root_swap = 0xFFFF; - - spin_lock(&swap_lock); - for (i=0; i (nr_pages + PAGES_FOR_IO + + pr_debug("swsusp: free swap pages: %u\n", free_swap); + return free_swap > (nr_pages + PAGES_FOR_IO + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } /** - * write_suspend_image - Write entire image and metadata. + * swsusp_write - Write entire image and metadata. + * + * It is important _NOT_ to umount filesystems at this point. We want + * them synced (in case something goes wrong) but we DO not want to mark + * filesystem clean: it is not. (And it does not matter, if we resume + * correctly, we'll mark system clean, anyway.) */ -static int write_suspend_image(struct pbe *pblist, unsigned int nr_pages) + +int swsusp_write(struct pbe *pblist, unsigned int nr_pages) { struct swap_map_page *swap_map; struct swap_map_handle handle; int error; + if ((error = swsusp_swap_check())) { + printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); + return error; + } if (!enough_swap(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free swap\n"); return -ENOSPC; @@ -601,26 +565,6 @@ Free_image_entries: goto Free_swap_map; } -/* It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) - */ -int swsusp_write(struct pbe *pblist, unsigned int nr_pages) -{ - int error; - - if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); - return error; - } - lock_swapdevices(); - error = write_suspend_image(pblist, nr_pages); - /* This will unlock ignored swap devices since writing is finished */ - lock_swapdevices(); - return error; -} - /** * swsusp_shrink_memory - Try to free as much memory as needed * -- cgit v1.2.3 From 277c6e2ad7369558dbd7ffbcc6dcbe16458bf723 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:17:58 -0800 Subject: [PATCH] swsusp: save image header first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the swsusp_info structure become the header of the image in the literal sense (ie. it is saved to the swap and read before any other image data with the help of the swsusp's swap map structure, so generally it is treated in the same way as the rest of the image). The main thing it does is to make swsusp_header contain the offset of the swap map used to track the image data pages rather than the offset of swsusp_info.  Simultaneously, swsusp_info becomes the first image page written to the swap. The other changes are generally consequences of the above with a few exceptions (there's some consolidation in the image reading part as a few functions turn into trivial wrappers around something else). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 1 - kernel/power/swsusp.c | 190 +++++++++++++++++--------------------------------- 2 files changed, 65 insertions(+), 126 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 273a5b1d70be..7e8492fd1423 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -16,7 +16,6 @@ struct swsusp_info { int cpus; unsigned long image_pages; unsigned long pages; - swp_entry_t start; } __attribute__((aligned(PAGE_SIZE))); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 0479c9be7d71..55a18d26abed 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -93,7 +93,7 @@ extern char resume_file[]; static struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; - swp_entry_t swsusp_info; + swp_entry_t image; char orig_sig[10]; char sig[10]; } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; @@ -106,7 +106,7 @@ static struct swsusp_info swsusp_info; static unsigned short root_swap = 0xffff; -static int mark_swapfiles(swp_entry_t prev) +static int mark_swapfiles(swp_entry_t start) { int error; @@ -117,7 +117,7 @@ static int mark_swapfiles(swp_entry_t prev) !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - swsusp_header.swsusp_info = prev; + swsusp_header.image = start; error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), virt_to_page((unsigned long) @@ -423,22 +423,7 @@ static void init_header(unsigned int nr_pages) swsusp_info.cpus = num_online_cpus(); swsusp_info.image_pages = nr_pages; swsusp_info.pages = nr_pages + - ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT); -} - -static int close_swap(void) -{ - swp_entry_t entry; - int error; - - dump_info(); - error = write_page((unsigned long)&swsusp_info, &entry); - if (!error) { - printk( "S" ); - error = mark_swapfiles(entry); - printk( "|\n" ); - } - return error; + ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; } /** @@ -522,6 +507,7 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages) { struct swap_map_page *swap_map; struct swap_map_handle handle; + swp_entry_t start; int error; if ((error = swsusp_swap_check())) { @@ -539,18 +525,23 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages) return -ENOMEM; init_swap_map_handle(&handle, swap_map); - error = save_image_metadata(pblist, &handle); + error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); + if (!error) + error = save_image_metadata(pblist, &handle); if (!error) error = save_image_data(pblist, &handle, nr_pages); if (error) goto Free_image_entries; swap_map = reverse_swap_map(swap_map); - error = save_swap_map(swap_map, &swsusp_info.start); + error = save_swap_map(swap_map, &start); if (error) goto Free_map_entries; - error = close_swap(); + dump_info(); + printk( "S" ); + error = mark_swapfiles(start); + printk( "|\n" ); if (error) goto Free_map_entries; @@ -840,70 +831,28 @@ static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) return error; } -/* - * Sanity check if this image makes sense with this kernel/swap context - * I really don't think that it's foolproof but more than nothing.. - */ - -static const char *sanity_check(void) +static int check_header(void) { + char *reason = NULL; + dump_info(); if (swsusp_info.version_code != LINUX_VERSION_CODE) - return "kernel version"; + reason = "kernel version"; if (swsusp_info.num_physpages != num_physpages) - return "memory size"; + reason = "memory size"; if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) - return "system type"; + reason = "system type"; if (strcmp(swsusp_info.uts.release,system_utsname.release)) - return "kernel release"; + reason = "kernel release"; if (strcmp(swsusp_info.uts.version,system_utsname.version)) - return "version"; + reason = "version"; if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) - return "machine"; -#if 0 - /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */ - if (swsusp_info.cpus != num_possible_cpus()) - return "number of cpus"; -#endif - return NULL; -} - -static int check_header(void) -{ - const char *reason = NULL; - int error; - - if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) - return error; - - /* Is this same machine? */ - if ((reason = sanity_check())) { - printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason); + reason = "machine"; + if (reason) { + printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); return -EPERM; } - return error; -} - -static int check_sig(void) -{ - int error; - - memset(&swsusp_header, 0, sizeof(swsusp_header)); - if ((error = bio_read_page(0, &swsusp_header))) - return error; - if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { - memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); - - /* - * Reset swap signature now. - */ - error = bio_write_page(0, &swsusp_header); - } else { - return -EINVAL; - } - if (!error) - pr_debug("swsusp: Signature found, resuming\n"); - return error; + return 0; } /** @@ -989,33 +938,29 @@ static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handl return error; } -static int check_suspend_image(void) -{ - int error = 0; - - if ((error = check_sig())) - return error; - - if ((error = check_header())) - return error; - - return 0; -} - -static int read_suspend_image(struct pbe **pblist_ptr) +int swsusp_read(struct pbe **pblist_ptr) { - int error = 0; + int error; struct pbe *p, *pblist; struct swap_map_handle handle; - unsigned int nr_pages = swsusp_info.image_pages; + unsigned int nr_pages; + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return PTR_ERR(resume_bdev); + } + + error = get_swap_map_reader(&handle, swsusp_header.image); + if (!error) + error = swap_map_read_page(&handle, &swsusp_info); + if (!error) + error = check_header(); + if (error) + return error; + nr_pages = swsusp_info.image_pages; p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); if (!p) return -ENOMEM; - error = get_swap_map_reader(&handle, swsusp_info.start); - if (error) - /* The PBE list at p will be released by swsusp_free() */ - return error; error = load_image_metadata(p, &handle); if (!error) { mark_unsafe_pages(p); @@ -1037,11 +982,18 @@ static int read_suspend_image(struct pbe **pblist_ptr) *pblist_ptr = pblist; } release_swap_map_reader(&handle); + + blkdev_put(resume_bdev); + + if (!error) + pr_debug("swsusp: Reading resume file was successful\n"); + else + pr_debug("swsusp: Error %d resuming\n", error); return error; } /** - * swsusp_check - Check for saved image in swap + * swsusp_check - Check for swsusp signature in the resume device */ int swsusp_check(void) @@ -1051,39 +1003,27 @@ int swsusp_check(void) resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); - error = check_suspend_image(); + memset(&swsusp_header, 0, sizeof(swsusp_header)); + if ((error = bio_read_page(0, &swsusp_header))) + return error; + if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { + memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + /* Reset swap signature now */ + error = bio_write_page(0, &swsusp_header); + } else { + return -EINVAL; + } if (error) - blkdev_put(resume_bdev); - } else + blkdev_put(resume_bdev); + else + pr_debug("swsusp: Signature found, resuming\n"); + } else { error = PTR_ERR(resume_bdev); - - if (!error) - pr_debug("swsusp: resume file found\n"); - else - pr_debug("swsusp: Error %d check for resume file\n", error); - return error; -} - -/** - * swsusp_read - Read saved image from swap. - */ - -int swsusp_read(struct pbe **pblist_ptr) -{ - int error; - - if (IS_ERR(resume_bdev)) { - pr_debug("swsusp: block device not initialised\n"); - return PTR_ERR(resume_bdev); } - error = read_suspend_image(pblist_ptr); - blkdev_put(resume_bdev); + if (error) + pr_debug("swsusp: Error %d check for resume file\n", error); - if (!error) - pr_debug("swsusp: Reading resume file was successful\n"); - else - pr_debug("swsusp: Error %d resuming\n", error); return error; } -- cgit v1.2.3 From 089545f0c71bab6511395c2a060d7f81a99bad58 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 6 Jan 2006 00:19:12 -0800 Subject: [PATCH] s390: cputime_t fixes There are some more places where the use of cputime_t instead of an integer type and the associated macros is necessary for the virtual cputime accounting on s390. Affected are the s390 specific appldata code and BSD process accounting. Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 6312d6bd43e3..38d57fa6b78f 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -427,6 +427,7 @@ static void do_acct_process(long exitcode, struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + unsigned long jiffies; /* * First check to see if there is enough free_space to continue @@ -467,12 +468,12 @@ static void do_acct_process(long exitcode, struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ( - current->signal->utime + - current->group_leader->utime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ( - current->signal->stime + - current->group_leader->stime)); + jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, + current->signal->utime)); + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); + jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, + current->signal->stime)); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; @@ -580,7 +581,8 @@ void acct_process(long exitcode) void acct_update_integrals(struct task_struct *tsk) { if (likely(tsk->mm)) { - long delta = tsk->stime - tsk->acct_stimexpd; + long delta = + cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; if (delta == 0) return; -- cgit v1.2.3 From 347a8dc3b815f0c0fa62a1df075184ffe4cbdcf1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 6 Jan 2006 00:19:28 -0800 Subject: [PATCH] s390: cleanup Kconfig Sanitize some s390 Kconfig options. We have ARCH_S390, ARCH_S390X, ARCH_S390_31, 64BIT, S390_SUPPORT and COMPAT. Replace these 6 options by S390, 64BIT and COMPAT. Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 4 ++-- kernel/sysctl.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index aabc5f86fa3f..c5c4ab255834 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...) long i; static char buf[1024]; va_list args; -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) unsigned long caller = (unsigned long) __builtin_return_address(0); #endif @@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...) printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); } #endif -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) disabled_wait(caller); #endif local_irq_enable(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 345f4a1d533f..a85047bb5739 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -108,7 +108,7 @@ extern int pwrsw_enabled; extern int unaligned_enabled; #endif -#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU extern int sysctl_ieee_emulation_warnings; #endif @@ -542,7 +542,7 @@ static ctl_table kern_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, -#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU { .ctl_name = KERN_IEEE_EMULATION_WARNINGS, @@ -644,7 +644,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) { .ctl_name = KERN_SPIN_RETRY, .procname = "spin_retry", -- cgit v1.2.3 From 6fe2e70bbed3995d930f39452fb6ce3be7dc47dc Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Fri, 6 Jan 2006 00:19:54 -0800 Subject: [PATCH] kernel/module.c: removed dead code This patch fixes an issue reported by Coverity in kernel/module.c Error reported: Cannot reach this line of code "else return ptr;" Patch description: This is the error path, so 'err' will be negative, the else case is not required, this patch removes it. Signed-off-by: Jayachandran C. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2ea929d51ad0..4b06bbad49c2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1854,8 +1854,7 @@ static struct module *load_module(void __user *umod, kfree(args); free_hdr: vfree(hdr); - if (err < 0) return ERR_PTR(err); - else return ptr; + return ERR_PTR(err); truncated: printk(KERN_ERR "Module len %lu truncated\n", len); -- cgit v1.2.3 From 0aec63e67c69545ca757a73a66f5dcf05fa484bf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 6 Jan 2006 15:36:48 -0800 Subject: [PATCH] Fix posix-cpu-timers sched_time accumulation I've spent the past 3 days digging into a glibc testsuite failure in current CVS, specifically libc/rt/tst-cputimer1.c The thr1 and thr2 timers fire too early in the second pass of this test. The second pass is noteworthy because it makes use of intervals, whereas the first pass does not. All throughout the posix-cpu-timers.c code, the calculation of the process sched_time sum is implemented roughly as: unsigned long long sum; sum = tsk->signal->sched_time; t = tsk; do { sum += t->sched_time; t = next_thread(t); } while (t != tsk); In fact this is the exact scheme used by check_process_timers(). In the case of check_process_timers(), current->sched_time has just been updated (via scheduler_tick(), which is invoked by update_process_times(), which subsequently invokes run_posix_cpu_timers()) So there is no special processing necessary wrt. that. In other contexts, we have to allot for the fact that tsk->sched_time might be a bit out of date if we are current. And the posix-cpu-timers.c code uses current_sched_time() to deal with that. Unfortunately it does so in an erroneous and inconsistent manner in one spot which is what results in the early timer firing. In cpu_clock_sample_group_locked(), it does this: cpu->sched = p->signal->sched_time; /* Add in each other live thread. */ while ((t = next_thread(t)) != p) { cpu->sched += t->sched_time; } if (p->tgid == current->tgid) { /* * We're sampling ourselves, so include the * cycles not yet banked. We still omit * other threads running on other CPUs, * so the total can always be behind as * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). */ cpu->sched += current_sched_time(current); } else { cpu->sched += p->sched_time; } The problem is the "p->tgid == current->tgid" test. If "p" is not current, and the tgids are the same, we will add the process t->sched_time twice into cpu->sched and omit "p"'s sched_time which is very very very wrong. posix-cpu-timers.c has a helper function, sched_ns(p) which takes care of this, so my fix is to use that here instead of this special tgid test. The fact that current can be one of the sub-threads of "p" points out that we could make things a little bit more accurate, perhaps by using sched_ns() on every thread we process in these loops. It also points out that we don't use the most accurate value for threads in the group actively running other cpus (and this is mentioned in the comment). But that is a future enhancement, and this fix here definitely makes sense. Signed-off-by: David S. Miller Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index cae4f5728997..4c68edff900b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -238,18 +238,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, while ((t = next_thread(t)) != p) { cpu->sched += t->sched_time; } - if (p->tgid == current->tgid) { - /* - * We're sampling ourselves, so include the - * cycles not yet banked. We still omit - * other threads running on other CPUs, - * so the total can always be behind as - * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). - */ - cpu->sched += current_sched_time(current); - } else { - cpu->sched += p->sched_time; - } + cpu->sched += sched_ns(p); break; } return 0; -- cgit v1.2.3