From b36e4758dc1b9ff1f6d97e951edba22366230d11 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 27 Aug 2006 12:26:34 +0100 Subject: [ARM] Fix kernel/fork.c for lockdep on ARM ARM has interrupts enabled over context switches (iow, has __ARCH_WANT_INTERRUPTS_ON_CTXSW defined.) The lockdep code in fork.c assumes that interrupts are always disabled. Fix this wrong assumption by making the initialisation of 'p->hardirqs_enabled' depend on __ARCH_WANT_INTERRUPTS_ON_CTXSW. Acked-by: Ingo Molnar Signed-off-by: Russell King --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f9b014e3e700..8f76adf1c6a6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1056,7 +1056,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + p->hardirqs_enabled = 1; +#else p->hardirqs_enabled = 0; +#endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; -- cgit v1.2.3 From d33b6fba2c4350651f3f61ff2ab858a2f116e9a4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 30 Jun 2006 02:31:24 -0700 Subject: Resources: insert identical resources above existing resources If you have two resources which aree exactly the same size, insert_resource() currently inserts the new one below the existing one. This is wrong because there's no way to insert a resource of the same size above an existing one. I took this opportunity to rewrite the initial loop to be a for-loop instead of a goto-loop and fix the documentation. Signed-off-by: Matthew Wilcox Cc: Ivan Kokshaysky Cc: Dominik Brodowski Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- kernel/resource.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 46286434af80..9db38a1a7520 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -344,12 +344,11 @@ EXPORT_SYMBOL(allocate_resource); * * Returns 0 on success, -EBUSY if the resource can't be inserted. * - * This function is equivalent of request_resource when no conflict + * This function is equivalent to request_resource when no conflict * happens. If a conflict happens, and the conflicting resources * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become childs of - * the new resource. Otherwise the new resource becomes the child of - * the conflicting resource + * resource is inserted and the conflicting resources become children of + * the new resource. */ int insert_resource(struct resource *parent, struct resource *new) { @@ -357,20 +356,21 @@ int insert_resource(struct resource *parent, struct resource *new) struct resource *first, *next; write_lock(&resource_lock); - begin: - result = 0; - first = __request_resource(parent, new); - if (!first) - goto out; - result = -EBUSY; - if (first == parent) - goto out; + for (;; parent = first) { + result = 0; + first = __request_resource(parent, new); + if (!first) + goto out; - /* Resource fully contained by the clashing resource? Recurse into it */ - if (first->start <= new->start && first->end >= new->end) { - parent = first; - goto begin; + result = -EBUSY; + if (first == parent) + goto out; + + if ((first->start > new->start) || (first->end < new->end)) + break; + if ((first->start == new->start) && (first->end == new->end)) + break; } for (next = first; ; next = next->sibling) { -- cgit v1.2.3 From 0ec76a110f432e98277e464b82ace8dd66571689 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Sep 2006 01:50:15 -0700 Subject: [PATCH] NOMMU: Check that access_process_vm() has a valid target Check that access_process_vm() is accessing a valid mapping in the target process. This limits ptrace() accesses and accesses through /proc//maps to only those regions actually mapped by a program. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 54 ------------------------------------------------------ 1 file changed, 54 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 9a111f70145c..8aad0331d82e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -241,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data) return 0; } -/* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages - */ - -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) -{ - struct mm_struct *mm; - struct vm_area_struct *vma; - struct page *page; - void *old_buf = buf; - - mm = get_task_mm(tsk); - if (!mm) - return 0; - - down_read(&mm->mmap_sem); - /* ignore errors, just check how much was sucessfully transfered */ - while (len) { - int bytes, ret, offset; - void *maddr; - - ret = get_user_pages(tsk, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; - - maddr = kmap(page); - if (write) { - copy_to_user_page(vma, page, addr, - maddr + offset, buf, bytes); - set_page_dirty_lock(page); - } else { - copy_from_user_page(vma, page, addr, - buf, maddr + offset, bytes); - } - kunmap(page); - page_cache_release(page); - len -= bytes; - buf += bytes; - addr += bytes; - } - up_read(&mm->mmap_sem); - mmput(mm); - - return buf - old_buf; -} - int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) { int copied = 0; -- cgit v1.2.3 From f269fdd1829acc5e53bf57b145003e5733133f2b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Sep 2006 01:50:23 -0700 Subject: [PATCH] NOMMU: move the fallback arch_vma_name() to a sensible place Move the fallback arch_vma_name() to a sensible place (kernel/signal.c). Currently it's in fs/proc/task_mmu.c, a file that is dependent on both CONFIG_PROC_FS and CONFIG_MMU being enabled, but it's used from kernel/signal.c from where it is called unconditionally. [akpm@osdl.org: build fix] Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index bfdb5686fa3e..05853a7337e3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2577,6 +2577,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) } #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ +__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + void __init signals_init(void) { sigqueue_cachep = -- cgit v1.2.3 From 8e18e2941c53416aa219708e7dcad21fb4bd6794 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Sep 2006 01:50:46 -0700 Subject: [PATCH] inode_diet: Replace inode.u.generic_ip with inode.i_private The following patches reduce the size of the VFS inode structure by 28 bytes on a UP x86. (It would be more on an x86_64 system). This is a 10% reduction in the inode size on a UP kernel that is configured in a production mode (i.e., with no spinlock or other debugging functions enabled; if you want to save memory taken up by in-core inodes, the first thing you should do is disable the debugging options; they are responsible for a huge amount of bloat in the VFS inode structure). This patch: The filesystem or device-specific pointer in the inode is inside a union, which is pretty pointless given that all 30+ users of this field have been using the void pointer. Get rid of the union and rename it to i_private, with a comment to explain who is allowed to use the void pointer. This is just a cleanup, but it allows us to reuse the union 'u' for something something where the union will actually be used. [judith@osdl.org: powerpc build fix] Signed-off-by: "Theodore Ts'o" Signed-off-by: Judith Lebzelter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 33345e73485c..85786ff2a4f9 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -669,7 +669,7 @@ EXPORT_SYMBOL_GPL(relay_flush); */ static int relay_file_open(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = inode->u.generic_ip; + struct rchan_buf *buf = inode->i_private; kref_get(&buf->kref); filp->private_data = buf; -- cgit v1.2.3 From ba52de123d454b57369f291348266d86f4b35070 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Sep 2006 01:50:49 -0700 Subject: [PATCH] inode-diet: Eliminate i_blksize from the inode structure This eliminates the i_blksize field from struct inode. Filesystems that want to provide a per-inode st_blksize can do so by providing their own getattr routine instead of using the generic_fillattr() function. Note that some filesystems were providing pretty much random (and incorrect) values for i_blksize. [bunk@stusta.de: cleanup] [akpm@osdl.org: generic_fillattr() fix] Signed-off-by: "Theodore Ts'o" Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cff41511269f..1b32c2c04c15 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -289,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode) inode->i_mode = mode; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info; -- cgit v1.2.3 From b89a81712f486e4f7a606987413e387605fdeaf4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 27 Sep 2006 01:51:04 -0700 Subject: [PATCH] sysctl: Allow /proc/sys without sys_sysctl Since sys_sysctl is deprecated start allow it to be compiled out. This should catch any remaining user space code that cares, and paves the way for further sysctl cleanups. [akpm@osdl.org: If sys_sysctl() is not compiled-in, emit a warning] Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 113 +++++++++++++++++++------------------------------------- 1 file changed, 38 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bcb3a181dbb2..8bfa7d117c54 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -137,8 +137,11 @@ extern int no_unaligned_warning; extern int max_lock_depth; #endif -static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, - ctl_table *, void **); +#ifdef CONFIG_SYSCTL_SYSCALL +static int parse_table(int __user *, int, void __user *, size_t __user *, + void __user *, size_t, ctl_table *, void **); +#endif + static int proc_doutsstring(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -165,7 +168,7 @@ int sysctl_legacy_va_layout; /* /proc declarations: */ -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); @@ -1166,12 +1169,13 @@ static void start_unregistering(struct ctl_table_header *p) void __init sysctl_init(void) { -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL register_proc_table(root_table, proc_sys_root, &root_table_header); init_irq_proc(); #endif } +#ifdef CONFIG_SYSCTL_SYSCALL int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1225,6 +1229,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) unlock_kernel(); return error; } +#endif /* CONFIG_SYSCTL_SYSCALL */ /* * ctl_perm does NOT grant the superuser all rights automatically, because @@ -1251,6 +1256,7 @@ static inline int ctl_perm(ctl_table *table, int op) return test_perm(table->mode, op); } +#ifdef CONFIG_SYSCTL_SYSCALL static int parse_table(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, @@ -1340,6 +1346,7 @@ int do_sysctl_strategy (ctl_table *table, } return 0; } +#endif /* CONFIG_SYSCTL_SYSCALL */ /** * register_sysctl_table - register a sysctl hierarchy @@ -1427,7 +1434,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, else list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); spin_unlock(&sysctl_lock); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL register_proc_table(table, proc_sys_root, tmp); #endif return tmp; @@ -1445,18 +1452,31 @@ void unregister_sysctl_table(struct ctl_table_header * header) might_sleep(); spin_lock(&sysctl_lock); start_unregistering(header); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL unregister_proc_table(header->ctl_table, proc_sys_root); #endif spin_unlock(&sysctl_lock); kfree(header); } +#else /* !CONFIG_SYSCTL */ +struct ctl_table_header * register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + return NULL; +} + +void unregister_sysctl_table(struct ctl_table_header * table) +{ +} + +#endif /* CONFIG_SYSCTL */ + /* * /proc/sys support */ -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL /* Scan the sysctl entries in table and add them all into /proc */ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) @@ -2318,6 +2338,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, #endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_SYSCTL_SYSCALL /* * General sysctl support routines */ @@ -2460,11 +2481,19 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return 1; } -#else /* CONFIG_SYSCTL */ +#else /* CONFIG_SYSCTL_SYSCALL */ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) { + static int msg_count; + + if (msg_count < 5) { + msg_count++; + printk(KERN_INFO + "warning: process `%s' used the removed sysctl " + "system call\n", current->comm); + } return -ENOSYS; } @@ -2496,73 +2525,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return -ENOSYS; } -int proc_dostring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, - struct file *filp, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -struct ctl_table_header * register_sysctl_table(ctl_table * table, - int insert_at_head) -{ - return NULL; -} - -void unregister_sysctl_table(struct ctl_table_header * table) -{ -} - -#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SYSCTL_SYSCALL */ /* * No sense putting this after each symbol definition, twice, -- cgit v1.2.3 From c18258c6f0848f97e85287f6271c511a092bb784 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 27 Sep 2006 01:51:06 -0700 Subject: [PATCH] pid: Implement transfer_pid and use it to simplify de_thread In de_thread we move pids from one process to another, a rather ugly case. The function transfer_pid makes it clear what we are doing, and makes the action atomic. This is useful we ever want to atomically traverse the process group and session lists, in a rcu safe manner. Even if the atomic properties this change should be a win as transfer_pid should be less code to execute than executing both attach_pid and detach_pid, and this should make de_thread slightly smaller as only a single function call needs to be emitted. The only downside is that the code might be slower to execute as the odds are against transfer_pid being in cache. Signed-off-by: Eric W. Biederman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 93e212f20671..6db82b68e2f8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -252,6 +252,15 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type) free_pid(pid); } +/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ +void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, + enum pid_type type) +{ + new->pids[type].pid = old->pids[type].pid; + hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); + old->pids[type].pid = NULL; +} + struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; -- cgit v1.2.3 From 65800ac77e080cf159d6c1207b6886e18f22bc08 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 27 Sep 2006 01:51:11 -0700 Subject: [PATCH] pid: remove temporary debug code in attach_pid With the patches flying between Oleg and myself somehow this temporary debug code got left in pid.c. It was never intended to make it to the stable kernel. Signed-off-by: Eric W. Biederman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 6db82b68e2f8..8387e8c68193 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -223,9 +223,6 @@ int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) struct pid_link *link; struct pid *pid; - WARN_ON(!task->pid); /* to be removed soon */ - WARN_ON(!nr); /* to be removed soon */ - link = &task->pids[type]; link->pid = pid = find_pid(nr); hlist_add_head_rcu(&link->node, &pid->tasks[type]); -- cgit v1.2.3 From 910067d188d56d80801b71b0ca1f73aa400c7b8c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 29 Sep 2006 01:58:36 -0700 Subject: [PATCH] remove generic__raw_read_trylock() If the cpu has the lock held for write, is interrupted, and the interrupt handler calls read_trylock(), it's an instant deadlock. Now, Dave Miller has subsequently pointed out that we don't have any situations where this can occur. Nevertheless, we should delete generic__raw_read_lock (and its associated EXPORT to make Arjan happy) so that nobody thinks they can use it. Acked-by: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/spinlock.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 9644a41e0bef..d48143eafbfd 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -21,17 +21,6 @@ #include #include -/* - * Generic declaration of the raw read_trylock() function, - * architectures are supposed to optimize this: - */ -int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock) -{ - __raw_read_lock(lock); - return 1; -} -EXPORT_SYMBOL(generic__raw_read_trylock); - int __lockfunc _spin_trylock(spinlock_t *lock) { preempt_disable(); -- cgit v1.2.3 From d8c7649e99e4b081b624aefe1e77caa30b53cb18 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Sep 2006 01:58:55 -0700 Subject: [PATCH] kernel/params: driver layer error checking Check driver layer return values in kernel/params.c Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 91aea7aa532e..63d432d0ebc0 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -547,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name, unsigned int name_skip) { struct module_kobject *mk; + int ret; mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); @@ -554,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name, mk->mod = THIS_MODULE; kobj_set_kset_s(mk, module_subsys); kobject_set_name(&mk->kobj, name); - kobject_register(&mk->kobj); + ret = kobject_register(&mk->kobj); + BUG_ON(ret < 0); /* no need to keep the kobject if no parameter is exported */ if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { @@ -684,7 +686,14 @@ decl_subsys(module, &module_ktype, NULL); */ static int __init param_sysfs_init(void) { - subsystem_register(&module_subsys); + int ret; + + ret = subsystem_register(&module_subsys); + if (ret < 0) { + printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", + __FILE__, __LINE__, ret); + return ret; + } param_sysfs_builtin(); -- cgit v1.2.3 From 4c78a6639386f9e7399fbc0d0a173d4cc1a3e9bf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Sep 2006 01:59:10 -0700 Subject: [PATCH] kernel-doc for relay interface Add relay interface support to DocBook/kernel-api.tmpl. Fix typos etc. in relay.c and relayfs.txt. Signed-off-by: Randy Dunlap Acked-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 85786ff2a4f9..1d63ecddfa70 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) * @buf: the buffer struct * @size: total size of the buffer * - * Returns a pointer to the resulting buffer, NULL if unsuccessful. The + * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The * passed in size will get page aligned, if it isn't already. */ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) @@ -132,10 +132,9 @@ depopulate: /** * relay_create_buf - allocate and initialize a channel buffer - * @alloc_size: size of the buffer to allocate - * @n_subbufs: number of sub-buffers in the channel + * @chan: the relay channel * - * Returns channel buffer if successful, NULL otherwise + * Returns channel buffer if successful, %NULL otherwise. */ struct rchan_buf *relay_create_buf(struct rchan *chan) { @@ -163,6 +162,7 @@ free_buf: /** * relay_destroy_channel - free the channel struct + * @kref: target kernel reference that contains the relay channel * * Should only be called from kref_put(). */ @@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf) /** * relay_remove_buf - remove a channel buffer + * @kref: target kernel reference that contains the relay buffer * * Removes the file from the fileystem, which also frees the * rchan_buf_struct and the channel buffer. Should only be called from @@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan) } EXPORT_SYMBOL_GPL(relay_reset); -/** +/* * relay_open_buf - create a new relay channel buffer * * Internal - used by relay_open(). @@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan, /** * relay_open - create a new relay channel * @base_filename: base name of files to create - * @parent: dentry of parent directory, NULL for root directory + * @parent: dentry of parent directory, %NULL for root directory * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers * @cb: client callback functions * - * Returns channel pointer if successful, NULL otherwise. + * Returns channel pointer if successful, %NULL otherwise. * * Creates a channel buffer for each cpu using the sizes and * attributes specified. The created channel buffer files @@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf); * subbufs_consumed should be the number of sub-buffers newly consumed, * not the total consumed. * - * NOTE: kernel clients don't need to call this function if the channel + * NOTE: Kernel clients don't need to call this function if the channel * mode is 'overwrite'. */ void relay_subbufs_consumed(struct rchan *chan, @@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close); * relay_flush - close the channel * @chan: the channel * - * Flushes all channel buffers i.e. forces buffer switch. + * Flushes all channel buffers, i.e. forces buffer switch. */ void relay_flush(struct rchan *chan) { @@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp) return 0; } -/** +/* * relay_file_read_consume - update the consumed count for the buffer */ static void relay_file_read_consume(struct rchan_buf *buf, @@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf, } } -/** +/* * relay_file_read_avail - boolean, are there unconsumed bytes available? */ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) @@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) /** * relay_file_read_subbuf_avail - return bytes available in sub-buffer + * @read_pos: file read position + * @buf: relay channel buffer */ static size_t relay_file_read_subbuf_avail(size_t read_pos, struct rchan_buf *buf) @@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read + * @read_pos: file read position + * @buf: relay channel buffer * * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise @@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos, /** * relay_file_read_end_pos - return the new read position + * @read_pos: file read position + * @buf: relay channel buffer + * @count: number of bytes to be read */ static size_t relay_file_read_end_pos(struct rchan_buf *buf, size_t read_pos, @@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, return end_pos; } -/** +/* * subbuf_read_actor - read up to one subbuf's worth of data */ static int subbuf_read_actor(size_t read_start, @@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start, return ret; } -/** +/* * subbuf_send_actor - send up to one subbuf's worth of data */ static int subbuf_send_actor(size_t read_start, @@ -933,7 +941,7 @@ typedef int (*subbuf_actor_t) (size_t read_start, read_descriptor_t *desc, read_actor_t actor); -/** +/* * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries */ static inline ssize_t relay_file_read_subbufs(struct file *filp, -- cgit v1.2.3 From 2ff6fd8f4a40b72ff35dbb1e08eb9ed6b64b6028 Mon Sep 17 00:00:00 2001 From: Yoichi Yuasa Date: Fri, 29 Sep 2006 01:59:21 -0700 Subject: [PATCH] irq: fixed coding style Signed-off-by: Yoichi Yuasa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ac1f850d4937..3c3fec55f6d1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -146,7 +146,7 @@ static void default_disable(unsigned int irq) struct irq_desc *desc = irq_desc + irq; if (!(desc->status & IRQ_DELAYED_DISABLE)) - irq_desc[irq].chip->mask(irq); + desc->chip->mask(irq); } /* -- cgit v1.2.3 From 538d9d532b0e0320c9dd326a560b5a72d73f910d Mon Sep 17 00:00:00 2001 From: Yoichi Yuasa Date: Fri, 29 Sep 2006 01:59:22 -0700 Subject: [PATCH] irq: remove a extra line Signed-off-by: Yoichi Yuasa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3c3fec55f6d1..736cb0bd498f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -40,10 +40,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; - /* - * For compatibility only: - */ - desc->chip = chip; spin_unlock_irqrestore(&desc->lock, flags); return 0; -- cgit v1.2.3 From a49a4af759c0193d42aeaeefb4df7de8973dd713 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Fri, 29 Sep 2006 01:59:30 -0700 Subject: [PATCH] rcu: add lock annotations to rcu{,_bh}_torture_read_{lock,unlock} rcu_torture_read_lock and rcu_bh_torture_read_lock acquire locks without releasing them, and the matching functions rcu_torture_read_unlock and rcu_bh_torture_read_unlock get called with the corresponding locks held and release them. Add lock annotations to these four functions so that sparse can check callers for lock pairing, and so that sparse will not complain about these functions since they intentionally use locks in this manner. Signed-off-by: Josh Triplett Acked-by: Paul McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 4d1c3d247127..4f2c4272d59c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -192,13 +192,13 @@ static struct rcu_torture_ops *cur_ops = NULL; * Definitions for rcu torture testing. */ -static int rcu_torture_read_lock(void) +static int rcu_torture_read_lock(void) __acquires(RCU) { rcu_read_lock(); return 0; } -static void rcu_torture_read_unlock(int idx) +static void rcu_torture_read_unlock(int idx) __releases(RCU) { rcu_read_unlock(); } @@ -250,13 +250,13 @@ static struct rcu_torture_ops rcu_ops = { * Definitions for rcu_bh torture testing. */ -static int rcu_bh_torture_read_lock(void) +static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) { rcu_read_lock_bh(); return 0; } -static void rcu_bh_torture_read_unlock(int idx) +static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) { rcu_read_unlock_bh(); } -- cgit v1.2.3 From d10be6d1bdb0c901b78244872de3cc1c1b6c3fb2 Mon Sep 17 00:00:00 2001 From: Mark Huang Date: Fri, 29 Sep 2006 01:59:34 -0700 Subject: [PATCH] module_subsys: initialize earlier Initialize module_subsys earlier (or at least earlier than devices) since it could be used very early in the boot process if kmod loads a module before the device initcalls. Otherwise, kmod will crash in kernel/module.c:mod_sysfs_setup() since the kset in module_subsys is not initialized yet. I only noticed this problem because occasionally, kmod loads the modules for my SCSI and Ethernet adapters very early, during the boot process itself. I don't quite understand why it loads them sometimes and doesn't load them other times. Or who is telling kmod to do so. Can someone explain? Signed-off-by: Mark Huang Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 63d432d0ebc0..f406655d6653 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -699,7 +699,7 @@ static int __init param_sysfs_init(void) return 0; } -__initcall(param_sysfs_init); +subsys_initcall(param_sysfs_init); EXPORT_SYMBOL(param_set_byte); EXPORT_SYMBOL(param_get_byte); -- cgit v1.2.3 From 89e7e374dde1015d69d2d70797ae4053b14fa9db Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Fri, 29 Sep 2006 01:59:36 -0700 Subject: [PATCH] timer: add lock annotation to lock_timer_base lock_timer_base acquires a lock and returns with that lock held. Add a lock annotation to this function so that sparse can check callers for lock pairing, and so that sparse will not complain about this function since it intentionally uses the lock in this manner. Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 1d7dd6267c2d..6c9fa80088ed 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -175,6 +175,7 @@ static inline void detach_timer(struct timer_list *timer, */ static tvec_base_t *lock_timer_base(struct timer_list *timer, unsigned long *flags) + __acquires(timer->base->lock) { tvec_base_t *base; -- cgit v1.2.3 From 6c5c934153513dc72e2d6464f39e8ef1f27c0a3e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 29 Sep 2006 01:59:40 -0700 Subject: [PATCH] ifdef blktrace debugging fields Signed-off-by: Alexey Dobriyan Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 802b1cf0e63f..bca6ce6d3ded 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -183,7 +183,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); atomic_set(&tsk->fs_excl, 0); +#ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; +#endif tsk->splice_pipe = NULL; return tsk; } -- cgit v1.2.3 From db630637b2f192bea2ba1c000e9cbe4e542a03ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Sep 2006 01:59:44 -0700 Subject: [PATCH] clean up and remove some extra spinlocks from rtmutex Oleg brought up some interesting points about grabbing the pi_lock for some protections. In this discussion, I realized that there are some places that the pi_lock is being grabbed when it really wasn't necessary. Also this patch does a little bit of clean up. This patch basically does three things: 1) renames the "boost" variable to "chain_walk". Since it is used in the debugging case when it isn't going to be boosted. It better describes what the test is going to do if it succeeds. 2) moves get_task_struct to just before the unlocking of the wait_lock. This removes duplicate code, and makes it a little easier to read. The owner wont go away while either the pi_lock or the wait_lock are held. 3) removes the pi_locking and owner blocked checking completely from the debugging case. This is because the grabbing the lock and doing the check, then releasing the lock is just so full of races. It's just as good to go ahead and call the pi_chain_walk function, since after releasing the lock the owner can then block anyway, and we would have missed that. For the debug case, we really do want to do the chain walk to test for deadlocks anyway. [oleg@tv-sign.ru: more of the same] Signed-of-by: Steven Rostedt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Oleg Nesterov Cc: Esben Nielsen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rtmutex.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 3e13a1e5856f..4ab17da46fd8 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -251,6 +251,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Grab the next task */ task = rt_mutex_owner(lock); + get_task_struct(task); spin_lock_irqsave(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { @@ -269,7 +270,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } - get_task_struct(task); spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); @@ -409,7 +409,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; unsigned long flags; - int boost = 0, res; + int chain_walk = 0, res; spin_lock_irqsave(¤t->pi_lock, flags); __rt_mutex_adjust_prio(current); @@ -433,25 +433,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) { - boost = 1; - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - } - spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { - spin_lock_irqsave(&owner->pi_lock, flags); - if (owner->pi_blocked_on) { - boost = 1; - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - } + if (owner->pi_blocked_on) + chain_walk = 1; spin_unlock_irqrestore(&owner->pi_lock, flags); } - if (!boost) + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + chain_walk = 1; + + if (!chain_walk) return 0; + /* + * The owner can't disappear while holding a lock, + * so the owner struct is protected by wait_lock. + * Gets dropped in rt_mutex_adjust_prio_chain()! + */ + get_task_struct(owner); + spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, @@ -532,7 +530,7 @@ static void remove_waiter(struct rt_mutex *lock, int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); unsigned long flags; - int boost = 0; + int chain_walk = 0; spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); @@ -554,19 +552,20 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) { - boost = 1; - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - } + if (owner->pi_blocked_on) + chain_walk = 1; + spin_unlock_irqrestore(&owner->pi_lock, flags); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - if (!boost) + if (!chain_walk) return; + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); + spin_unlock(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); @@ -592,10 +591,10 @@ void rt_mutex_adjust_pi(struct task_struct *task) return; } - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(task); spin_unlock_irqrestore(&task->pi_lock, flags); + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } -- cgit v1.2.3 From 2aae4a108dab8b8bc92270335f6e4b5c146b32ae Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 29 Sep 2006 01:59:46 -0700 Subject: [PATCH] Fix kerneldoc comments in kernel/timer.c Some of the kerneldoc comments in this file are ignored since the lead-in is malformed, using either "/*" or "/***" instead of "/**". [rdunlap@xenotime.net: kerneldoc fixes] Signed-off-by: Rolf Eike Beer Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 6c9fa80088ed..d644f4e9ca0c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -136,7 +136,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } -/*** +/** * init_timer - initialize a timer. * @timer: the timer to be initialized * @@ -236,7 +236,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) EXPORT_SYMBOL(__mod_timer); -/*** +/** * add_timer_on - start a timer on a particular CPU * @timer: the timer to be added * @cpu: the CPU to start it on @@ -256,9 +256,10 @@ void add_timer_on(struct timer_list *timer, int cpu) } -/*** +/** * mod_timer - modify a timer's timeout * @timer: the timer to be modified + * @expires: new timeout in jiffies * * mod_timer is a more efficient way to update the expire field of an * active timer (if the timer is inactive it will be activated) @@ -292,7 +293,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) EXPORT_SYMBOL(mod_timer); -/*** +/** * del_timer - deactive a timer. * @timer: the timer to be deactivated * @@ -324,7 +325,10 @@ int del_timer(struct timer_list *timer) EXPORT_SYMBOL(del_timer); #ifdef CONFIG_SMP -/* +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: timer do del + * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. * @@ -352,7 +356,7 @@ out: return ret; } -/*** +/** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated * @@ -402,15 +406,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) return index; } -/*** +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) + +/** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. * * This function cascades all vectors and executes all expired timer * vectors. */ -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) - static inline void __run_timers(tvec_base_t *base) { struct timer_list *timer; @@ -971,7 +975,7 @@ void __init timekeeping_init(void) static int timekeeping_suspended; -/* +/** * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused * @@ -1107,7 +1111,7 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); } -/* +/** * update_wall_time - Uses the current clocksource to increment the wall time * * Called from the timer interrupt, must hold a write on xtime_lock. @@ -1471,8 +1475,9 @@ asmlinkage long sys_gettid(void) return current->pid; } -/* +/** * sys_sysinfo - fill in sysinfo struct + * @info: pointer to buffer to fill */ asmlinkage long sys_sysinfo(struct sysinfo __user *info) { -- cgit v1.2.3 From e6cab99bb478e067b1a7a120333ff326954a2412 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Fri, 29 Sep 2006 01:59:57 -0700 Subject: [PATCH] unwind: fix unused variable warning when !CONFIG_MODULES Fix "variable defined but not used" compiler warning in unwind.c when CONFIG_MODULES is not set. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/unwind.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index 3430475fcd88..2e2368607aab 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -102,7 +102,7 @@ static struct unwind_table { unsigned long size; struct unwind_table *link; const char *name; -} root_table, *last_table; +} root_table; struct unwind_item { enum item_location { @@ -174,6 +174,8 @@ void __init unwind_init(void) #ifdef CONFIG_MODULES +static struct unwind_table *last_table; + /* Must be called with module_mutex held. */ void *unwind_add_table(struct module *module, const void *table_start, -- cgit v1.2.3 From 3b9b8ab65d8eed784b9164d03807cb2bda7b5cd6 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Fri, 29 Sep 2006 02:00:05 -0700 Subject: [PATCH] Fix unserialized task->files changing Fixed race on put_files_struct on exec with proc. Restoring files on current on error path may lead to proc having a pointer to already kfree-d files_struct. ->files changing at exit.c and khtread.c are safe as exit_files() makes all things under lock. Found during OpenVZ stress testing. [akpm@osdl.org: add export] Signed-off-by: Pavel Emelianov Signed-off-by: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d891883420f7..4b6fb054b25d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -487,6 +487,18 @@ void fastcall put_files_struct(struct files_struct *files) EXPORT_SYMBOL(put_files_struct); +void reset_files_struct(struct task_struct *tsk, struct files_struct *files) +{ + struct files_struct *old; + + old = tsk->files; + task_lock(tsk); + tsk->files = files; + task_unlock(tsk); + put_files_struct(old); +} +EXPORT_SYMBOL(reset_files_struct); + static inline void __exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; -- cgit v1.2.3 From f400e198b2ed26ce55b22a1412ded0896e7516ac Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 29 Sep 2006 02:00:07 -0700 Subject: [PATCH] pidspace: is_init() This is an updated version of Eric Biederman's is_init() patch. (http://lkml.org/lkml/2006/2/6/280). It applies cleanly to 2.6.18-rc3 and replaces a few more instances of ->pid == 1 with is_init(). Further, is_init() checks pid and thus removes dependency on Eric's other patches for now. Eric's original description: There are a lot of places in the kernel where we test for init because we give it special properties. Most significantly init must not die. This results in code all over the kernel test ->pid == 1. Introduce is_init to capture this case. With multiple pid spaces for all of the cases affected we are looking for only the first process on the system, not some other process that has pid == 1. Signed-off-by: Eric W. Biederman Signed-off-by: Sukadev Bhattiprolu Cc: Dave Hansen Cc: Serge Hallyn Cc: Cedric Le Goater Cc: Acked-by: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 2 +- kernel/cpuset.c | 2 +- kernel/exit.c | 2 +- kernel/kexec.c | 2 +- kernel/ptrace.c | 1 + kernel/sysctl.c | 2 +- 6 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index c7685ad00a97..edb845a6e84a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -133,7 +133,7 @@ static inline int cap_set_all(kernel_cap_t *effective, int found = 0; do_each_thread(g, target) { - if (target == current || target->pid == 1) + if (target == current || is_init(target)) continue; found = 1; if (security_capset_check(target, effective, inheritable, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1b32c2c04c15..584bb4e6c042 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -240,7 +240,7 @@ static struct super_block *cpuset_sb; * A cpuset can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cpusets is empty. Since all * tasks in the system use _some_ cpuset, and since there is always at - * least one task in the system (init, pid == 1), therefore, top_cpuset + * least one task in the system (init), therefore, top_cpuset * always has either children cpusets and/or using tasks. So we don't * need a special hack to ensure that top_cpuset cannot be deleted. * diff --git a/kernel/exit.c b/kernel/exit.c index 4b6fb054b25d..9961192d6055 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -219,7 +219,7 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task || p->exit_state - || p->real_parent->pid == 1) + || is_init(p->real_parent)) continue; if (process_group(p->real_parent) != pgrp && p->real_parent->signal->session == p->signal->session) { diff --git a/kernel/kexec.c b/kernel/kexec.c index 50087ecf337e..37cad75cf494 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -40,7 +40,7 @@ struct resource crashk_res = { int kexec_should_crash(struct task_struct *p) { - if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) + if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops) return 1; return 0; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8aad0331d82e..4d50e06fd745 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -440,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) child = find_task_by_pid(pid); if (child) get_task_struct(child); + read_unlock(&tasklist_lock); if (!child) return ERR_PTR(-ESRCH); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8bfa7d117c54..9535a3839930 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1915,7 +1915,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, return -EPERM; } - op = (current->pid == 1) ? OP_SET : OP_AND; + op = is_init(current) ? OP_SET : OP_AND; return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, do_proc_dointvec_bset_conv,&op); } -- cgit v1.2.3 From 99de055ac065e19ed69de961e97c6336a261b34e Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 29 Sep 2006 02:00:10 -0700 Subject: [PATCH] lockdep: print kernel version Lets do the same thing we do for oopses - print out the version in the report. It's an extra line of output though. We could tack it on the end of the INFO: lines, but that screws up Ingo's pretty output. Signed-off-by: Dave Jones Cc: Ingo Molnar Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c088e5542e84..df1c3594de31 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -515,6 +516,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) return 0; } +static void print_kernel_version(void) +{ + printk("%s %.*s\n", system_utsname.release, + (int)strcspn(system_utsname.version, " "), + system_utsname.version); +} + /* * When a circular dependency is detected, print the * header first: @@ -531,6 +539,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) printk("\n=======================================================\n"); printk( "[ INFO: possible circular locking dependency detected ]\n"); + print_kernel_version(); printk( "-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, curr->pid); @@ -712,6 +721,7 @@ print_bad_irq_dependency(struct task_struct *curr, printk("\n======================================================\n"); printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", irqclass, irqclass); + print_kernel_version(); printk( "------------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, curr->pid, @@ -793,6 +803,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, printk("\n=============================================\n"); printk( "[ INFO: possible recursive locking detected ]\n"); + print_kernel_version(); printk( "---------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, curr->pid); @@ -1375,6 +1386,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, printk("\n=========================================================\n"); printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); + print_kernel_version(); printk( "---------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, curr->pid); @@ -1469,6 +1481,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, printk("\n=================================\n"); printk( "[ INFO: inconsistent lock state ]\n"); + print_kernel_version(); printk( "---------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", -- cgit v1.2.3 From a45bce49545739a940f8bd4ca85c3b7435564893 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 29 Sep 2006 02:00:11 -0700 Subject: [PATCH] memory ordering in __kfifo primitives Both __kfifo_put() and __kfifo_get() have header comments stating that if there is but one concurrent reader and one concurrent writer, locking is not necessary. This is almost the case, but a couple of memory barriers are needed. Another option would be to change the header comments to remove the bit about locking not being needed, and to change the those callers who currently don't use locking to add the required locking. The attachment analyzes this approach, but the patch below seems simpler. Signed-off-by: Paul E. McKenney Cc: Stelian Pop Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 64ab045c3d9d..5d1d907378a2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo, len = min(len, fifo->size - fifo->in + fifo->out); + /* + * Ensure that we sample the fifo->out index -before- we + * start putting bytes into the kfifo. + */ + + smp_mb(); + /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); @@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo, /* then put the rest (if any) at the beginning of the buffer */ memcpy(fifo->buffer, buffer + l, len - l); + /* + * Ensure that we add the bytes to the kfifo -before- + * we update the fifo->in index. + */ + + smp_wmb(); + fifo->in += len; return len; @@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo, len = min(len, fifo->in - fifo->out); + /* + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. + */ + + smp_rmb(); + /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); @@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo, /* then get the rest (if any) from the beginning of the buffer */ memcpy(buffer + l, fifo->buffer, len - l); + /* + * Ensure that we remove the bytes from the kfifo -before- + * we update the fifo->out index. + */ + + smp_mb(); + fifo->out += len; return len; -- cgit v1.2.3 From 07dccf3344010f9b9df7fe725da7e73bca2992df Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 29 Sep 2006 02:00:22 -0700 Subject: [PATCH] check return value of cpu_callback Spawing ksoftirqd, migration, or watchdog, and calling init_timers_cpu() may fail with small memory. If it happens in initcalls, kernel NULL pointer dereference happens later. This patch makes crash happen immediately in such cases. It seems a bit better than getting kernel NULL pointer dereference later. Cc: Ingo Molnar Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 4 +++- kernel/softirq.c | 4 +++- kernel/softlockup.c | 3 ++- kernel/timer.c | 4 +++- 4 files changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5c848fd4e461..b946209d9c15 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5272,9 +5272,11 @@ static struct notifier_block __cpuinitdata migration_notifier = { int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); + int err; /* Start one for the boot CPU: */ - migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); diff --git a/kernel/softirq.c b/kernel/softirq.c index 3789ca98197c..bf25015dce16 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -612,7 +612,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = { __init int spawn_ksoftirqd(void) { void *cpu = (void *)(long)smp_processor_id(); - cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + + BUG_ON(err == NOTIFY_BAD); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 03e6a2b0b787..50afeb813305 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -149,8 +149,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = { __init void spawn_softlockup_task(void) { void *cpu = (void *)(long)smp_processor_id(); + int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); diff --git a/kernel/timer.c b/kernel/timer.c index d644f4e9ca0c..a2cb1ecb1b28 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1694,8 +1694,10 @@ static struct notifier_block __cpuinitdata timers_nb = { void __init init_timers(void) { - timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + + BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); } -- cgit v1.2.3 From 1711ef3866b0360e102327389fe4b76c849bbe83 Mon Sep 17 00:00:00 2001 From: Toyo Abe Date: Fri, 29 Sep 2006 02:00:28 -0700 Subject: [PATCH] posix-timers: Fix clock_nanosleep() doesn't return the remaining time in compatibility mode The clock_nanosleep() function does not return the time remaining when the sleep is interrupted by a signal. This patch creates a new call out, compat_clock_nanosleep_restart(), which handles returning the remaining time after a sleep is interrupted. This patch revives clock_nanosleep_restart(). It is now accessed via the new call out. The compat_clock_nanosleep_restart() is used for compatibility access. Since this is implemented in compatibility mode the normal path is virtually unaffected - no real performance impact. Signed-off-by: Toyo Abe Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 33 +++++++++++++++++++++++++++++++++ kernel/hrtimer.c | 20 ++++++++++---------- kernel/posix-cpu-timers.c | 17 ++++++++++++----- kernel/posix-timers.c | 21 +++++++++++++++++++++ 4 files changed, 76 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 126dee9530aa..75573e5d27b0 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock, return err; } +static long compat_clock_nanosleep_restart(struct restart_block *restart) +{ + long err; + mm_segment_t oldfs; + struct timespec tu; + struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); + + restart->arg1 = (unsigned long) &tu; + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = clock_nanosleep_restart(restart); + set_fs(oldfs); + + if ((err == -ERESTART_RESTARTBLOCK) && rmtp && + put_compat_timespec(&tu, rmtp)) + return -EFAULT; + + if (err == -ERESTART_RESTARTBLOCK) { + restart->fn = compat_clock_nanosleep_restart; + restart->arg1 = (unsigned long) rmtp; + } + return err; +} + long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, struct compat_timespec __user *rqtp, struct compat_timespec __user *rmtp) @@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, long err; mm_segment_t oldfs; struct timespec in, out; + struct restart_block *restart; if (get_compat_timespec(&in, rqtp)) return -EFAULT; @@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, (struct timespec __user *) &in, (struct timespec __user *) &out); set_fs(oldfs); + if ((err == -ERESTART_RESTARTBLOCK) && rmtp && put_compat_timespec(&out, rmtp)) return -EFAULT; + + if (err == -ERESTART_RESTARTBLOCK) { + restart = ¤t_thread_info()->restart_block; + restart->fn = compat_clock_nanosleep_restart; + restart->arg1 = (unsigned long) rmtp; + } return err; } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 21c38a7e666b..d0ba190dfeb6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -693,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod return t->task == NULL; } -static long __sched nanosleep_restart(struct restart_block *restart) +long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; struct timespec __user *rmtp; @@ -702,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart) restart->fn = do_no_restart_syscall; - hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS); - t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; + hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); + t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; if (do_nanosleep(&t, HRTIMER_ABS)) return 0; - rmtp = (struct timespec __user *) restart->arg2; + rmtp = (struct timespec __user *) restart->arg1; if (rmtp) { time = ktime_sub(t.timer.expires, t.timer.base->get_time()); if (time.tv64 <= 0) @@ -718,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart) return -EFAULT; } - restart->fn = nanosleep_restart; + restart->fn = hrtimer_nanosleep_restart; /* The other values in restart are already filled in */ return -ERESTART_RESTARTBLOCK; @@ -751,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, } restart = ¤t_thread_info()->restart_block; - restart->fn = nanosleep_restart; - restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF; - restart->arg1 = t.timer.expires.tv64 >> 32; - restart->arg2 = (unsigned long) rmtp; - restart->arg3 = (unsigned long) t.timer.base->index; + restart->fn = hrtimer_nanosleep_restart; + restart->arg0 = (unsigned long) t.timer.base->index; + restart->arg1 = (unsigned long) rmtp; + restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; + restart->arg3 = t.timer.expires.tv64 >> 32; return -ERESTART_RESTARTBLOCK; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index d38d9ec3276c..5667fef89dd1 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1393,8 +1393,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } } -static long posix_cpu_clock_nanosleep_restart(struct restart_block *); - int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp) { @@ -1471,7 +1469,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; - restart_block->fn = posix_cpu_clock_nanosleep_restart; + restart_block->fn = posix_cpu_nsleep_restart; /* Caller already set restart_block->arg1 */ restart_block->arg0 = which_clock; restart_block->arg1 = (unsigned long) rmtp; @@ -1484,8 +1482,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, return error; } -static long -posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) +long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->arg0; struct timespec __user *rmtp; @@ -1524,6 +1521,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags, { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); } +static long process_cpu_nsleep_restart(struct restart_block *restart_block) +{ + return -EINVAL; +} static int thread_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { @@ -1544,6 +1545,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags, { return -EINVAL; } +static long thread_cpu_nsleep_restart(struct restart_block *restart_block) +{ + return -EINVAL; +} static __init int init_posix_cpu_timers(void) { @@ -1553,6 +1558,7 @@ static __init int init_posix_cpu_timers(void) .clock_set = do_posix_clock_nosettime, .timer_create = process_cpu_timer_create, .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, }; struct k_clock thread = { .clock_getres = thread_cpu_clock_getres, @@ -1560,6 +1566,7 @@ static __init int init_posix_cpu_timers(void) .clock_set = do_posix_clock_nosettime, .timer_create = thread_cpu_timer_create, .nsleep = thread_cpu_nsleep, + .nsleep_restart = thread_cpu_nsleep_restart, }; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ac6dc8744429..e5ebcc1ec3a0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags, return CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t, rmtp)); } + +/* + * nanosleep_restart for monotonic and realtime clocks + */ +static int common_nsleep_restart(struct restart_block *restart_block) +{ + return hrtimer_nanosleep_restart(restart_block); +} + +/* + * This will restart clock_nanosleep. This is required only by + * compat_clock_nanosleep_restart for now. + */ +long +clock_nanosleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->arg0; + + return CLOCK_DISPATCH(which_clock, nsleep_restart, + (restart_block)); +} -- cgit v1.2.3 From e4b765551aa6355eae60b644bed851a9477c4e2b Mon Sep 17 00:00:00 2001 From: Toyo Abe Date: Fri, 29 Sep 2006 02:00:29 -0700 Subject: [PATCH] posix-timers: Fix the flags handling in posix_cpu_nsleep() When a posix_cpu_nsleep() sleep is interrupted by a signal more than twice, it incorrectly reports the sleep time remaining to the user. Because posix_cpu_nsleep() doesn't report back to the user when it's called from restart function due to the wrong flags handling. This patch, which applies after previous one, moves the nanosleep() function from posix_cpu_nsleep() to do_cpu_nanosleep() and cleans up the flags handling appropriately. Signed-off-by: Toyo Abe Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 84 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 5667fef89dd1..479b16b44f79 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1393,22 +1393,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } } -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) +static int do_cpu_nanosleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct itimerspec *it) { - struct restart_block *restart_block = - ¤t_thread_info()->restart_block; struct k_itimer timer; int error; - /* - * Diagnose required errors first. - */ - if (CPUCLOCK_PERTHREAD(which_clock) && - (CPUCLOCK_PID(which_clock) == 0 || - CPUCLOCK_PID(which_clock) == current->pid)) - return -EINVAL; - /* * Set up a temporary timer and then wait for it to go off. */ @@ -1420,11 +1410,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, timer.it_process = current; if (!error) { static struct itimerspec zero_it; - struct itimerspec it = { .it_value = *rqtp, - .it_interval = {} }; + + memset(it, 0, sizeof *it); + it->it_value = *rqtp; spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_set(&timer, flags, &it, NULL); + error = posix_cpu_timer_set(&timer, flags, it, NULL); if (error) { spin_unlock_irq(&timer.it_lock); return error; @@ -1452,33 +1443,56 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, * We were interrupted by a signal. */ sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); - posix_cpu_timer_set(&timer, 0, &zero_it, &it); + posix_cpu_timer_set(&timer, 0, &zero_it, it); spin_unlock_irq(&timer.it_lock); - if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { + if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { /* * It actually did fire already. */ return 0; } + error = -ERESTART_RESTARTBLOCK; + } + + return error; +} + +int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) +{ + struct restart_block *restart_block = + ¤t_thread_info()->restart_block; + struct itimerspec it; + int error; + + /* + * Diagnose required errors first. + */ + if (CPUCLOCK_PERTHREAD(which_clock) && + (CPUCLOCK_PID(which_clock) == 0 || + CPUCLOCK_PID(which_clock) == current->pid)) + return -EINVAL; + + error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + + if (flags & TIMER_ABSTIME) + return -ERESTARTNOHAND; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && !(flags & TIMER_ABSTIME) && - copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; - /* Caller already set restart_block->arg1 */ restart_block->arg0 = which_clock; restart_block->arg1 = (unsigned long) rmtp; restart_block->arg2 = rqtp->tv_sec; restart_block->arg3 = rqtp->tv_nsec; - - error = -ERESTART_RESTARTBLOCK; } - return error; } @@ -1487,13 +1501,31 @@ long posix_cpu_nsleep_restart(struct restart_block *restart_block) clockid_t which_clock = restart_block->arg0; struct timespec __user *rmtp; struct timespec t; + struct itimerspec it; + int error; rmtp = (struct timespec __user *) restart_block->arg1; t.tv_sec = restart_block->arg2; t.tv_nsec = restart_block->arg3; restart_block->fn = do_no_restart_syscall; - return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); + error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + /* + * Report back to the user the time still remaining. + */ + if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->fn = posix_cpu_nsleep_restart; + restart_block->arg0 = which_clock; + restart_block->arg1 = (unsigned long) rmtp; + restart_block->arg2 = t.tv_sec; + restart_block->arg3 = t.tv_nsec; + } + return error; + } -- cgit v1.2.3 From b9ecb2bd5d3ab8904752685696cb76aac1f3fef2 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 29 Sep 2006 02:00:31 -0700 Subject: [PATCH] has_stopped_jobs() cleanup This check has been obsolete since the introduction of TASK_TRACED. Now TASK_STOPPED always means job control stop. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9961192d6055..3ec7b10eae38 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -249,17 +249,6 @@ static int has_stopped_jobs(int pgrp) do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p->state != TASK_STOPPED) continue; - - /* If p is stopped by a debugger on a signal that won't - stop it, then don't count p as stopped. This isn't - perfect but it's a good approximation. */ - if (unlikely (p->ptrace) - && p->exit_code != SIGSTOP - && p->exit_code != SIGTSTP - && p->exit_code != SIGTTOU - && p->exit_code != SIGTTIN) - continue; - retval = 1; break; } while_each_task_pid(pgrp, PIDTYPE_PGID, p); -- cgit v1.2.3 From 27d91e07f9b863fa94491ffafe250580f0c2ce78 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 29 Sep 2006 02:00:31 -0700 Subject: [PATCH] __dequeue_signal() cleanup This tightens up __dequeue_signal a little. It also avoids doing recalc_sigpending twice in a row, instead doing it once in dequeue_signal. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 05853a7337e3..fb5da6d19f14 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -417,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t *info) { - int sig = 0; + int sig = next_signal(pending, mask); - sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { @@ -432,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, if (!collect_signal(sig, pending, info)) sig = 0; - } - recalc_sigpending(); return sig; } @@ -451,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!signr) signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); + recalc_sigpending_tsk(tsk); if (signr && unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our -- cgit v1.2.3 From 3171a0305d62e6627a24bff35af4f997e4988a80 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Fri, 29 Sep 2006 02:00:32 -0700 Subject: [PATCH] simplify update_times (avoid jiffies/jiffies_64 aliasing problem) Pass ticks to do_timer() and update_times(), and adjust x86_64 and s390 timer interrupt handler with this change. Currently update_times() calculates ticks by "jiffies - wall_jiffies", but callers of do_timer() should know how many ticks to update. Passing ticks get rid of this redundant calculation. Also there are another redundancy pointed out by Martin Schwidefsky. This cleanup make a barrier added by 5aee405c662ca644980c184774277fc6d0769a84 needless. So this patch removes it. As a bonus, this cleanup make wall_jiffies can be removed easily, since now wall_jiffies is always synced with jiffies. (This patch does not really remove wall_jiffies. It would be another cleanup patch) Signed-off-by: Atsushi Nemoto Cc: Martin Schwidefsky Cc: "Eric W. Biederman" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: john stultz Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Richard Henderson Cc: Ivan Kokshaysky Acked-by: Russell King Cc: Ian Molton Cc: Mikael Starvik Acked-by: David Howells Cc: Yoshinori Sato Cc: Hirokazu Takata Acked-by: Ralf Baechle Cc: Kyle McMartin Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Richard Curnow Cc: William Lee Irwin III Cc: "David S. Miller" Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Miles Bader Cc: Chris Zankel Acked-by: "Luck, Tony" Cc: Geert Uytterhoeven Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index a2cb1ecb1b28..4f55622b0d38 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1222,10 +1222,8 @@ static inline void calc_load(unsigned long ticks) unsigned long active_tasks; /* fixed-point */ static int count = LOAD_FREQ; - count -= ticks; - if (count < 0) { - count += LOAD_FREQ; - active_tasks = count_active_tasks(); + active_tasks = count_active_tasks(); + for (count -= ticks; count < 0; count += LOAD_FREQ) { CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); @@ -1270,11 +1268,8 @@ void run_local_timers(void) * Called by the timer interrupt. xtime_lock must already be taken * by the timer IRQ! */ -static inline void update_times(void) +static inline void update_times(unsigned long ticks) { - unsigned long ticks; - - ticks = jiffies - wall_jiffies; wall_jiffies += ticks; update_wall_time(); calc_load(ticks); @@ -1286,12 +1281,10 @@ static inline void update_times(void) * jiffies is defined in the linker script... */ -void do_timer(struct pt_regs *regs) +void do_timer(unsigned long ticks) { - jiffies_64++; - /* prevent loading jiffies before storing new jiffies_64 value. */ - barrier(); - update_times(); + jiffies_64 += ticks; + update_times(ticks); } #ifdef __ARCH_WANT_SYS_ALARM -- cgit v1.2.3 From 0b4a8a789a328af6aac613734c362cf6aad72201 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 29 Sep 2006 02:00:39 -0700 Subject: [PATCH] kexec warning fix This fixes a couple of compiler warnings, and adds paranoia checks as well. Signed-off-by: Roland McGrath Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 37cad75cf494..fcdd5d2bc3f4 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, image = xchg(dest_image, image); out: - xchg(&kexec_lock, 0); /* Release the mutex */ + locked = xchg(&kexec_lock, 0); /* Release the mutex */ + BUG_ON(!locked); kimage_free(image); return result; @@ -1061,7 +1062,8 @@ void crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - xchg(&kexec_lock, 0); + locked = xchg(&kexec_lock, 0); + BUG_ON(!locked); } } -- cgit v1.2.3 From 54306cf04c0ea0a8c432603dbc657ab62a438668 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 29 Sep 2006 02:00:42 -0700 Subject: [PATCH] exit: fix crash case If we are going to BUG() not panic() here then we should cover the case of the BUG being compiled out Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3ec7b10eae38..a51b347ae67b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -963,7 +963,8 @@ fastcall NORET_TYPE void do_exit(long code) schedule(); BUG(); /* Avoid "noreturn function does return". */ - for (;;) ; + for (;;) + cpu_relax(); /* For when BUG is null */ } EXPORT_SYMBOL_GPL(do_exit); -- cgit v1.2.3 From 111dbe0c8a21dffa473239861be47ebc87f593b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Steinbrink?= Date: Fri, 29 Sep 2006 02:00:46 -0700 Subject: [PATCH] Fix ____call_usermodehelper errors being silently ignored MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If ____call_usermodehelper fails, we're not interested in the child process' exit value, but the real error, so let's stop wait_for_helper from overwriting it in that case. Issue discovered by Benedikt Böhm while working on a Linux-VServer usermode helper. Signed-off-by: Björn Steinbrink Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 5c470c57fb57..842f8015d7fd 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -176,6 +176,8 @@ static int wait_for_helper(void *data) if (pid < 0) { sub_info->retval = pid; } else { + int ret; + /* * Normally it is bogus to call wait4() from in-kernel because * wait4() wants to write the exit code to a userspace address. @@ -185,7 +187,15 @@ static int wait_for_helper(void *data) * * Thus the __user pointer cast is valid here. */ - sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL); + sys_wait4(pid, (int __user *)&ret, 0, NULL); + + /* + * If ret is 0, either ____call_usermodehelper failed and the + * real error code is already in sub_info->retval or + * sub_info->retval is 0 anyway, so don't mess with it then. + */ + if (ret) + sub_info->retval = ret; } complete(sub_info->complete); -- cgit v1.2.3 From c9472e0f28cd2f0695a0ac3a0b4bd33f21714a7e Mon Sep 17 00:00:00 2001 From: Cal Peake Date: Fri, 29 Sep 2006 02:00:47 -0700 Subject: [PATCH] kill extraneous printk in kernel_restart() Get rid of an extraneous printk in kernel_restart(). Signed-off-by: Cal Peake Acked-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 3f894775488d..8647061c084a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -612,7 +612,6 @@ void kernel_restart(char *cmd) } else { printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); } - printk(".\n"); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); -- cgit v1.2.3 From 5fe1d75f34974046fffcca5e22fb8a7b42fded33 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:00:48 -0700 Subject: [PATCH] do_sched_setscheduler(): don't take tasklist_lock Use rcu locks instead. sched_setscheduler() now takes ->siglock before reading ->signal->rlim[]. Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Steven Rostedt Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b946209d9c15..f9b3c6a414f1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4080,6 +4080,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) * @p: the task in question. * @policy: new policy. * @param: structure containing the new RT priority. + * + * NOTE: the task may be already dead */ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) @@ -4115,19 +4117,26 @@ recheck: * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { + unsigned long rlim_rtprio; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return -ESRCH; + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + unlock_task_sighand(p, &flags); + /* * can't change policy, except between SCHED_NORMAL * and SCHED_BATCH: */ if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && - !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) + !rlim_rtprio) return -EPERM; /* can't increase priority */ if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && param->sched_priority > p->rt_priority && - param->sched_priority > - p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) + param->sched_priority > rlim_rtprio) return -EPERM; /* can't change other user's priorities */ if ((current->euid != p->euid) && @@ -4193,14 +4202,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; - read_lock_irq(&tasklist_lock); + + rcu_read_lock(); + retval = -ESRCH; p = find_process_by_pid(pid); - if (!p) { - read_unlock_irq(&tasklist_lock); - return -ESRCH; - } - retval = sched_setscheduler(p, policy, &lparam); - read_unlock_irq(&tasklist_lock); + if (p != NULL) + retval = sched_setscheduler(p, policy, &lparam); + rcu_read_unlock(); return retval; } -- cgit v1.2.3 From 57a6f51c4281aa3119975473c70dce0480d322bd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:00:49 -0700 Subject: [PATCH] introduce is_rt_policy() helper Imho, makes the code a bit easier to read. Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Steven Rostedt Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f9b3c6a414f1..c3c718aea618 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4109,8 +4109,7 @@ recheck: (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) - != (param->sched_priority == 0)) + if (is_rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; /* @@ -4134,7 +4133,7 @@ recheck: !rlim_rtprio) return -EPERM; /* can't increase priority */ - if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && + if (is_rt_policy(policy) && param->sched_priority > p->rt_priority && param->sched_priority > rlim_rtprio) return -EPERM; -- cgit v1.2.3 From 8dc3e9099e01dfdd53f27f329c268eced03dfef6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:00:50 -0700 Subject: [PATCH] sched_setscheduler: fix? policy checks I am not sure this patch is correct: I can't understand what the current code does, and I don't know what it was supposed to do. The comment says: * can't change policy, except between SCHED_NORMAL * and SCHED_BATCH: The code: if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && But this is equivalent to: if ( (is_rt_policy(policy) && has_rt_policy(p)) && which means something different. We can't _decrease_ the current ->rt_priority with such a check (if rlim[RLIMIT_RTPRIO] == 0). Probably, it was supposed to be: if ( !(policy == SCHED_NORMAL && p->policy == SCHED_BATCH) && !(policy == SCHED_BATCH && p->policy == SCHED_NORMAL) this matches the comment, but strange: it doesn't allow to _drop_ the realtime priority when rlim[RLIMIT_RTPRIO] == 0. I think the right check would be: /* can't set/change rt policy */ if (is_rt_policy(policy) && policy != p->policy && !rlim_rtprio) return -EPERM; Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Steven Rostedt Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c3c718aea618..155a33da7aa7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4116,27 +4116,25 @@ recheck: * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { - unsigned long rlim_rtprio; - unsigned long flags; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; - unlock_task_sighand(p, &flags); + if (is_rt_policy(policy)) { + unsigned long rlim_rtprio; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return -ESRCH; + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + unlock_task_sighand(p, &flags); + + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; + + /* can't increase priority */ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; + } - /* - * can't change policy, except between SCHED_NORMAL - * and SCHED_BATCH: - */ - if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && - (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && - !rlim_rtprio) - return -EPERM; - /* can't increase priority */ - if (is_rt_policy(policy) && - param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) - return -EPERM; /* can't change other user's priorities */ if ((current->euid != p->euid) && (current->euid != p->uid)) -- cgit v1.2.3 From 1c573afebc6213e4372e0d8352034c23d5262e1f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:00:51 -0700 Subject: [PATCH] reparent_to_init(): use has_rt_policy() Remove open-coded has_rt_policy(), no changes in kernel/exit.o Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Steven Rostedt Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a51b347ae67b..4a280856acd2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -281,9 +281,7 @@ static void reparent_to_init(void) /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; - if ((current->policy == SCHED_NORMAL || - current->policy == SCHED_BATCH) - && (task_nice(current) < 0)) + if (!has_rt_policy(current) && (task_nice(current) < 0)) set_user_nice(current, 0); /* cpus_allowed? */ /* rt_priority? */ -- cgit v1.2.3 From 5b160f5ecd2f1b6df2e0015dc1f319c8ef803d62 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:00:52 -0700 Subject: [PATCH] copy_process: cosmetic ->ioprio tweak copy_process: // holds tasklist_lock + ->siglock /* * inherit ioprio */ p->ioprio = current->ioprio; Why? ->ioprio was already copied in dup_task_struct(). I guess this is needed to ensure that the child can't escape sys_ioprio_set(IOPRIO_WHO_{PGRP,USER}), yes? In that case we don't need ->siglock held, and the comment should be updated. Signed-off-by: Oleg Nesterov Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bca6ce6d3ded..1c999f3e0b47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1150,7 +1150,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ - p->parent_exec_id = p->self_exec_id; /* ok, now we should be set up.. */ @@ -1173,6 +1172,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); + /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */ + p->ioprio = current->ioprio; + /* * The task hasn't been attached yet, so its cpus_allowed mask will * not be changed, nor will its assigned CPU. @@ -1232,11 +1234,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } - /* - * inherit ioprio - */ - p->ioprio = current->ioprio; - if (likely(p->pid)) { add_parent(p); if (unlikely(p->ptrace & PT_PTRACED)) -- cgit v1.2.3 From d359b549bf3d7f42f0084918a4816ea4572e507c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:00:55 -0700 Subject: [PATCH] futex_find_get_task(): don't take tasklist_lock It is ok to do find_task_by_pid() + get_task_struct() under rcu_read_lock(), we cand drop tasklist_lock. Note that testing of ->exit_state is racy with or without tasklist anyway. Signed-off-by: Oleg Nesterov Acked-by: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 9d260e838cff..ca8ef11feb65 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -389,7 +389,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_pid(pid); if (!p) goto out_unlock; @@ -403,7 +403,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) } get_task_struct(p); out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return p; } -- cgit v1.2.3 From aaa2a97eb9c0e91d7abc66bf76811a9599fdb3ee Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:00:55 -0700 Subject: [PATCH] sys_get_robust_list(): don't take tasklist_lock use rcu locks for find_task_by_pid(). Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ca8ef11feb65..4b6770e9806d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1624,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, struct task_struct *p; ret = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_pid(pid); if (!p) goto err_unlock; @@ -1633,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; - read_unlock(&tasklist_lock); + rcu_read_unlock(); } if (put_user(sizeof(*head), len_ptr)) @@ -1641,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, return put_user(head, head_ptr); err_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } -- cgit v1.2.3 From 29b884921634e1e01cbd276e1c9b8fc07a7e4a90 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:01:09 -0700 Subject: [PATCH] set EXIT_DEAD state in do_exit(), not in schedule() schedule() checks PF_DEAD on every context switch and sets ->state = EXIT_DEAD to ensure that the exiting task will be deactivated. Note that this EXIT_DEAD is in fact a "random" value, we can use any bit except normal TASK_XXX values. It is better to set this state in do_exit() along with PF_DEAD flag and remove that check in schedule(). We are safe wrt concurrent try_to_wake_up() (for example ptrace, tkill), it can not change task's ->state: the 'state' argument of try_to_wake_up() can't have EXIT_DEAD bit. And in case when try_to_wake_up() sees a stale value of ->state == TASK_RUNNING it will do nothing. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + kernel/sched.c | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 4a280856acd2..3d759c98fb11 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -957,6 +957,7 @@ fastcall NORET_TYPE void do_exit(long code) preempt_disable(); BUG_ON(tsk->flags & PF_DEAD); tsk->flags |= PF_DEAD; + tsk->state = EXIT_DEAD; schedule(); BUG(); diff --git a/kernel/sched.c b/kernel/sched.c index 155a33da7aa7..e1646b044b69 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3348,9 +3348,6 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); - if (unlikely(prev->flags & PF_DEAD)) - prev->state = EXIT_DEAD; - switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; -- cgit v1.2.3 From 55a101f8f71a3d3dbda7b5c77083ffe47552f831 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:01:10 -0700 Subject: [PATCH] kill PF_DEAD flag After the previous change (->flags & PF_DEAD) <=> (->state == EXIT_DEAD), we don't need PF_DEAD any longer. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++---- kernel/sched.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3d759c98fb11..9dd5f1336da2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -953,10 +953,8 @@ fastcall NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - /* PF_DEAD causes final put_task_struct after we schedule. */ preempt_disable(); - BUG_ON(tsk->flags & PF_DEAD); - tsk->flags |= PF_DEAD; + /* causes final put_task_struct in finish_task_switch(). */ tsk->state = EXIT_DEAD; schedule(); @@ -972,7 +970,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); - + do_exit(code); } diff --git a/kernel/sched.c b/kernel/sched.c index e1646b044b69..a9405d7cc6ab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1755,27 +1755,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; - unsigned long prev_task_flags; + long prev_state; rq->prev_mm = NULL; /* * A task struct has one reference for the use as "current". - * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and - * calls schedule one last time. The schedule call will never return, - * and the scheduled task must drop that reference. - * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * If a task dies, then it sets EXIT_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. + * The test for EXIT_DEAD must occur while the runqueue locks are * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. * Manfred Spraul */ - prev_task_flags = prev->flags; + prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); if (mm) mmdrop(mm); - if (unlikely(prev_task_flags & PF_DEAD)) { + if (unlikely(prev_state == EXIT_DEAD)) { /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -5153,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->flags & PF_DEAD); + BUG_ON(p->state == EXIT_DEAD); get_task_struct(p); -- cgit v1.2.3 From c394cc9fbb367f87faa2228ec2eabacd2d4701c6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:01:11 -0700 Subject: [PATCH] introduce TASK_DEAD state I am not sure about this patch, I am asking Ingo to take a decision. task_struct->state == EXIT_DEAD is a very special case, to avoid a confusion it makes sense to introduce a new state, TASK_DEAD, while EXIT_DEAD should live only in ->exit_state as documented in sched.h. Note that this state is not visible to user-space, get_task_state() masks off unsuitable states. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/sched.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9dd5f1336da2..2e4c13cba95a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -955,7 +955,7 @@ fastcall NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = EXIT_DEAD; + tsk->state = TASK_DEAD; schedule(); BUG(); diff --git a/kernel/sched.c b/kernel/sched.c index a9405d7cc6ab..74f169ac0773 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1761,10 +1761,10 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) /* * A task struct has one reference for the use as "current". - * If a task dies, then it sets EXIT_DEAD in tsk->state and calls + * If a task dies, then it sets TASK_DEAD in tsk->state and calls * schedule one last time. The schedule call will never return, and * the scheduled task must drop that reference. - * The test for EXIT_DEAD must occur while the runqueue locks are + * The test for TASK_DEAD must occur while the runqueue locks are * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. @@ -1775,7 +1775,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_lock_switch(rq, prev); if (mm) mmdrop(mm); - if (unlikely(prev_state == EXIT_DEAD)) { + if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -5153,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->state == EXIT_DEAD); + BUG_ON(p->state == TASK_DEAD); get_task_struct(p); -- cgit v1.2.3 From 38837fc75acb7fa9b0e111b0241fe4fe76c5d4b3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 29 Sep 2006 02:01:16 -0700 Subject: [PATCH] cpuset: top_cpuset tracks hotplug changes to node_online_map Change the list of memory nodes allowed to tasks in the top (root) nodeset to dynamically track what cpus are online, using a call to a cpuset hook from the memory hotplug code. Make this top cpus file read-only. On systems that have cpusets configured in their kernel, but that aren't actively using cpusets (for some distros, this covers the majority of systems) all tasks end up in the top cpuset. If that system does support memory hotplug, then these tasks cannot make use of memory nodes that are added after system boot, because the memory nodes are not allowed in the top cpuset. This is a surprising regression over earlier kernels that didn't have cpusets enabled. One key motivation for this change is to remain consistent with the behaviour for the top_cpuset's 'cpus', which is also read-only, and which automatically tracks the cpu_online_map. This change also has the minor benefit that it fixes a long standing, little noticed, minor bug in cpusets. The cpuset performance tweak to short circuit the cpuset_zone_allowed() check on systems with just a single cpuset (see 'number_of_cpusets', in linux/cpuset.h) meant that simply changing the 'mems' of the top_cpuset had no affect, even though the change (the write system call) appeared to succeed. With the following change, that write to the 'mems' file fails -EACCES, and the 'mems' file stubbornly refuses to be changed via user space writes. Thus no one should be mislead into thinking they've changed the top_cpusets's 'mems' when in affect they haven't. In order to keep the behaviour of cpusets consistent between systems actively making use of them and systems not using them, this patch changes the behaviour of the 'mems' file in the top (root) cpuset, making it read only, and making it automatically track the value of node_online_map. Thus tasks in the top cpuset will have automatic use of hot plugged memory nodes allowed by their cpuset. [akpm@osdl.org: build fix] [bunk@stusta.de: build fix] Signed-off-by: Paul Jackson Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 584bb4e6c042..794af5024c2f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -912,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf) int fudge; int retval; + /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */ + if (cs == &top_cpuset) + return -EACCES; + trialcs = *cs; retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) @@ -2042,9 +2046,8 @@ out: * (of no affect) on systems that are actively using CPU hotplug * but making no active use of cpusets. * - * This handles CPU hotplug (cpuhp) events. If someday Memory - * Nodes can be hotplugged (dynamically changing node_online_map) - * then we should handle that too, perhaps in a similar way. + * This routine ensures that top_cpuset.cpus_allowed tracks + * cpu_online_map on each CPU hotplug (cpuhp) event. */ #ifdef CONFIG_HOTPLUG_CPU @@ -2063,6 +2066,25 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb, } #endif +/* + * Keep top_cpuset.mems_allowed tracking node_online_map. + * Call this routine anytime after you change node_online_map. + * See also the previous routine cpuset_handle_cpuhp(). + */ + +#ifdef CONFIG_MEMORY_HOTPLUG +void cpuset_track_online_nodes() +{ + mutex_lock(&manage_mutex); + mutex_lock(&callback_mutex); + + top_cpuset.mems_allowed = node_online_map; + + mutex_unlock(&callback_mutex); + mutex_unlock(&manage_mutex); +} +#endif + /** * cpuset_init_smp - initialize cpus_allowed * -- cgit v1.2.3 From b1aac8bb824c658ddebd296b088a8bff5029c288 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 29 Sep 2006 02:01:17 -0700 Subject: [PATCH] cpuset: hotunplug cpus and mems in all cpusets The cpuset code handling hot unplug of CPUs or Memory Nodes was incorrect - it could remove a CPU or Node from the top cpuset, while leaving it still in some child cpusets. One basic rule of cpusets is that each cpusets cpus and mems are subsets of its parents. The cpuset hot unplug code violated this rule. So the cpuset hotunplug handler must walk down the tree, removing any removed CPU or Node from all cpusets. However, it is not allowed to make a cpusets cpus or mems become empty. They can only transition from empty to non-empty, not back. So if the last CPU or Node would be removed from a cpuset by the above walk, we scan back up the cpuset hierarchy, finding the nearest ancestor that still has something online, and copy its CPU or Memory placement. Signed-off-by: Paul Jackson Cc: Nathan Lynch Cc: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 794af5024c2f..cc0395d7eba1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2040,6 +2040,73 @@ out: return err; } +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG) +/* + * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs + * or memory nodes, we need to walk over the cpuset hierarchy, + * removing that CPU or node from all cpusets. If this removes the + * last CPU or node from a cpuset, then the guarantee_online_cpus() + * or guarantee_online_mems() code will use that emptied cpusets + * parent online CPUs or nodes. Cpusets that were already empty of + * CPUs or nodes are left empty. + * + * This routine is intentionally inefficient in a couple of regards. + * It will check all cpusets in a subtree even if the top cpuset of + * the subtree has no offline CPUs or nodes. It checks both CPUs and + * nodes, even though the caller could have been coded to know that + * only one of CPUs or nodes needed to be checked on a given call. + * This was done to minimize text size rather than cpu cycles. + * + * Call with both manage_mutex and callback_mutex held. + * + * Recursive, on depth of cpuset subtree. + */ + +static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) +{ + struct cpuset *c; + + /* Each of our child cpusets mems must be online */ + list_for_each_entry(c, &cur->children, sibling) { + guarantee_online_cpus_mems_in_subtree(c); + if (!cpus_empty(c->cpus_allowed)) + guarantee_online_cpus(c, &c->cpus_allowed); + if (!nodes_empty(c->mems_allowed)) + guarantee_online_mems(c, &c->mems_allowed); + } +} + +/* + * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track + * cpu_online_map and node_online_map. Force the top cpuset to track + * whats online after any CPU or memory node hotplug or unplug event. + * + * To ensure that we don't remove a CPU or node from the top cpuset + * that is currently in use by a child cpuset (which would violate + * the rule that cpusets must be subsets of their parent), we first + * call the recursive routine guarantee_online_cpus_mems_in_subtree(). + * + * Since there are two callers of this routine, one for CPU hotplug + * events and one for memory node hotplug events, we could have coded + * two separate routines here. We code it as a single common routine + * in order to minimize text size. + */ + +static void common_cpu_mem_hotplug_unplug(void) +{ + mutex_lock(&manage_mutex); + mutex_lock(&callback_mutex); + + guarantee_online_cpus_mems_in_subtree(&top_cpuset); + top_cpuset.cpus_allowed = cpu_online_map; + top_cpuset.mems_allowed = node_online_map; + + mutex_unlock(&callback_mutex); + mutex_unlock(&manage_mutex); +} +#endif + +#ifdef CONFIG_HOTPLUG_CPU /* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent @@ -2050,38 +2117,24 @@ out: * cpu_online_map on each CPU hotplug (cpuhp) event. */ -#ifdef CONFIG_HOTPLUG_CPU static int cpuset_handle_cpuhp(struct notifier_block *nb, unsigned long phase, void *cpu) { - mutex_lock(&manage_mutex); - mutex_lock(&callback_mutex); - - top_cpuset.cpus_allowed = cpu_online_map; - - mutex_unlock(&callback_mutex); - mutex_unlock(&manage_mutex); - + common_cpu_mem_hotplug_unplug(); return 0; } #endif +#ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_online_map. * Call this routine anytime after you change node_online_map. * See also the previous routine cpuset_handle_cpuhp(). */ -#ifdef CONFIG_MEMORY_HOTPLUG void cpuset_track_online_nodes() { - mutex_lock(&manage_mutex); - mutex_lock(&callback_mutex); - - top_cpuset.mems_allowed = node_online_map; - - mutex_unlock(&callback_mutex); - mutex_unlock(&manage_mutex); + common_cpu_mem_hotplug_unplug(); } #endif -- cgit v1.2.3 From 04b1db9fd7eea63c9663072feece616ea41b0a79 Mon Sep 17 00:00:00 2001 From: "Ian S. Nelson" Date: Fri, 29 Sep 2006 02:01:31 -0700 Subject: [PATCH] /sys/modules: allow full length section names I've been using systemtap for some debugging and I noticed that it can't probe a lot of modules. Turns out it's kind of silly, the sections section of /sys/module is limited to 32byte filenames and many of the actual sections are a a bit longer than that. [akpm@osdl.org: rewrite to use dymanic allocation] Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b7fe6e840963..05625d5dc758 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -933,6 +933,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr, return sprintf(buf, "0x%lx\n", sattr->address); } +static void free_sect_attrs(struct module_sect_attrs *sect_attrs) +{ + int section; + + for (section = 0; section < sect_attrs->nsections; section++) + kfree(sect_attrs->attrs[section].name); + kfree(sect_attrs); +} + static void add_sect_attrs(struct module *mod, unsigned int nsect, char *secstrings, Elf_Shdr *sechdrs) { @@ -949,21 +958,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, + nloaded * sizeof(sect_attrs->attrs[0]), sizeof(sect_attrs->grp.attrs[0])); size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); - if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL))) + sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); + if (sect_attrs == NULL) return; /* Setup section attributes. */ sect_attrs->grp.name = "sections"; sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; + sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; gattr = §_attrs->grp.attrs[0]; for (i = 0; i < nsect; i++) { if (! (sechdrs[i].sh_flags & SHF_ALLOC)) continue; sattr->address = sechdrs[i].sh_addr; - strlcpy(sattr->name, secstrings + sechdrs[i].sh_name, - MODULE_SECT_NAME_LEN); + sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, + GFP_KERNEL); + if (sattr->name == NULL) + goto out; + sect_attrs->nsections++; sattr->mattr.show = module_sect_show; sattr->mattr.store = NULL; sattr->mattr.attr.name = sattr->name; @@ -979,7 +993,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, mod->sect_attrs = sect_attrs; return; out: - kfree(sect_attrs); + free_sect_attrs(sect_attrs); } static void remove_sect_attrs(struct module *mod) @@ -989,13 +1003,13 @@ static void remove_sect_attrs(struct module *mod) &mod->sect_attrs->grp); /* We are positive that no one is using any sect attrs * at this point. Deallocate immediately. */ - kfree(mod->sect_attrs); + free_sect_attrs(mod->sect_attrs); mod->sect_attrs = NULL; } } - #else + static inline void add_sect_attrs(struct module *mod, unsigned int nsect, char *sectstrings, Elf_Shdr *sechdrs) { -- cgit v1.2.3 From e5582ca21af82929d5cd3613321ac9233c492ebc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 29 Sep 2006 02:01:35 -0700 Subject: [PATCH] stop_machine.c copyright I had to look back: this code was extracted from the module.c code in 2005. Signed-off-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 51cacd111dbd..12458040e665 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,3 +1,6 @@ +/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. + * GPL v2 and any later version. + */ #include #include #include -- cgit v1.2.3 From eb84a20e9e6b98dcb33023ad22241d79107a08a7 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 29 Sep 2006 02:01:41 -0700 Subject: [PATCH] audit/accounting: tty locking Add tty locking around the audit and accounting code. The whole current->signal-> locking is all deeply strange but it's for someone else to sort out. Add rather than replace the lock for acct.c Signed-off-by: Alan Cox Acked-by: Arjan van de Ven Cc: Al Viro Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 6 +++++- kernel/auditsc.c | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 2a7c933651c7..f4330acead46 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -483,10 +483,14 @@ static void do_acct_process(struct file *file) ac.ac_ppid = current->parent->tgid; #endif - read_lock(&tasklist_lock); /* pin current->signal */ + mutex_lock(&tty_mutex); + /* FIXME: Whoever is responsible for current->signal locking needs + to use the same locking all over the kernel and document it */ + read_lock(&tasklist_lock); ac.ac_tty = current->signal->tty ? old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); + mutex_unlock(&tty_mutex); spin_lock_irq(¤t->sighand->siglock); ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fb83c5cb8c32..105147631753 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -817,6 +817,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); + + mutex_lock(&tty_mutex); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else @@ -838,6 +840,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->gid, context->euid, context->suid, context->fsuid, context->egid, context->sgid, context->fsgid, tty); + + mutex_unlock(&tty_mutex); + audit_log_task_info(ab, tsk); if (context->filterkey) { audit_log_format(ab, " key="); -- cgit v1.2.3 From 03cbc358aab3faf34bfeaa02206fa660127e9b3f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Sep 2006 02:01:46 -0700 Subject: [PATCH] lockdep core: improve the lock-chain-hash With CONFIG_DEBUG_LOCK_ALLOC turned off i was getting sporadic failures in the locking self-test: ------------> | Locking API testsuite: ---------------------------------------------------------------------------- | spin |wlock |rlock |mutex | wsem | rsem | -------------------------------------------------------------------------- A-A deadlock: ok | ok | ok | ok | ok | ok | A-B-B-A deadlock: ok | ok | ok | ok | ok | ok | A-B-B-C-C-A deadlock: ok | ok | ok | ok | ok | ok | A-B-C-A-B-C deadlock: ok | ok | ok | ok | ok | ok | A-B-B-C-C-D-D-A deadlock: ok |FAILED| ok | ok | ok | ok | A-B-C-D-B-D-D-A deadlock: ok | ok | ok | ok | ok | ok | A-B-C-D-B-C-D-A deadlock: ok | ok | ok | ok | ok |FAILED| after much debugging it turned out to be caused by accidental chain-hash key collisions. The current hash is: #define iterate_chain_key(key1, key2) \ (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \ ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \ (key2)) where MAX_LOCKDEP_KEYS_BITS is 11. This hash is pretty good as it will shift by 5 bits in every iteration, where every new ID 'mixed' into the hash would have up to 11 bits. But because there was a 6 bits overlap between subsequent IDs and their high bits tended to be similar, there was a chance for accidental chain-hash collision for a low number of locks held. the solution is to shift by 11 bits: #define iterate_chain_key(key1, key2) \ (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) This keeps the hash perfect up to 5 locks held, but even above that the hash is still good because 11 bits is a relative prime to the total 64 bits, so a complete match will only occur after 64 held locks (which doesnt happen in Linux). Even after 5 locks held, entropy of the 5 IDs mixed into the hash is already good enough so that overlap doesnt generate a colliding hash ID. with this change the false positives went away. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index df1c3594de31..e596525669ed 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -122,8 +122,8 @@ static struct list_head chainhash_table[CHAINHASH_SIZE]; * unique. */ #define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \ + (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ + ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) void lockdep_off(void) -- cgit v1.2.3 From 181b64803661209cda64e5e874ad75f373a69de8 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 29 Sep 2006 02:01:48 -0700 Subject: [PATCH] cpuset: fix obscure attach_task vs exiting race Fix obscure race condition in kernel/cpuset.c attach_task() code. There is basically zero chance of anyone accidentally being harmed by this race. It requires a special 'micro-stress' load and a special timing loop hacks in the kernel to hit in less than an hour, and even then you'd have to hit it hundreds or thousands of times, followed by some unusual and senseless cpuset configuration requests, including removing the top cpuset, to cause any visibly harm affects. One could, with perhaps a few days or weeks of such effort, get the reference count on the top cpuset below zero, and manage to crash the kernel by asking to remove the top cpuset. I found it by code inspection. The race was introduced when 'the_top_cpuset_hack' was introduced, and one piece of code was not updated. An old check for a possibly null task cpuset pointer needed to be changed to a check for a task marked PF_EXITING. The pointer can't be null anymore, thanks to the_top_cpuset_hack (documented in kernel/cpuset.c). But the task could have gone into PF_EXITING state after it was found in the task_list scan. If a task is PF_EXITING in this code, it is possible that its task->cpuset pointer is pointing to the top cpuset due to the_top_cpuset_hack, rather than because the top_cpuset was that tasks last valid cpuset. In that case, the wrong cpuset reference counter would be decremented. The fix is trivial. Instead of failing the system call if the tasks cpuset pointer is null here, fail it if the task is in PF_EXITING state. The code for 'the_top_cpuset_hack' that changes an exiting tasks cpuset to the top_cpuset is done without locking, so could happen at anytime. But it is done during the exit handling, after the PF_EXITING flag is set. So if we verify that a task is still not PF_EXITING after we copy out its cpuset pointer (into 'oldcs', below), we know that 'oldcs' is not one of these hack references to the top_cpuset. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cc0395d7eba1..8c3c400cce91 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1225,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) task_lock(tsk); oldcs = tsk->cpuset; - if (!oldcs) { + /* + * After getting 'oldcs' cpuset ptr, be sure still not exiting. + * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack + * then fail this attach_task(), to avoid breaking top_cpuset.count. + */ + if (tsk->flags & PF_EXITING) { task_unlock(tsk); mutex_unlock(&callback_mutex); put_task_struct(tsk); -- cgit v1.2.3 From 29cbc78b90a73ad80f2f58ba2927956cf663abed Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 30 Sep 2006 01:47:55 +0200 Subject: [PATCH] x86: Clean up x86 NMI sysctls Use prototypes in headers Don't define panic_on_unrecovered_nmi for all architectures Cc: dzickus@redhat.com Signed-off-by: Andi Kleen --- kernel/panic.c | 1 - kernel/sysctl.c | 11 ++++------- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6ceb664fb52a..525e365f7239 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -21,7 +21,6 @@ #include int panic_on_oops; -int panic_on_unrecovered_nmi; int tainted; static int pause_on_oops; static int pause_on_oops_flag; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9535a3839930..c57c4532e296 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -52,6 +52,10 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_X86 +#include +#endif + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -74,13 +78,6 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) -int unknown_nmi_panic; -int nmi_watchdog_enabled; -extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, - void __user *, size_t *, loff_t *); -#endif - /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; -- cgit v1.2.3 From 34596dc9e59d7bece16fe5aba08116b49465da26 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 30 Sep 2006 01:47:55 +0200 Subject: [PATCH] Define vsyscall cache as blob to make clearer that user space shouldn't use it Signed-off-by: Andi Kleen --- kernel/sys.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 8647061c084a..b88806c66244 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2083,12 +2083,12 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, * padding */ unsigned long t0, t1; - get_user(t0, &cache->t0); - get_user(t1, &cache->t1); + get_user(t0, &cache->blob[0]); + get_user(t1, &cache->blob[1]); t0++; t1++; - put_user(t0, &cache->t0); - put_user(t1, &cache->t1); + put_user(t0, &cache->blob[0]); + put_user(t1, &cache->blob[1]); } return err ? -EFAULT : 0; } -- cgit v1.2.3 From 0d67a46df0125e20d14f12dbd3646f1f1bf23e8c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Aug 2006 19:05:56 +0100 Subject: [PATCH] BLOCK: Remove duplicate declaration of exit_io_context() [try #6] Remove the duplicate declaration of exit_io_context() from linux/sched.h. Signed-Off-By: David Howells Signed-off-by: Jens Axboe --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2e4c13cba95a..c189de2927ab 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -38,6 +38,7 @@ #include #include /* for audit_free() */ #include +#include #include #include -- cgit v1.2.3 From 07f3f05c1e3052b8656129b2a5aca9f888241a34 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 30 Sep 2006 20:52:18 +0200 Subject: [PATCH] BLOCK: Move extern declarations out of fs/*.c into header files [try #6] Create a new header file, fs/internal.h, for common definitions local to the sources in the fs/ directory. Move extern definitions that should be in header files from fs/*.c to fs/internal.h or other main header files where they span directories. Signed-Off-By: David Howells Signed-off-by: Jens Axboe --- kernel/compat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 75573e5d27b0..b4fbd838cd77 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,6 +26,8 @@ #include +extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || -- cgit v1.2.3 From 9361401eb7619c033e2394e4f9f6d410d6719ac7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 30 Sep 2006 20:45:40 +0200 Subject: [PATCH] BLOCK: Make it possible to disable the block layer [try #6] Make it possible to disable the block layer. Not all embedded devices require it, some can make do with just JFFS2, NFS, ramfs, etc - none of which require the block layer to be present. This patch does the following: (*) Introduces CONFIG_BLOCK to disable the block layer, buffering and blockdev support. (*) Adds dependencies on CONFIG_BLOCK to any configuration item that controls an item that uses the block layer. This includes: (*) Block I/O tracing. (*) Disk partition code. (*) All filesystems that are block based, eg: Ext3, ReiserFS, ISOFS. (*) The SCSI layer. As far as I can tell, even SCSI chardevs use the block layer to do scheduling. Some drivers that use SCSI facilities - such as USB storage - end up disabled indirectly from this. (*) Various block-based device drivers, such as IDE and the old CDROM drivers. (*) MTD blockdev handling and FTL. (*) JFFS - which uses set_bdev_super(), something it could avoid doing by taking a leaf out of JFFS2's book. (*) Makes most of the contents of linux/blkdev.h, linux/buffer_head.h and linux/elevator.h contingent on CONFIG_BLOCK being set. sector_div() is, however, still used in places, and so is still available. (*) Also made contingent are the contents of linux/mpage.h, linux/genhd.h and parts of linux/fs.h. (*) Makes a number of files in fs/ contingent on CONFIG_BLOCK. (*) Makes mm/bounce.c (bounce buffering) contingent on CONFIG_BLOCK. (*) set_page_dirty() doesn't call __set_page_dirty_buffers() if CONFIG_BLOCK is not enabled. (*) fs/no-block.c is created to hold out-of-line stubs and things that are required when CONFIG_BLOCK is not set: (*) Default blockdev file operations (to give error ENODEV on opening). (*) Makes some /proc changes: (*) /proc/devices does not list any blockdevs. (*) /proc/diskstats and /proc/partitions are contingent on CONFIG_BLOCK. (*) Makes some compat ioctl handling contingent on CONFIG_BLOCK. (*) If CONFIG_BLOCK is not defined, makes sys_quotactl() return -ENODEV if given command other than Q_SYNC or if a special device is specified. (*) In init/do_mounts.c, no reference is made to the blockdev routines if CONFIG_BLOCK is not defined. This does not prohibit NFS roots or JFFS2. (*) The bdflush, ioprio_set and ioprio_get syscalls can now be absent (return error ENOSYS by way of cond_syscall if so). (*) The seclvl_bd_claim() and seclvl_bd_release() security calls do nothing if CONFIG_BLOCK is not set, since they can't then happen. Signed-Off-By: David Howells Signed-off-by: Jens Axboe --- kernel/sys_ni.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 6991bece67e8..7a3b2e75f040 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -134,3 +134,8 @@ cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); cond_syscall(compat_sys_move_pages); + +/* block-layer dependent */ +cond_syscall(sys_bdflush); +cond_syscall(sys_ioprio_set); +cond_syscall(sys_ioprio_get); -- cgit v1.2.3 From 5c87579e65ee4f419b2369407f82326d38b5d2d8 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 30 Sep 2006 23:27:17 -0700 Subject: [PATCH] maximum latency tracking infrastructure Add infrastructure to track "maximum allowable latency" for power saving policies. The reason for adding this infrastructure is that power management in the idle loop needs to make a tradeoff between latency and power savings (deeper power save modes have a longer latency to running code again). The code that today makes this tradeoff just does a rather simple algorithm; however this is not good enough: There are devices and use cases where a lower latency is required than that the higher power saving states provide. An example would be audio playback, but another example is the ipw2100 wireless driver that right now has a very direct and ugly acpi hook to disable some higher power states randomly when it gets certain types of error. The proposed solution is to have an interface where drivers can * announce the maximum latency (in microseconds) that they can deal with * modify this latency * give up their constraint and a function where the code that decides on power saving strategy can query the current global desired maximum. This patch has a user of each side: on the consumer side, ACPI is patched to use this, on the producer side the ipw2100 driver is patched. A generic maximum latency is also registered of 2 timer ticks (more and you lose accurate time tracking after all). While the existing users of the patch are x86 specific, the infrastructure is not. I'd like to ask the arch maintainers of other architectures if the infrastructure is generic enough for their use (assuming the architecture has such a tradeoff as concept at all), and the sound/multimedia driver owners to look at the driver facing API to see if this is something they can use. [akpm@osdl.org: cleanups] Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Acked-by: Jesse Barnes Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/latency.c | 279 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+), 1 deletion(-) create mode 100644 kernel/latency.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index d62ec66c1af2..e210e8cf7237 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o + hrtimer.o rwsem.o latency.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/latency.c b/kernel/latency.c new file mode 100644 index 000000000000..258f2555abbc --- /dev/null +++ b/kernel/latency.c @@ -0,0 +1,279 @@ +/* + * latency.c: Explicit system-wide latency-expectation infrastructure + * + * The purpose of this infrastructure is to allow device drivers to set + * latency constraint they have and to collect and summarize these + * expectations globally. The cummulated result can then be used by + * power management and similar users to make decisions that have + * tradoffs with a latency component. + * + * An example user of this are the x86 C-states; each higher C state saves + * more power, but has a higher exit latency. For the idle loop power + * code to make a good decision which C-state to use, information about + * acceptable latencies is required. + * + * An example announcer of latency is an audio driver that knowns it + * will get an interrupt when the hardware has 200 usec of samples + * left in the DMA buffer; in that case the driver can set a latency + * constraint of, say, 150 usec. + * + * Multiple drivers can each announce their maximum accepted latency, + * to keep these appart, a string based identifier is used. + * + * + * (C) Copyright 2006 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +struct latency_info { + struct list_head list; + int usecs; + char *identifier; +}; + +/* + * locking rule: all modifications to current_max_latency and + * latency_list need to be done while holding the latency_lock. + * latency_lock needs to be taken _irqsave. + */ +static atomic_t current_max_latency; +static DEFINE_SPINLOCK(latency_lock); + +static LIST_HEAD(latency_list); +static BLOCKING_NOTIFIER_HEAD(latency_notifier); + +/* + * This function returns the maximum latency allowed, which + * happens to be the minimum of all maximum latencies on the + * list. + */ +static int __find_max_latency(void) +{ + int min = INFINITE_LATENCY; + struct latency_info *info; + + list_for_each_entry(info, &latency_list, list) { + if (info->usecs < min) + min = info->usecs; + } + return min; +} + +/** + * set_acceptable_latency - sets the maximum latency acceptable + * @identifier: string that identifies this driver + * @usecs: maximum acceptable latency for this driver + * + * This function informs the kernel that this device(driver) + * can accept at most usecs latency. This setting is used for + * power management and similar tradeoffs. + * + * This function sleeps and can only be called from process + * context. + * Calling this function with an existing identifier is valid + * and will cause the existing latency setting to be changed. + */ +void set_acceptable_latency(char *identifier, int usecs) +{ + struct latency_info *info, *iter; + unsigned long flags; + int found_old = 0; + + info = kzalloc(sizeof(struct latency_info), GFP_KERNEL); + if (!info) + return; + info->usecs = usecs; + info->identifier = kstrdup(identifier, GFP_KERNEL); + if (!info->identifier) + goto free_info; + + spin_lock_irqsave(&latency_lock, flags); + list_for_each_entry(iter, &latency_list, list) { + if (strcmp(iter->identifier, identifier)==0) { + found_old = 1; + iter->usecs = usecs; + break; + } + } + if (!found_old) + list_add(&info->list, &latency_list); + + if (usecs < atomic_read(¤t_max_latency)) + atomic_set(¤t_max_latency, usecs); + + spin_unlock_irqrestore(&latency_lock, flags); + + blocking_notifier_call_chain(&latency_notifier, + atomic_read(¤t_max_latency), NULL); + + /* + * if we inserted the new one, we're done; otherwise there was + * an existing one so we need to free the redundant data + */ + if (!found_old) + return; + + kfree(info->identifier); +free_info: + kfree(info); +} +EXPORT_SYMBOL_GPL(set_acceptable_latency); + +/** + * modify_acceptable_latency - changes the maximum latency acceptable + * @identifier: string that identifies this driver + * @usecs: maximum acceptable latency for this driver + * + * This function informs the kernel that this device(driver) + * can accept at most usecs latency. This setting is used for + * power management and similar tradeoffs. + * + * This function does not sleep and can be called in any context. + * Trying to use a non-existing identifier silently gets ignored. + * + * Due to the atomic nature of this function, the modified latency + * value will only be used for future decisions; past decisions + * can still lead to longer latencies in the near future. + */ +void modify_acceptable_latency(char *identifier, int usecs) +{ + struct latency_info *iter; + unsigned long flags; + + spin_lock_irqsave(&latency_lock, flags); + list_for_each_entry(iter, &latency_list, list) { + if (strcmp(iter->identifier, identifier) == 0) { + iter->usecs = usecs; + break; + } + } + if (usecs < atomic_read(¤t_max_latency)) + atomic_set(¤t_max_latency, usecs); + spin_unlock_irqrestore(&latency_lock, flags); +} +EXPORT_SYMBOL_GPL(modify_acceptable_latency); + +/** + * remove_acceptable_latency - removes the maximum latency acceptable + * @identifier: string that identifies this driver + * + * This function removes a previously set maximum latency setting + * for the driver and frees up any resources associated with the + * bookkeeping needed for this. + * + * This function does not sleep and can be called in any context. + * Trying to use a non-existing identifier silently gets ignored. + */ +void remove_acceptable_latency(char *identifier) +{ + unsigned long flags; + int newmax = 0; + struct latency_info *iter, *temp; + + spin_lock_irqsave(&latency_lock, flags); + + list_for_each_entry_safe(iter, temp, &latency_list, list) { + if (strcmp(iter->identifier, identifier) == 0) { + list_del(&iter->list); + newmax = iter->usecs; + kfree(iter->identifier); + kfree(iter); + break; + } + } + + /* If we just deleted the system wide value, we need to + * recalculate with a full search + */ + if (newmax == atomic_read(¤t_max_latency)) { + newmax = __find_max_latency(); + atomic_set(¤t_max_latency, newmax); + } + spin_unlock_irqrestore(&latency_lock, flags); +} +EXPORT_SYMBOL_GPL(remove_acceptable_latency); + +/** + * system_latency_constraint - queries the system wide latency maximum + * + * This function returns the system wide maximum latency in + * microseconds. + * + * This function does not sleep and can be called in any context. + */ +int system_latency_constraint(void) +{ + return atomic_read(¤t_max_latency); +} +EXPORT_SYMBOL_GPL(system_latency_constraint); + +/** + * synchronize_acceptable_latency - recalculates all latency decisions + * + * This function will cause a callback to various kernel pieces that + * will make those pieces rethink their latency decisions. This implies + * that if there are overlong latencies in hardware state already, those + * latencies get taken right now. When this call completes no overlong + * latency decisions should be active anymore. + * + * Typical usecase of this is after a modify_acceptable_latency() call, + * which in itself is non-blocking and non-synchronizing. + * + * This function blocks and should not be called with locks held. + */ + +void synchronize_acceptable_latency(void) +{ + blocking_notifier_call_chain(&latency_notifier, + atomic_read(¤t_max_latency), NULL); +} +EXPORT_SYMBOL_GPL(synchronize_acceptable_latency); + +/* + * Latency notifier: this notifier gets called when a non-atomic new + * latency value gets set. The expectation nof the caller of the + * non-atomic set is that when the call returns, future latencies + * are within bounds, so the functions on the notifier list are + * expected to take the overlong latencies immediately, inside the + * callback, and not make a overlong latency decision anymore. + * + * The callback gets called when the new latency value is made + * active so system_latency_constraint() returns the new latency. + */ +int register_latency_notifier(struct notifier_block * nb) +{ + return blocking_notifier_chain_register(&latency_notifier, nb); +} +EXPORT_SYMBOL_GPL(register_latency_notifier); + +int unregister_latency_notifier(struct notifier_block * nb) +{ + return blocking_notifier_chain_unregister(&latency_notifier, nb); +} +EXPORT_SYMBOL_GPL(unregister_latency_notifier); + +static __init int latency_init(void) +{ + atomic_set(¤t_max_latency, INFINITE_LATENCY); + /* + * we don't want by default to have longer latencies than 2 ticks, + * since that would cause lost ticks + */ + set_acceptable_latency("kernel", 2*1000000/HZ); + return 0; +} + +module_init(latency_init); -- cgit v1.2.3 From 756184b7d771992f4fb1998d62aebcaf3e028076 Mon Sep 17 00:00:00 2001 From: Cal Peake Date: Sat, 30 Sep 2006 23:27:24 -0700 Subject: [PATCH] CodingStyle cleanup for kernel/sys.c Fix up kernel/sys.c to be consistent with CodingStyle and the rest of the file. Signed-off-by: Cal Peake Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 80 +++++++++++++++++++++++------------------------------------- 1 file changed, 30 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index b88806c66244..2460581c928c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -607,11 +607,10 @@ static void kernel_restart_prepare(char *cmd) void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - if (!cmd) { + if (!cmd) printk(KERN_EMERG "Restarting system.\n"); - } else { + else printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - } machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -627,9 +626,8 @@ static void kernel_kexec(void) #ifdef CONFIG_KEXEC struct kimage *image; image = xchg(&kexec_image, NULL); - if (!image) { + if (!image) return; - } kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); @@ -823,12 +821,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) (current->sgid == egid) || capable(CAP_SETGID)) new_egid = egid; - else { + else return -EPERM; - } } - if (new_egid != old_egid) - { + if (new_egid != old_egid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -857,19 +853,14 @@ asmlinkage long sys_setgid(gid_t gid) if (retval) return retval; - if (capable(CAP_SETGID)) - { - if(old_egid != gid) - { + if (capable(CAP_SETGID)) { + if (old_egid != gid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } current->gid = current->egid = current->sgid = current->fsgid = gid; - } - else if ((gid == current->gid) || (gid == current->sgid)) - { - if(old_egid != gid) - { + } else if ((gid == current->gid) || (gid == current->sgid)) { + if (old_egid != gid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -900,8 +891,7 @@ static int set_user(uid_t new_ruid, int dumpclear) switch_uid(new_user); - if(dumpclear) - { + if (dumpclear) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -957,8 +947,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) return -EAGAIN; - if (new_euid != old_euid) - { + if (new_euid != old_euid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1008,8 +997,7 @@ asmlinkage long sys_setuid(uid_t uid) } else if ((uid != current->uid) && (uid != new_suid)) return -EPERM; - if (old_euid != uid) - { + if (old_euid != uid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1054,8 +1042,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) return -EAGAIN; } if (euid != (uid_t) -1) { - if (euid != current->euid) - { + if (euid != current->euid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1105,8 +1092,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) return -EPERM; } if (egid != (gid_t) -1) { - if (egid != current->egid) - { + if (egid != current->egid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1151,10 +1137,8 @@ asmlinkage long sys_setfsuid(uid_t uid) if (uid == current->uid || uid == current->euid || uid == current->suid || uid == current->fsuid || - capable(CAP_SETUID)) - { - if (uid != old_fsuid) - { + capable(CAP_SETUID)) { + if (uid != old_fsuid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1182,10 +1166,8 @@ asmlinkage long sys_setfsgid(gid_t gid) if (gid == current->gid || gid == current->egid || gid == current->sgid || gid == current->fsgid || - capable(CAP_SETGID)) - { - if (gid != old_fsgid) - { + capable(CAP_SETGID)) { + if (gid != old_fsgid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1321,9 +1303,9 @@ out: asmlinkage long sys_getpgid(pid_t pid) { - if (!pid) { + if (!pid) return process_group(current); - } else { + else { int retval; struct task_struct *p; @@ -1353,9 +1335,9 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { - if (!pid) { + if (!pid) return current->signal->session; - } else { + else { int retval; struct task_struct *p; @@ -1363,7 +1345,7 @@ asmlinkage long sys_getsid(pid_t pid) p = find_task_by_pid(pid); retval = -ESRCH; - if(p) { + if (p) { retval = security_task_getsid(p); if (!retval) retval = p->signal->session; @@ -1431,9 +1413,9 @@ struct group_info *groups_alloc(int gidsetsize) group_info->nblocks = nblocks; atomic_set(&group_info->usage, 1); - if (gidsetsize <= NGROUPS_SMALL) { + if (gidsetsize <= NGROUPS_SMALL) group_info->blocks[0] = group_info->small_block; - } else { + else { for (i = 0; i < nblocks; i++) { gid_t *b; b = (void *)__get_free_page(GFP_USER); @@ -1489,7 +1471,7 @@ static int groups_to_user(gid_t __user *grouplist, /* fill a group_info from a user-space array - it must be allocated already */ static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) - { +{ int i; int count = group_info->ngroups; @@ -1647,9 +1629,8 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) int in_group_p(gid_t grp) { int retval = 1; - if (grp != current->fsgid) { + if (grp != current->fsgid) retval = groups_search(current->group_info, grp); - } return retval; } @@ -1658,9 +1639,8 @@ EXPORT_SYMBOL(in_group_p); int in_egroup_p(gid_t grp) { int retval = 1; - if (grp != current->egid) { + if (grp != current->egid) retval = groups_search(current->group_info, grp); - } return retval; } @@ -1775,9 +1755,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); - if(x.rlim_cur > 0x7FFFFFFF) + if (x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; - if(x.rlim_max > 0x7FFFFFFF) + if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; } -- cgit v1.2.3 From ef6edc9746dc2bfdacf44eefd5f881179971c478 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Sat, 30 Sep 2006 23:27:43 -0700 Subject: [PATCH] Directed yield: cpu_relax variants for spinlocks and rw-locks On systems running with virtual cpus there is optimization potential in regard to spinlocks and rw-locks. If the virtual cpu that has taken a lock is known to a cpu that wants to acquire the same lock it is beneficial to yield the timeslice of the virtual cpu in favour of the cpu that has the lock (directed yield). With CONFIG_PREEMPT="n" this can be implemented by the architecture without common code changes. Powerpc already does this. With CONFIG_PREEMPT="y" the lock loops are coded with _raw_spin_trylock, _raw_read_trylock and _raw_write_trylock in kernel/spinlock.c. If the lock could not be taken cpu_relax is called. A directed yield is not possible because cpu_relax doesn't know anything about the lock. To be able to yield the lock in favour of the current lock holder variants of cpu_relax for spinlocks and rw-locks are needed. The new _raw_spin_relax, _raw_read_relax and _raw_write_relax primitives differ from cpu_relax insofar that they have an argument: a pointer to the lock structure. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Cc: Paul Mackerras Cc: Haavard Skinnemoen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/spinlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index d48143eafbfd..476c3741511b 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -215,7 +215,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ - cpu_relax(); \ + _raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ @@ -237,7 +237,7 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ - cpu_relax(); \ + _raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ -- cgit v1.2.3 From 4c7ee8de956fc250fe31e2fa91f6da980fabe317 Mon Sep 17 00:00:00 2001 From: john stultz Date: Sat, 30 Sep 2006 23:28:22 -0700 Subject: [PATCH] NTP: Move all the NTP related code to ntp.c Move all the NTP related code to ntp.c [akpm@osdl.org: cleanups, build fix] Signed-off-by: John Stultz Cc: Ingo Molnar Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 173 ----------------------- kernel/time/Makefile | 2 +- kernel/time/ntp.c | 389 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 211 +--------------------------- 4 files changed, 391 insertions(+), 384 deletions(-) create mode 100644 kernel/time/ntp.c (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 5bd489747643..0e017bff4c19 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -202,179 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv, return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -/* we call this to notify the arch when the clock is being - * controlled. If no such arch routine, do nothing. - */ -void __attribute__ ((weak)) notify_arch_cmos_timer(void) -{ - return; -} - -/* adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. - */ -int do_adjtimex(struct timex *txc) -{ - long ltemp, mtemp, save_adjust; - int result; - - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) - /* singleshot must not be used with any other mode bits */ - if (txc->modes != ADJ_OFFSET_SINGLESHOT) - return -EINVAL; - - if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) - /* adjustment Offset limited to +- .512 seconds */ - if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) - return -EINVAL; - - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - result = time_state; /* mostly `TIME_OK' */ - - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_next_adjust ? time_next_adjust : time_adjust; - -#if 0 /* STA_CLOCKERR is never set yet */ - time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ -#endif - /* If there are input parameters, then process them */ - if (txc->modes) - { - if (txc->modes & ADJ_STATUS) /* only set allowed bits */ - time_status = (txc->status & ~STA_RONLY) | - (time_status & STA_RONLY); - - if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ - if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { - result = -EINVAL; - goto leave; - } - time_freq = txc->freq; - } - - if (txc->modes & ADJ_MAXERROR) { - if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; - } - time_maxerror = txc->maxerror; - } - - if (txc->modes & ADJ_ESTERROR) { - if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; - } - time_esterror = txc->esterror; - } - - if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ - if (txc->constant < 0) { /* NTP v4 uses values > 6 */ - result = -EINVAL; - goto leave; - } - time_constant = txc->constant; - } - - if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ - if (txc->modes == ADJ_OFFSET_SINGLESHOT) { - /* adjtime() is independent from ntp_adjtime() */ - if ((time_next_adjust = txc->offset) == 0) - time_adjust = 0; - } - else if (time_status & STA_PLL) { - ltemp = txc->offset; - - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ - if (ltemp > MAXPHASE) - time_offset = MAXPHASE << SHIFT_UPDATE; - else if (ltemp < -MAXPHASE) - time_offset = -(MAXPHASE << SHIFT_UPDATE); - else - time_offset = ltemp << SHIFT_UPDATE; - - /* - * Select whether the frequency is to be controlled - * and in which mode (PLL or FLL). Clamp to the operating - * range. Ugly multiply/divide should be replaced someday. - */ - - if (time_status & STA_FREQHOLD || time_reftime == 0) - time_reftime = xtime.tv_sec; - mtemp = xtime.tv_sec - time_reftime; - time_reftime = xtime.tv_sec; - if (time_status & STA_FLL) { - if (mtemp >= MINSEC) { - ltemp = (time_offset / mtemp) << (SHIFT_USEC - - SHIFT_UPDATE); - time_freq += shift_right(ltemp, SHIFT_KH); - } else /* calibration interval too short (p. 12) */ - result = TIME_ERROR; - } else { /* PLL mode */ - if (mtemp < MAXSEC) { - ltemp *= mtemp; - time_freq += shift_right(ltemp,(time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC)); - } else /* calibration interval too long (p. 12) */ - result = TIME_ERROR; - } - time_freq = min(time_freq, time_tolerance); - time_freq = max(time_freq, -time_tolerance); - } /* STA_PLL */ - } /* txc->modes & ADJ_OFFSET */ - if (txc->modes & ADJ_TICK) { - tick_usec = txc->tick; - tick_nsec = TICK_USEC_TO_NSEC(tick_usec); - } - } /* txc->modes */ -leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) - result = TIME_ERROR; - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset, SHIFT_UPDATE); - } - txc->freq = time_freq; - txc->maxerror = time_maxerror; - txc->esterror = time_esterror; - txc->status = time_status; - txc->constant = time_constant; - txc->precision = time_precision; - txc->tolerance = time_tolerance; - txc->tick = tick_usec; - - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; - write_sequnlock_irq(&xtime_lock); - do_gettimeofday(&txc->time); - notify_arch_cmos_timer(); - return(result); -} - asmlinkage long sys_adjtimex(struct timex __user *txc_p) { struct timex txc; /* Local copy of parameter */ diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e1dfd8e86cce..61a3907d16fb 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1 +1 @@ -obj-y += clocksource.o jiffies.o +obj-y += ntp.o clocksource.o jiffies.o diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c new file mode 100644 index 000000000000..8ccce15b4b23 --- /dev/null +++ b/kernel/time/ntp.c @@ -0,0 +1,389 @@ +/* + * linux/kernel/time/ntp.c + * + * NTP state machine interfaces and logic. + * + * This code was mainly moved from kernel/timer.c and kernel/time.c + * Please see those files for relevant copyright info and historical + * changelogs. + */ + +#include +#include +#include + +#include +#include + +/* Don't completely fail for HZ > 500. */ +int tickadj = 500/HZ ? : 1; /* microsecs */ + +/* + * phase-lock loop variables + */ +/* TIME_ERROR prevents overwriting the CMOS clock */ +int time_state = TIME_OK; /* clock synchronization status */ +int time_status = STA_UNSYNC; /* clock status bits */ +long time_offset; /* time adjustment (us) */ +long time_constant = 2; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ +long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ +long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; + /* frequency offset (scaled ppm)*/ +static long time_adj; /* tick adjust (scaled 1 / HZ) */ +long time_reftime; /* time at last adjustment (s) */ +long time_adjust; +long time_next_adjust; + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + */ +void second_overflow(void) +{ + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. The microtime() + * routine or external clock driver will insure that reported time is + * always monotonic. The ugly divides should be replaced. + */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + /* + * The timer interpolator will make time change + * gradually instead of an immediate jump by one second + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; + clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second " + "23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + wall_to_monotonic.tv_sec--; + /* + * Use of time interpolator for a gradual change of + * time + */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; + clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second " + "23:59:59 UTC\n"); + } + break; + case TIME_OOP: + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the phase adjustment for the next second. In PLL mode, the + * offset is reduced by a fixed factor times the time constant. In FLL + * mode the offset is used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread the adjustment + * over not more than the number of seconds between updates. + */ + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp = shift_right(ltemp, SHIFT_KG + time_constant); + ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); + ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + + /* + * Compute the frequency estimate and additional phase adjustment due + * to frequency error for the next second. + */ + ltemp = time_freq; + time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); + +#if HZ == 100 + /* + * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to + * get 128.125; => only 0.125% error (p. 14) + */ + time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); +#endif +#if HZ == 250 + /* + * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 255.85938; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); +#endif +#if HZ == 1000 + /* + * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); +#endif +} + +/* + * Returns how many microseconds we need to add to xtime this tick + * in doing an adjustment requested with adjtime. + */ +static long adjtime_adjustment(void) +{ + long time_adjust_step; + + time_adjust_step = time_adjust; + if (time_adjust_step) { + /* + * We are doing an adjtime thing. Prepare time_adjust_step to + * be within bounds. Note that a positive time_adjust means we + * want the clock to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + time_adjust_step = min(time_adjust_step, (long)tickadj); + time_adjust_step = max(time_adjust_step, (long)-tickadj); + } + return time_adjust_step; +} + +/* in the NTP reference this is called "hardclock()" */ +void update_ntp_one_tick(void) +{ + long time_adjust_step; + + time_adjust_step = adjtime_adjustment(); + if (time_adjust_step) + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; + + /* Changes by adjtime() do not take effect till next tick. */ + if (time_next_adjust != 0) { + time_adjust = time_next_adjust; + time_next_adjust = 0; + } +} + +/* + * Return how long ticks are at the moment, that is, how much time + * update_wall_time_one_tick will add to xtime next time we call it + * (assuming no calls to do_adjtimex in the meantime). + * The return value is in fixed-point nanoseconds shifted by the + * specified number of bits to the right of the binary point. + * This function has no side-effects. + */ +u64 current_tick_length(void) +{ + long delta_nsec; + u64 ret; + + /* calculate the finest interval NTP will allow. + * ie: nanosecond value shifted by (SHIFT_SCALE - 10) + */ + delta_nsec = tick_nsec + adjtime_adjustment() * 1000; + ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; + ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); + + return ret; +} + + +void __attribute__ ((weak)) notify_arch_cmos_timer(void) +{ + return; +} + +/* adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int do_adjtimex(struct timex *txc) +{ + long ltemp, mtemp, save_adjust; + int result; + + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* Now we validate the data before disabling interrupts */ + + if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) + /* singleshot must not be used with any other mode bits */ + if (txc->modes != ADJ_OFFSET_SINGLESHOT) + return -EINVAL; + + if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) + /* adjustment Offset limited to +- .512 seconds */ + if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) + return -EINVAL; + + /* if the quartz is off by more than 10% something is VERY wrong ! */ + if (txc->modes & ADJ_TICK) + if (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ) + return -EINVAL; + + write_seqlock_irq(&xtime_lock); + result = time_state; /* mostly `TIME_OK' */ + + /* Save for later - semantics of adjtime is to return old value */ + save_adjust = time_next_adjust ? time_next_adjust : time_adjust; + +#if 0 /* STA_CLOCKERR is never set yet */ + time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ +#endif + /* If there are input parameters, then process them */ + if (txc->modes) + { + if (txc->modes & ADJ_STATUS) /* only set allowed bits */ + time_status = (txc->status & ~STA_RONLY) | + (time_status & STA_RONLY); + + if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ + if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { + result = -EINVAL; + goto leave; + } + time_freq = txc->freq; + } + + if (txc->modes & ADJ_MAXERROR) { + if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { + result = -EINVAL; + goto leave; + } + time_maxerror = txc->maxerror; + } + + if (txc->modes & ADJ_ESTERROR) { + if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { + result = -EINVAL; + goto leave; + } + time_esterror = txc->esterror; + } + + if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ + if (txc->constant < 0) { /* NTP v4 uses values > 6 */ + result = -EINVAL; + goto leave; + } + time_constant = txc->constant; + } + + if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ + if (txc->modes == ADJ_OFFSET_SINGLESHOT) { + /* adjtime() is independent from ntp_adjtime() */ + if ((time_next_adjust = txc->offset) == 0) + time_adjust = 0; + } + else if (time_status & STA_PLL) { + ltemp = txc->offset; + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + if (ltemp > MAXPHASE) + time_offset = MAXPHASE << SHIFT_UPDATE; + else if (ltemp < -MAXPHASE) + time_offset = -(MAXPHASE << SHIFT_UPDATE); + else + time_offset = ltemp << SHIFT_UPDATE; + + /* + * Select whether the frequency is to be controlled + * and in which mode (PLL or FLL). Clamp to the operating + * range. Ugly multiply/divide should be replaced someday. + */ + + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = xtime.tv_sec; + mtemp = xtime.tv_sec - time_reftime; + time_reftime = xtime.tv_sec; + if (time_status & STA_FLL) { + if (mtemp >= MINSEC) { + ltemp = (time_offset / mtemp) << (SHIFT_USEC - + SHIFT_UPDATE); + time_freq += shift_right(ltemp, SHIFT_KH); + } else /* calibration interval too short (p. 12) */ + result = TIME_ERROR; + } else { /* PLL mode */ + if (mtemp < MAXSEC) { + ltemp *= mtemp; + time_freq += shift_right(ltemp,(time_constant + + time_constant + + SHIFT_KF - SHIFT_USEC)); + } else /* calibration interval too long (p. 12) */ + result = TIME_ERROR; + } + time_freq = min(time_freq, time_tolerance); + time_freq = max(time_freq, -time_tolerance); + } /* STA_PLL */ + } /* txc->modes & ADJ_OFFSET */ + if (txc->modes & ADJ_TICK) { + tick_usec = txc->tick; + tick_nsec = TICK_USEC_TO_NSEC(tick_usec); + } + } /* txc->modes */ +leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) + result = TIME_ERROR; + + if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) + txc->offset = save_adjust; + else { + txc->offset = shift_right(time_offset, SHIFT_UPDATE); + } + txc->freq = time_freq; + txc->maxerror = time_maxerror; + txc->esterror = time_esterror; + txc->status = time_status; + txc->constant = time_constant; + txc->precision = time_precision; + txc->tolerance = time_tolerance; + txc->tick = tick_usec; + + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; + write_sequnlock_irq(&xtime_lock); + do_gettimeofday(&txc->time); + notify_arch_cmos_timer(); + return(result); +} diff --git a/kernel/timer.c b/kernel/timer.c index 4f55622b0d38..5fccc7cbf3b4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -41,12 +41,6 @@ #include #include -#ifdef CONFIG_TIME_INTERPOLATION -static void time_interpolator_update(long delta_nsec); -#else -#define time_interpolator_update(x) -#endif - u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -587,209 +581,6 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16))); EXPORT_SYMBOL(xtime); -/* Don't completely fail for HZ > 500. */ -int tickadj = 500/HZ ? : 1; /* microsecs */ - - -/* - * phase-lock loop variables - */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -int time_state = TIME_OK; /* clock synchronization status */ -int time_status = STA_UNSYNC; /* clock status bits */ -long time_offset; /* time adjustment (us) */ -long time_constant = 2; /* pll time constant */ -long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ -long time_precision = 1; /* clock precision (us) */ -long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ -long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; - /* frequency offset (scaled ppm)*/ -static long time_adj; /* tick adjust (scaled 1 / HZ) */ -long time_reftime; /* time at last adjustment (s) */ -long time_adjust; -long time_next_adjust; - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - * - */ -static void second_overflow(void) -{ - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if (time_maxerror > NTP_PHASE_LIMIT) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. The microtime() - * routine or external clock driver will insure that reported time is - * always monotonic. The ugly divides should be replaced. - */ - switch (time_state) { - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - /* - * The timer interpolator will make time change - * gradually instead of an immediate jump by one second - */ - time_interpolator_update(-NSEC_PER_SEC); - time_state = TIME_OOP; - clock_was_set(); - printk(KERN_NOTICE "Clock: inserting leap second " - "23:59:60 UTC\n"); - } - break; - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - /* - * Use of time interpolator for a gradual change of - * time - */ - time_interpolator_update(NSEC_PER_SEC); - time_state = TIME_WAIT; - clock_was_set(); - printk(KERN_NOTICE "Clock: deleting leap second " - "23:59:59 UTC\n"); - } - break; - case TIME_OOP: - time_state = TIME_WAIT; - break; - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In PLL mode, the - * offset is reduced by a fixed factor times the time constant. In FLL - * mode the offset is used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread the adjustment - * over not more than the number of seconds between updates. - */ - ltemp = time_offset; - if (!(time_status & STA_FLL)) - ltemp = shift_right(ltemp, SHIFT_KG + time_constant); - ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); - ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); - time_offset -= ltemp; - time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - - /* - * Compute the frequency estimate and additional phase adjustment due - * to frequency error for the next second. - */ - ltemp = time_freq; - time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); - -#if HZ == 100 - /* - * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to - * get 128.125; => only 0.125% error (p. 14) - */ - time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); -#endif -#if HZ == 250 - /* - * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and - * 0.78125% to get 255.85938; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); -#endif -#if HZ == 1000 - /* - * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and - * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); -#endif -} - -/* - * Returns how many microseconds we need to add to xtime this tick - * in doing an adjustment requested with adjtime. - */ -static long adjtime_adjustment(void) -{ - long time_adjust_step; - - time_adjust_step = time_adjust; - if (time_adjust_step) { - /* - * We are doing an adjtime thing. Prepare time_adjust_step to - * be within bounds. Note that a positive time_adjust means we - * want the clock to run faster. - * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - time_adjust_step = min(time_adjust_step, (long)tickadj); - time_adjust_step = max(time_adjust_step, (long)-tickadj); - } - return time_adjust_step; -} - -/* in the NTP reference this is called "hardclock()" */ -static void update_ntp_one_tick(void) -{ - long time_adjust_step; - - time_adjust_step = adjtime_adjustment(); - if (time_adjust_step) - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; - - /* Changes by adjtime() do not take effect till next tick. */ - if (time_next_adjust != 0) { - time_adjust = time_next_adjust; - time_next_adjust = 0; - } -} - -/* - * Return how long ticks are at the moment, that is, how much time - * update_wall_time_one_tick will add to xtime next time we call it - * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds shifted by the - * specified number of bits to the right of the binary point. - * This function has no side-effects. - */ -u64 current_tick_length(void) -{ - long delta_nsec; - u64 ret; - - /* calculate the finest interval NTP will allow. - * ie: nanosecond value shifted by (SHIFT_SCALE - 10) - */ - delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; - ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); - - return ret; -} /* XXX - all of this timekeeping code should be later moved to time.c */ #include @@ -1775,7 +1566,7 @@ unsigned long time_interpolator_get_offset(void) #define INTERPOLATOR_ADJUST 65536 #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST -static void time_interpolator_update(long delta_nsec) +void time_interpolator_update(long delta_nsec) { u64 counter; unsigned long offset; -- cgit v1.2.3 From b0ee75561beadc4db4d9a899c8ef4a7db50aa0ab Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sat, 30 Sep 2006 23:28:22 -0700 Subject: [PATCH] ntp: add ntp_update_frequency This introduces ntp_update_frequency() and deinlines ntp_clear() (as it's not performance critical). ntp_update_frequency() calculates the base tick length using tick_usec and adds a base adjustment, in case the frequency doesn't divide evenly by HZ. Signed-off-by: Roman Zippel Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ kernel/timer.c | 11 ++++------- 2 files changed, 48 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8ccce15b4b23..77137bec2aea 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -15,6 +15,13 @@ #include #include +/* + * Timekeeping variables + */ +unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ +unsigned long tick_nsec; /* ACTHZ period (nsec) */ +static u64 tick_length, tick_length_base; + /* Don't completely fail for HZ > 500. */ int tickadj = 500/HZ ? : 1; /* microsecs */ @@ -37,6 +44,36 @@ long time_reftime; /* time at last adjustment (s) */ long time_adjust; long time_next_adjust; +/** + * ntp_clear - Clears the NTP state variables + * + * Must be called while holding a write on the xtime_lock + */ +void ntp_clear(void) +{ + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + + ntp_update_frequency(); + + tick_length = tick_length_base; +} + +#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE) +#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / (s64)CLOCK_TICK_RATE) + +void ntp_update_frequency(void) +{ + tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; + tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + + do_div(tick_length_base, HZ); + + tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; +} + /* * this routine handles the overflow of the microsecond field * @@ -151,6 +188,7 @@ void second_overflow(void) */ time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif + tick_length = tick_length_base; } /* @@ -204,14 +242,13 @@ void update_ntp_one_tick(void) */ u64 current_tick_length(void) { - long delta_nsec; u64 ret; /* calculate the finest interval NTP will allow. * ie: nanosecond value shifted by (SHIFT_SCALE - 10) */ - delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; + ret = tick_length; + ret += (u64)(adjtime_adjustment() * 1000) << TICK_LENGTH_SHIFT; ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); return ret; @@ -351,10 +388,11 @@ int do_adjtimex(struct timex *txc) time_freq = max(time_freq, -time_tolerance); } /* STA_PLL */ } /* txc->modes & ADJ_OFFSET */ - if (txc->modes & ADJ_TICK) { + if (txc->modes & ADJ_TICK) tick_usec = txc->tick; - tick_nsec = TICK_USEC_TO_NSEC(tick_usec); - } + + if (txc->modes & ADJ_TICK) + ntp_update_frequency(); } /* txc->modes */ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) result = TIME_ERROR; diff --git a/kernel/timer.c b/kernel/timer.c index 5fccc7cbf3b4..78d3fa10fcd6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -562,12 +562,6 @@ found: /******************************************************************/ -/* - * Timekeeping variables - */ -unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ -unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ - /* * The current time * wall_to_monotonic is what we need to add to xtime (or xtime corrected @@ -757,10 +751,13 @@ void __init timekeeping_init(void) unsigned long flags; write_seqlock_irqsave(&xtime_lock, flags); + + ntp_clear(); + clock = clocksource_get_next(); clocksource_calculate_interval(clock, tick_nsec); clock->cycle_last = clocksource_read(clock); - ntp_clear(); + write_sequnlock_irqrestore(&xtime_lock, flags); } -- cgit v1.2.3 From ab8783b688f33c40ed7b37b814a4a1e7d341ce11 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sat, 30 Sep 2006 23:28:23 -0700 Subject: [PATCH] ntp: add time_adj to tick length This makes time_adj local to second_overflow() and integrates it into the tick length instead of adding it everytime. Signed-off-by: Roman Zippel Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 77137bec2aea..c09628d6b848 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -39,7 +39,6 @@ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; /* frequency offset (scaled ppm)*/ -static long time_adj; /* tick adjust (scaled 1 / HZ) */ long time_reftime; /* time at last adjustment (s) */ long time_adjust; long time_next_adjust; @@ -84,7 +83,7 @@ void ntp_update_frequency(void) */ void second_overflow(void) { - long ltemp; + long ltemp, time_adj; /* Bump the maxerror field */ time_maxerror += time_tolerance >> SHIFT_USEC; @@ -189,6 +188,7 @@ void second_overflow(void) time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif tick_length = tick_length_base; + tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); } /* @@ -245,11 +245,9 @@ u64 current_tick_length(void) u64 ret; /* calculate the finest interval NTP will allow. - * ie: nanosecond value shifted by (SHIFT_SCALE - 10) */ ret = tick_length; ret += (u64)(adjtime_adjustment() * 1000) << TICK_LENGTH_SHIFT; - ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); return ret; } -- cgit v1.2.3 From dc6a43e46f1b6de22701f97bec022e97088cfa90 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sat, 30 Sep 2006 23:28:24 -0700 Subject: [PATCH] ntp: add time_freq to tick length This adds the frequency part to ntp_update_frequency(). Signed-off-by: Roman Zippel Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c09628d6b848..ab21eb06e09b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -37,8 +37,7 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ long time_precision = 1; /* clock precision (us) */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; - /* frequency offset (scaled ppm)*/ +long time_freq; /* frequency offset (scaled ppm)*/ long time_reftime; /* time at last adjustment (s) */ long time_adjust; long time_next_adjust; @@ -67,6 +66,7 @@ void ntp_update_frequency(void) { tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + tick_length_base += ((s64)time_freq * NSEC_PER_USEC) << (TICK_LENGTH_SHIFT - SHIFT_USEC); do_div(tick_length_base, HZ); @@ -163,8 +163,6 @@ void second_overflow(void) * Compute the frequency estimate and additional phase adjustment due * to frequency error for the next second. */ - ltemp = time_freq; - time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 /* @@ -389,7 +387,7 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_TICK) tick_usec = txc->tick; - if (txc->modes & ADJ_TICK) + if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) ntp_update_frequency(); } /* txc->modes */ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) -- cgit v1.2.3 From 3d3675cc3d04d7fd4bb11e8c1ea79e5ade4f5e44 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sat, 30 Sep 2006 23:28:25 -0700 Subject: [PATCH] ntp: prescale time_offset This converts time_offset into a scaled per tick value. This avoids now completely the crude compensation in second_overflow(). Signed-off-by: Roman Zippel Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 64 ++++++++++++++----------------------------------------- 1 file changed, 16 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ab21eb06e09b..238ce47ef09d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -31,7 +31,7 @@ int tickadj = 500/HZ ? : 1; /* microsecs */ /* TIME_ERROR prevents overwriting the CMOS clock */ int time_state = TIME_OK; /* clock synchronization status */ int time_status = STA_UNSYNC; /* clock status bits */ -long time_offset; /* time adjustment (us) */ +long time_offset; /* time adjustment (ns) */ long time_constant = 2; /* pll time constant */ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ long time_precision = 1; /* clock precision (us) */ @@ -57,6 +57,7 @@ void ntp_clear(void) ntp_update_frequency(); tick_length = tick_length_base; + time_offset = 0; } #define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE) @@ -83,7 +84,7 @@ void ntp_update_frequency(void) */ void second_overflow(void) { - long ltemp, time_adj; + long time_adj; /* Bump the maxerror field */ time_maxerror += time_tolerance >> SHIFT_USEC; @@ -151,42 +152,14 @@ void second_overflow(void) * adjustment for each second is clamped so as to spread the adjustment * over not more than the number of seconds between updates. */ - ltemp = time_offset; - if (!(time_status & STA_FLL)) - ltemp = shift_right(ltemp, SHIFT_KG + time_constant); - ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); - ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); - time_offset -= ltemp; - time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - - /* - * Compute the frequency estimate and additional phase adjustment due - * to frequency error for the next second. - */ - -#if HZ == 100 - /* - * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to - * get 128.125; => only 0.125% error (p. 14) - */ - time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); -#endif -#if HZ == 250 - /* - * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and - * 0.78125% to get 255.85938; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); -#endif -#if HZ == 1000 - /* - * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and - * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); -#endif tick_length = tick_length_base; - tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); + time_adj = time_offset; + if (!(time_status & STA_FLL)) + time_adj = shift_right(time_adj, SHIFT_KG + time_constant); + time_adj = min(time_adj, -((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC); + time_adj = max(time_adj, ((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC); + time_offset -= time_adj; + tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); } /* @@ -347,12 +320,8 @@ int do_adjtimex(struct timex *txc) * Scale the phase adjustment and * clamp to the operating range. */ - if (ltemp > MAXPHASE) - time_offset = MAXPHASE << SHIFT_UPDATE; - else if (ltemp < -MAXPHASE) - time_offset = -(MAXPHASE << SHIFT_UPDATE); - else - time_offset = ltemp << SHIFT_UPDATE; + time_offset = min(ltemp, MAXPHASE); + time_offset = max(time_offset, -MAXPHASE); /* * Select whether the frequency is to be controlled @@ -366,8 +335,7 @@ int do_adjtimex(struct timex *txc) time_reftime = xtime.tv_sec; if (time_status & STA_FLL) { if (mtemp >= MINSEC) { - ltemp = (time_offset / mtemp) << (SHIFT_USEC - - SHIFT_UPDATE); + ltemp = ((time_offset << 12) / mtemp) << (SHIFT_USEC - 12); time_freq += shift_right(ltemp, SHIFT_KH); } else /* calibration interval too short (p. 12) */ result = TIME_ERROR; @@ -382,6 +350,7 @@ int do_adjtimex(struct timex *txc) } time_freq = min(time_freq, time_tolerance); time_freq = max(time_freq, -time_tolerance); + time_offset = (time_offset * NSEC_PER_USEC / HZ) << SHIFT_UPDATE; } /* STA_PLL */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) @@ -395,9 +364,8 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset, SHIFT_UPDATE); - } + else + txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; txc->freq = time_freq; txc->maxerror = time_maxerror; txc->esterror = time_esterror; -- cgit v1.2.3 From 8f807f8d2137ba728d22820103131038639b68a9 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sat, 30 Sep 2006 23:28:25 -0700 Subject: [PATCH] ntp: add time_adjust to tick length This folds update_ntp_one_tick() into second_overflow() and adds time_adjust to the tick length, this makes time_next_adjust unnecessary. This slightly changes the adjtime() behaviour, instead of applying it to the next tick, it's applied to the next second. Signed-off-by: Roman Zippel Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 71 ++++++++++++++----------------------------------------- kernel/timer.c | 2 -- 2 files changed, 18 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 238ce47ef09d..65223b7ed810 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,8 +22,9 @@ unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ unsigned long tick_nsec; /* ACTHZ period (nsec) */ static u64 tick_length, tick_length_base; -/* Don't completely fail for HZ > 500. */ -int tickadj = 500/HZ ? : 1; /* microsecs */ +#define MAX_TICKADJ 500 /* microsecs */ +#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ + TICK_LENGTH_SHIFT) / HZ) /* * phase-lock loop variables @@ -40,7 +41,6 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ long time_freq; /* frequency offset (scaled ppm)*/ long time_reftime; /* time at last adjustment (s) */ long time_adjust; -long time_next_adjust; /** * ntp_clear - Clears the NTP state variables @@ -160,46 +160,19 @@ void second_overflow(void) time_adj = max(time_adj, ((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC); time_offset -= time_adj; tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); -} - -/* - * Returns how many microseconds we need to add to xtime this tick - * in doing an adjustment requested with adjtime. - */ -static long adjtime_adjustment(void) -{ - long time_adjust_step; - - time_adjust_step = time_adjust; - if (time_adjust_step) { - /* - * We are doing an adjtime thing. Prepare time_adjust_step to - * be within bounds. Note that a positive time_adjust means we - * want the clock to run faster. - * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - time_adjust_step = min(time_adjust_step, (long)tickadj); - time_adjust_step = max(time_adjust_step, (long)-tickadj); - } - return time_adjust_step; -} -/* in the NTP reference this is called "hardclock()" */ -void update_ntp_one_tick(void) -{ - long time_adjust_step; - - time_adjust_step = adjtime_adjustment(); - if (time_adjust_step) - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; - - /* Changes by adjtime() do not take effect till next tick. */ - if (time_next_adjust != 0) { - time_adjust = time_next_adjust; - time_next_adjust = 0; + if (unlikely(time_adjust)) { + if (time_adjust > MAX_TICKADJ) { + time_adjust -= MAX_TICKADJ; + tick_length += MAX_TICKADJ_SCALED; + } else if (time_adjust < -MAX_TICKADJ) { + time_adjust += MAX_TICKADJ; + tick_length -= MAX_TICKADJ_SCALED; + } else { + time_adjust = 0; + tick_length += (s64)(time_adjust * NSEC_PER_USEC / + HZ) << TICK_LENGTH_SHIFT; + } } } @@ -213,14 +186,7 @@ void update_ntp_one_tick(void) */ u64 current_tick_length(void) { - u64 ret; - - /* calculate the finest interval NTP will allow. - */ - ret = tick_length; - ret += (u64)(adjtime_adjustment() * 1000) << TICK_LENGTH_SHIFT; - - return ret; + return tick_length; } @@ -263,7 +229,7 @@ int do_adjtimex(struct timex *txc) result = time_state; /* mostly `TIME_OK' */ /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_next_adjust ? time_next_adjust : time_adjust; + save_adjust = time_adjust; #if 0 /* STA_CLOCKERR is never set yet */ time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ @@ -310,8 +276,7 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ if (txc->modes == ADJ_OFFSET_SINGLESHOT) { /* adjtime() is independent from ntp_adjtime() */ - if ((time_next_adjust = txc->offset) == 0) - time_adjust = 0; + time_adjust = txc->offset; } else if (time_status & STA_PLL) { ltemp = txc->offset; diff --git a/kernel/timer.c b/kernel/timer.c index 78d3fa10fcd6..654dd5df2417 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -937,8 +937,6 @@ static void update_wall_time(void) /* interpolator bits */ time_interpolator_update(clock->xtime_interval >> clock->shift); - /* increment the NTP state machine */ - update_ntp_one_tick(); /* accumulate error between NTP and clock interval */ clock->error += current_tick_length(); -- cgit v1.2.3 From 97eebe138caaf78354b1fad233e63bafdcc4fd54 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sat, 30 Sep 2006 23:28:26 -0700 Subject: [PATCH] ntp: remove time_tolerance time_tolerance isn't changed at all in the kernel, so simply remove it, this simplifies the next patch, as it avoids a number of conversions. Signed-off-by: Roman Zippel Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 65223b7ed810..af7563f5d4e2 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -34,7 +34,6 @@ int time_state = TIME_OK; /* clock synchronization status */ int time_status = STA_UNSYNC; /* clock status bits */ long time_offset; /* time adjustment (ns) */ long time_constant = 2; /* pll time constant */ -long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ long time_precision = 1; /* clock precision (us) */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ @@ -87,7 +86,7 @@ void second_overflow(void) long time_adj; /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; + time_maxerror += MAXFREQ >> SHIFT_USEC; if (time_maxerror > NTP_PHASE_LIMIT) { time_maxerror = NTP_PHASE_LIMIT; time_status |= STA_UNSYNC; @@ -313,8 +312,8 @@ int do_adjtimex(struct timex *txc) } else /* calibration interval too long (p. 12) */ result = TIME_ERROR; } - time_freq = min(time_freq, time_tolerance); - time_freq = max(time_freq, -time_tolerance); + time_freq = min(time_freq, MAXFREQ); + time_freq = max(time_freq, -MAXFREQ); time_offset = (time_offset * NSEC_PER_USEC / HZ) << SHIFT_UPDATE; } /* STA_PLL */ } /* txc->modes & ADJ_OFFSET */ @@ -337,7 +336,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) txc->status = time_status; txc->constant = time_constant; txc->precision = time_precision; - txc->tolerance = time_tolerance; + txc->tolerance = MAXFREQ; txc->tick = tick_usec; /* PPS is not implemented, so these are zero */ -- cgit v1.2.3 From 04b617e71e363e640e88be1e43f53fa6a3afef9f Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sat, 30 Sep 2006 23:28:27 -0700 Subject: [PATCH] ntp: convert time_freq to nsec value This converts time_freq to a scaled nsec value and adds around 6bit of extra resolution. This pushes the time_freq to its 32bit limits so the calculatons have to be done with 64bit. Signed-off-by: Roman Zippel Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index af7563f5d4e2..9137b54613e0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -66,7 +66,7 @@ void ntp_update_frequency(void) { tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; - tick_length_base += ((s64)time_freq * NSEC_PER_USEC) << (TICK_LENGTH_SHIFT - SHIFT_USEC); + tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); do_div(tick_length_base, HZ); @@ -200,6 +200,7 @@ void __attribute__ ((weak)) notify_arch_cmos_timer(void) int do_adjtimex(struct timex *txc) { long ltemp, mtemp, save_adjust; + s64 freq_adj; int result; /* In order to modify anything, you gotta be super-user! */ @@ -245,7 +246,7 @@ int do_adjtimex(struct timex *txc) result = -EINVAL; goto leave; } - time_freq = txc->freq; + time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); } if (txc->modes & ADJ_MAXERROR) { @@ -278,14 +279,14 @@ int do_adjtimex(struct timex *txc) time_adjust = txc->offset; } else if (time_status & STA_PLL) { - ltemp = txc->offset; + ltemp = txc->offset * NSEC_PER_USEC; /* * Scale the phase adjustment and * clamp to the operating range. */ - time_offset = min(ltemp, MAXPHASE); - time_offset = max(time_offset, -MAXPHASE); + time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC); + time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC); /* * Select whether the frequency is to be controlled @@ -297,24 +298,31 @@ int do_adjtimex(struct timex *txc) time_reftime = xtime.tv_sec; mtemp = xtime.tv_sec - time_reftime; time_reftime = xtime.tv_sec; + freq_adj = 0; if (time_status & STA_FLL) { if (mtemp >= MINSEC) { - ltemp = ((time_offset << 12) / mtemp) << (SHIFT_USEC - 12); - time_freq += shift_right(ltemp, SHIFT_KH); + freq_adj = (s64)time_offset << (SHIFT_NSEC - SHIFT_KH); + if (time_offset < 0) { + freq_adj = -freq_adj; + do_div(freq_adj, mtemp); + freq_adj = -freq_adj; + } else + do_div(freq_adj, mtemp); } else /* calibration interval too short (p. 12) */ result = TIME_ERROR; } else { /* PLL mode */ if (mtemp < MAXSEC) { - ltemp *= mtemp; - time_freq += shift_right(ltemp,(time_constant + + freq_adj = (s64)ltemp * mtemp; + freq_adj = shift_right(freq_adj,(time_constant + time_constant + - SHIFT_KF - SHIFT_USEC)); + SHIFT_KF - SHIFT_NSEC)); } else /* calibration interval too long (p. 12) */ result = TIME_ERROR; } - time_freq = min(time_freq, MAXFREQ); - time_freq = max(time_freq, -MAXFREQ); - time_offset = (time_offset * NSEC_PER_USEC / HZ) << SHIFT_UPDATE; + freq_adj += time_freq; + freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); + time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); + time_offset = (time_offset / HZ) << SHIFT_UPDATE; } /* STA_PLL */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) @@ -330,7 +338,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) txc->offset = save_adjust; else txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; - txc->freq = time_freq; + txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; -- cgit v1.2.3 From f19923937321244e7dc334767eb4b67e0e3d5c74 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sat, 30 Sep 2006 23:28:28 -0700 Subject: [PATCH] ntp: convert to the NTP4 reference model This converts the kernel ntp model into a model which matches the nanokernel reference implementations. The previous patches already increased the resolution and precision of the computations, so that this conversion becomes quite simple. explains: The original NTP kernel interface was defined in units of microseconds. That's what Linux implements. As computers have gotten faster and can now split microseconds easily, a new kernel interface using nanosecond units was defined ("the nanokernel", confusing as that name is to OS hackers), and there's an STA_NANO bit in the adjtimex() status field to tell the application which units it's using. The current ntpd supports both, but Linux loses some possible timing resolution because of quantization effects, and the ntpd hackers would really like to be able to drop the backwards compatibility code. Ulrich Windl has been maintaining a patch set to do the conversion for years, but it's hard to keep in sync. Signed-off-by: Roman Zippel Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 9137b54613e0..1ab5e9d7fa50 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -145,18 +145,11 @@ void second_overflow(void) } /* - * Compute the phase adjustment for the next second. In PLL mode, the - * offset is reduced by a fixed factor times the time constant. In FLL - * mode the offset is used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread the adjustment - * over not more than the number of seconds between updates. + * Compute the phase adjustment for the next second. The offset is + * reduced by a fixed factor times the time constant. */ tick_length = tick_length_base; - time_adj = time_offset; - if (!(time_status & STA_FLL)) - time_adj = shift_right(time_adj, SHIFT_KG + time_constant); - time_adj = min(time_adj, -((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC); - time_adj = max(time_adj, ((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC); + time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); time_offset -= time_adj; tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); @@ -200,7 +193,7 @@ void __attribute__ ((weak)) notify_arch_cmos_timer(void) int do_adjtimex(struct timex *txc) { long ltemp, mtemp, save_adjust; - s64 freq_adj; + s64 freq_adj, temp64; int result; /* In order to modify anything, you gotta be super-user! */ @@ -270,7 +263,7 @@ int do_adjtimex(struct timex *txc) result = -EINVAL; goto leave; } - time_constant = txc->constant; + time_constant = min(txc->constant + 4, (long)MAXTC); } if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ @@ -298,26 +291,20 @@ int do_adjtimex(struct timex *txc) time_reftime = xtime.tv_sec; mtemp = xtime.tv_sec - time_reftime; time_reftime = xtime.tv_sec; - freq_adj = 0; - if (time_status & STA_FLL) { - if (mtemp >= MINSEC) { - freq_adj = (s64)time_offset << (SHIFT_NSEC - SHIFT_KH); - if (time_offset < 0) { - freq_adj = -freq_adj; - do_div(freq_adj, mtemp); - freq_adj = -freq_adj; - } else - do_div(freq_adj, mtemp); - } else /* calibration interval too short (p. 12) */ - result = TIME_ERROR; - } else { /* PLL mode */ - if (mtemp < MAXSEC) { - freq_adj = (s64)ltemp * mtemp; - freq_adj = shift_right(freq_adj,(time_constant + - time_constant + - SHIFT_KF - SHIFT_NSEC)); - } else /* calibration interval too long (p. 12) */ - result = TIME_ERROR; + + freq_adj = (s64)time_offset * mtemp; + freq_adj = shift_right(freq_adj, time_constant * 2 + + (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); + if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { + temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL); + if (time_offset < 0) { + temp64 = -temp64; + do_div(temp64, mtemp); + freq_adj -= temp64; + } else { + do_div(temp64, mtemp); + freq_adj += temp64; + } } freq_adj += time_freq; freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); -- cgit v1.2.3 From 70bc42f90a3f4721c89dbe865e6c95da8565b41c Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 30 Sep 2006 23:28:29 -0700 Subject: [PATCH] kernel/time/ntp.c: possible cleanups This patch contains the following possible cleanups: - make the following needlessly global function static: - ntp_update_frequency() - make the following needlessly global variables static: - time_state - time_offset - time_constant - time_reftime - remove the following read-only global variable: - time_precision Signed-off-by: Adrian Bunk Cc: Roman Zippel Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 1ab5e9d7fa50..47195fa0ec4f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -30,17 +30,31 @@ static u64 tick_length, tick_length_base; * phase-lock loop variables */ /* TIME_ERROR prevents overwriting the CMOS clock */ -int time_state = TIME_OK; /* clock synchronization status */ +static int time_state = TIME_OK; /* clock synchronization status */ int time_status = STA_UNSYNC; /* clock status bits */ -long time_offset; /* time adjustment (ns) */ -long time_constant = 2; /* pll time constant */ -long time_precision = 1; /* clock precision (us) */ +static long time_offset; /* time adjustment (ns) */ +static long time_constant = 2; /* pll time constant */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ long time_freq; /* frequency offset (scaled ppm)*/ -long time_reftime; /* time at last adjustment (s) */ +static long time_reftime; /* time at last adjustment (s) */ long time_adjust; +#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE) +#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \ + (s64)CLOCK_TICK_RATE) + +static void ntp_update_frequency(void) +{ + tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; + tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); + + do_div(tick_length_base, HZ); + + tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; +} + /** * ntp_clear - Clears the NTP state variables * @@ -59,20 +73,6 @@ void ntp_clear(void) time_offset = 0; } -#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE) -#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / (s64)CLOCK_TICK_RATE) - -void ntp_update_frequency(void) -{ - tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; - tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; - tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); - - do_div(tick_length_base, HZ); - - tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; -} - /* * this routine handles the overflow of the microsecond field * @@ -330,7 +330,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) txc->esterror = time_esterror; txc->status = time_status; txc->constant = time_constant; - txc->precision = time_precision; + txc->precision = 1; txc->tolerance = MAXFREQ; txc->tick = tick_usec; -- cgit v1.2.3 From 8ef386092d7c2891bd7acefb2a87f878f7e9a0d6 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Sat, 30 Sep 2006 23:28:31 -0700 Subject: [PATCH] kill wall_jiffies With 2.6.18-rc4-mm2, now wall_jiffies will always be the same as jiffies. So we can kill wall_jiffies completely. This is just a cleanup and logically should not change any real behavior except for one thing: RTC updating code in (old) ppc and xtensa use a condition "jiffies - wall_jiffies == 1". This condition is never met so I suppose it is just a bug. I just remove that condition only instead of kill the whole "if" block. [heiko.carstens@de.ibm.com: s390 build fix and cleanup] Signed-off-by: Atsushi Nemoto Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Mikael Starvik Cc: David Howells Cc: Yoshinori Sato Cc: Hirokazu Takata Cc: Ralf Baechle Cc: Kyle McMartin Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Richard Curnow Cc: William Lee Irwin III Cc: "David S. Miller" Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Miles Bader Cc: Chris Zankel Cc: "Luck, Tony" Cc: Geert Uytterhoeven Cc: Roman Zippel Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 654dd5df2417..c1c7fbcffec1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -768,7 +768,7 @@ static int timekeeping_suspended; * @dev: unused * * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are + * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. */ static int timekeeping_resume(struct sys_device *dev) @@ -1016,9 +1016,6 @@ static inline void calc_load(unsigned long ticks) } } -/* jiffies at the most recent update of wall time */ -unsigned long wall_jiffies = INITIAL_JIFFIES; - /* * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. @@ -1056,7 +1053,6 @@ void run_local_timers(void) */ static inline void update_times(unsigned long ticks) { - wall_jiffies += ticks; update_wall_time(); calc_load(ticks); } -- cgit v1.2.3 From 0ae646845b603e9df5711084436d389f8371ffb3 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Sat, 30 Sep 2006 23:28:53 -0700 Subject: [PATCH] Fix taskstats size calculation (use the new genetlink utility functions) The addition of the CSA patch pushed the size of struct taskstats to 256 bytes. This exposed a problem with prepare_reply(), we were not allocating space for the netlink and genetlink header. It worked earlier because alloc_skb() would align the skb to SMP_CACHE_BYTES, which added some additonal bytes. Signed-off-by: Balbir Singh Cc: Jamal Hadi Cc: Shailabh Nagar Cc: Thomas Graf Cc: "David S. Miller" Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 2ed4040d0dc5..c451af2ddb50 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -75,7 +75,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - skb = nlmsg_new(size, GFP_KERNEL); + skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); if (!skb) return -ENOMEM; -- cgit v1.2.3 From f3cef7a99469afc159fec3a61b42dc7ca5b6824f Mon Sep 17 00:00:00 2001 From: Jay Lan Date: Sat, 30 Sep 2006 23:28:55 -0700 Subject: [PATCH] csa: basic accounting over taskstats Add some basic accounting fields to the taskstats struct, add a new kernel/tsacct.c to handle basic accounting data handling upon exit. A handle is added to taskstats.c to invoke the basic accounting data handling. Signed-off-by: Jay Lan Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jes Sorensen Cc: Chris Sturtivant Cc: Tony Ernst Cc: Guillaume Thouvenin Cc: "Michal Piotrowski" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/taskstats.c | 4 +++ kernel/tsacct.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 kernel/tsacct.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index e210e8cf7237..aacaafb28b9d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -49,7 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o -obj-$(CONFIG_TASKSTATS) += taskstats.o +obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/taskstats.c b/kernel/taskstats.c index c451af2ddb50..6c38dce88e8c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -198,7 +199,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, */ delayacct_add_tsk(stats, tsk); + + /* fill in basic acct fields */ stats->version = TASKSTATS_VERSION; + bacct_add_tsk(stats, tsk); /* Define err: label here if needed */ put_task_struct(tsk); diff --git a/kernel/tsacct.c b/kernel/tsacct.c new file mode 100644 index 000000000000..899067950a88 --- /dev/null +++ b/kernel/tsacct.c @@ -0,0 +1,72 @@ +/* + * tsacct.c - System accounting over taskstats interface + * + * Copyright (C) Jay Lan, + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include + + +#define USEC_PER_TICK (USEC_PER_SEC/HZ) +/* + * fill in basic accounting fields + */ +void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + struct timespec uptime, ts; + s64 ac_etime; + + BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); + + /* calculate task elapsed time in timespec */ + do_posix_clock_monotonic_gettime(&uptime); + ts = timespec_sub(uptime, current->group_leader->start_time); + /* rebase elapsed time to usec */ + ac_etime = timespec_to_ns(&ts); + do_div(ac_etime, NSEC_PER_USEC); + stats->ac_etime = ac_etime; + stats->ac_btime = xtime.tv_sec - ts.tv_sec; + if (thread_group_leader(tsk)) { + stats->ac_exitcode = tsk->exit_code; + if (tsk->flags & PF_FORKNOEXEC) + stats->ac_flag |= AFORK; + } + if (tsk->flags & PF_SUPERPRIV) + stats->ac_flag |= ASU; + if (tsk->flags & PF_DUMPCORE) + stats->ac_flag |= ACORE; + if (tsk->flags & PF_SIGNALED) + stats->ac_flag |= AXSIG; + stats->ac_nice = task_nice(tsk); + stats->ac_sched = tsk->policy; + stats->ac_uid = tsk->uid; + stats->ac_gid = tsk->gid; + stats->ac_pid = tsk->pid; + stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; + stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; + stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; + stats->ac_minflt = tsk->min_flt; + stats->ac_majflt = tsk->maj_flt; + /* Each process gets a minimum of one usec cpu time */ + if ((stats->ac_utime == 0) && (stats->ac_stime == 0)) { + stats->ac_stime = 1; + } + + strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); +} + -- cgit v1.2.3 From 9acc1853519a0473620d424105f9d49ea5b4e62e Mon Sep 17 00:00:00 2001 From: Jay Lan Date: Sat, 30 Sep 2006 23:28:58 -0700 Subject: [PATCH] csa: Extended system accounting over taskstats Add extended system accounting handling over taskstats interface. A CONFIG_TASK_XACCT flag is created to enable the extended accounting code. Signed-off-by: Jay Lan Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jes Sorensen Cc: Chris Sturtivant Cc: Tony Ernst Cc: Guillaume Thouvenin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 4 ++++ kernel/tsacct.c | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 6c38dce88e8c..5d6a8c54ee85 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -204,6 +205,9 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, stats->version = TASKSTATS_VERSION; bacct_add_tsk(stats, tsk); + /* fill in extended acct fields */ + xacct_add_tsk(stats, tsk); + /* Define err: label here if needed */ put_task_struct(tsk); return rc; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 899067950a88..410483490cf6 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -70,3 +70,22 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); } + +#ifdef CONFIG_TASK_XACCT +/* + * fill in extended accounting fields + */ +void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) +{ + stats->acct_rss_mem1 = p->acct_rss_mem1; + stats->acct_vm_mem1 = p->acct_vm_mem1; + if (p->mm) { + stats->hiwater_rss = p->mm->hiwater_rss; + stats->hiwater_vm = p->mm->hiwater_vm; + } + stats->read_char = p->rchar; + stats->write_char = p->wchar; + stats->read_syscalls = p->syscr; + stats->write_syscalls = p->syscw; +} +#endif -- cgit v1.2.3 From 8f0ab5147951267134612570604cf8341901a80c Mon Sep 17 00:00:00 2001 From: Jay Lan Date: Sat, 30 Sep 2006 23:28:59 -0700 Subject: [PATCH] csa: convert CONFIG tag for extended accounting routines There were a few accounting data/macros that are used in CSA but are #ifdef'ed inside CONFIG_BSD_PROCESS_ACCT. This patch is to change those ifdef's from CONFIG_BSD_PROCESS_ACCT to CONFIG_TASK_XACCT. A few defines are moved from kernel/acct.c and include/linux/acct.h to kernel/tsacct.c and include/linux/tsacct_kern.h. Signed-off-by: Jay Lan Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jes Sorensen Cc: Chris Sturtivant Cc: Tony Ernst Cc: Guillaume Thouvenin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 30 ------------------------------ kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sched.c | 2 +- kernel/tsacct.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 33 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index f4330acead46..0aad5ca36a81 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -602,33 +602,3 @@ void acct_process(void) do_acct_process(file); fput(file); } - - -/** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting - */ -void acct_update_integrals(struct task_struct *tsk) -{ - if (likely(tsk->mm)) { - long delta = - cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; - - if (delta == 0) - return; - tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - } -} - -/** - * acct_clear_integrals - clear the mm integral fields in task_struct - * @tsk: task_struct whose accounting fields are cleared - */ -void acct_clear_integrals(struct task_struct *tsk) -{ - tsk->acct_stimexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; -} diff --git a/kernel/exit.c b/kernel/exit.c index c189de2927ab..3b47f26985f2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 1c999f3e0b47..89f666491d1f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched.c b/kernel/sched.c index 74f169ac0773..2bbd948f0169 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -49,7 +49,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 410483490cf6..47c71daa416f 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -88,4 +88,34 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->read_syscalls = p->syscr; stats->write_syscalls = p->syscw; } + + +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting + */ +void acct_update_integrals(struct task_struct *tsk) +{ + if (likely(tsk->mm)) { + long delta = + cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; + + if (delta == 0) + return; + tsk->acct_stimexpd = tsk->stime; + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + } +} + +/** + * acct_clear_integrals - clear the mm integral fields in task_struct + * @tsk: task_struct whose accounting fields are cleared + */ +void acct_clear_integrals(struct task_struct *tsk) +{ + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; +} #endif -- cgit v1.2.3 From db5fed26b2e0beed939b773dd5896077a1794d65 Mon Sep 17 00:00:00 2001 From: Jay Lan Date: Sat, 30 Sep 2006 23:29:00 -0700 Subject: [PATCH] csa accounting taskstats update ChangeLog: Feedbacks from Andrew Morton: - define TS_COMM_LEN to 32 - change acct_stimexpd field of task_struct to be of cputime_t, which is to be used to save the tsk->stime of last timer interrupt update. - a new Documentation/accounting/taskstats-struct.txt to describe fields of taskstats struct. Feedback from Balbir Singh: - keep the stime of a task to be zero when both stime and utime are zero as recoreded in task_struct. Misc: - convert accumulated RSS/VM from platform dependent pages-ticks to MBytes-usecs in the kernel Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jes Sorensen Cc: Chris Sturtivant Cc: Tony Ernst Cc: Guillaume Thouvenin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 47c71daa416f..db443221ba5b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -20,6 +20,7 @@ #include #include #include +#include #define USEC_PER_TICK (USEC_PER_SEC/HZ) @@ -62,33 +63,35 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; - /* Each process gets a minimum of one usec cpu time */ - if ((stats->ac_utime == 0) && (stats->ac_stime == 0)) { - stats->ac_stime = 1; - } strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); } #ifdef CONFIG_TASK_XACCT + +#define KB 1024 +#define MB (1024*KB) /* * fill in extended accounting fields */ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { - stats->acct_rss_mem1 = p->acct_rss_mem1; - stats->acct_vm_mem1 = p->acct_vm_mem1; + /* convert pages-jiffies to Mbyte-usec */ + stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; + stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; if (p->mm) { - stats->hiwater_rss = p->mm->hiwater_rss; - stats->hiwater_vm = p->mm->hiwater_vm; + /* adjust to KB unit */ + stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB; + stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB; } stats->read_char = p->rchar; stats->write_char = p->wchar; stats->read_syscalls = p->syscr; stats->write_syscalls = p->syscw; } - +#undef KB +#undef MB /** * acct_update_integrals - update mm integral fields in task_struct @@ -97,8 +100,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) void acct_update_integrals(struct task_struct *tsk) { if (likely(tsk->mm)) { - long delta = - cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; + long delta = cputime_to_jiffies( + cputime_sub(tsk->stime, tsk->acct_stimexpd)); if (delta == 0) return; -- cgit v1.2.3 From d8c76e6f45c111c32a4b3e50a2adc9210737b0d8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 30 Sep 2006 23:29:04 -0700 Subject: [PATCH] r/o bind mount prepwork: inc_nlink() helper This is mostly included for parity with dec_nlink(), where we will have some more hooks. This one should stay pretty darn straightforward for now. Signed-off-by: Dave Hansen Acked-by: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8c3c400cce91..9d850ae13b1b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -377,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directories start off with i_nlink == 2 (for "." entry) */ - inode->i_nlink++; + inc_nlink(inode); } else { return -ENOMEM; } @@ -1565,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) inode->i_fop = &simple_dir_operations; /* start off with i_nlink == 2 (for "." entry) */ - inode->i_nlink++; + inc_nlink(inode); } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cpuset_file_operations; @@ -1598,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode) error = cpuset_create_file(dentry, S_IFDIR | mode); if (!error) { dentry->d_fsdata = cs; - parent->d_inode->i_nlink++; + inc_nlink(parent->d_inode); cs->dentry = dentry; } dput(dentry); @@ -2033,7 +2033,7 @@ int __init cpuset_init(void) } root = cpuset_mount->mnt_sb->s_root; root->d_fsdata = &top_cpuset; - root->d_inode->i_nlink++; + inc_nlink(root->d_inode); top_cpuset.dentry = root; root->d_inode->i_op = &cpuset_dir_inode_operations; number_of_cpusets = 1; -- cgit v1.2.3 From e239ca540594cff00adcce163dc332b27015d8e5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 30 Sep 2006 23:29:27 -0700 Subject: [PATCH] Create call_usermodehelper_pipe() A new member in the ever growing family of call_usermode* functions is born. The new call_usermodehelper_pipe() function allows to pipe data to the stdin of the called user mode progam and behaves otherwise like the normal call_usermodehelp() (except that it always waits for the child to finish) Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 842f8015d7fd..5c63c53014a9 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -122,6 +122,7 @@ struct subprocess_info { struct key *ring; int wait; int retval; + struct file *stdin; }; /* @@ -145,12 +146,26 @@ static int ____call_usermodehelper(void *data) key_put(old_session); + /* Install input pipe when needed */ + if (sub_info->stdin) { + struct files_struct *f = current->files; + struct fdtable *fdt; + /* no races because files should be private here */ + sys_close(0); + fd_install(0, sub_info->stdin); + spin_lock(&f->file_lock); + fdt = files_fdtable(f); + FD_SET(0, fdt->open_fds); + FD_CLR(0, fdt->close_on_exec); + spin_unlock(&f->file_lock); + } + /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed(current, CPU_MASK_ALL); retval = -EPERM; if (current->fs->root) - retval = execve(sub_info->path, sub_info->argv,sub_info->envp); + retval = execve(sub_info->path, sub_info->argv, sub_info->envp); /* Exec failed? */ sub_info->retval = retval; @@ -268,6 +283,44 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, } EXPORT_SYMBOL(call_usermodehelper_keys); +int call_usermodehelper_pipe(char *path, char **argv, char **envp, + struct file **filp) +{ + DECLARE_COMPLETION(done); + struct subprocess_info sub_info = { + .complete = &done, + .path = path, + .argv = argv, + .envp = envp, + .retval = 0, + }; + struct file *f; + DECLARE_WORK(work, __call_usermodehelper, &sub_info); + + if (!khelper_wq) + return -EBUSY; + + if (path[0] == '\0') + return 0; + + f = create_write_pipe(); + if (!f) + return -ENOMEM; + *filp = f; + + f = create_read_pipe(f); + if (!f) { + free_write_pipe(*filp); + return -ENOMEM; + } + sub_info.stdin = f; + + queue_work(khelper_wq, &work); + wait_for_completion(&done); + return sub_info.retval; +} +EXPORT_SYMBOL(call_usermodehelper_pipe); + void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); -- cgit v1.2.3 From d025c9db7f31fc0554ce7fb2dfc78d35a77f3487 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 30 Sep 2006 23:29:28 -0700 Subject: [PATCH] Support piping into commands in /proc/sys/kernel/core_pattern Using the infrastructure created in previous patches implement support to pipe core dumps into programs. This is done by overloading the existing core_pattern sysctl with a new syntax: |program When the first character of the pattern is a '|' the kernel will instead threat the rest of the pattern as a command to run. The core dump will be written to the standard input of that program instead of to a file. This is useful for having automatic core dump analysis without filling up disks. The program can do some simple analysis and save only a summary of the core dump. The core dump proces will run with the privileges and in the name space of the process that caused the core dump. I also increased the core pattern size to 128 bytes so that longer command lines fit. Most of the changes comes from allowing core dumps without seeks. They are fairly straight forward though. One small incompatibility is that if someone had a core pattern previously that started with '|' they will get suddenly new behaviour. I think that's unlikely to be a real problem though. Additional background: > Very nice, do you happen to have a program that can accept this kind of > input for crash dumps? I'm guessing that the embedded people will > really want this functionality. I had a cheesy demo/prototype. Basically it wrote the dump to a file again, ran gdb on it to get a backtrace and wrote the summary to a shared directory. Then there was a simple CGI script to generate a "top 10" crashes HTML listing. Unfortunately this still had the disadvantage to needing full disk space for a dump except for deleting it afterwards (in fact it was worse because over the pipe holes didn't work so if you have a holey address map it would require more space). Fortunately gdb seems to be happy to handle /proc/pid/fd/xxx input pipes as cores (at least it worked with zsh's =(cat core) syntax), so it would be likely possible to do it without temporary space with a simple wrapper that calls it in the right way. I ran out of time before doing that though. The demo prototype scripts weren't very good. If there is really interest I can dig them out (they are currently on a laptop disk on the desk with the laptop itself being in service), but I would recommend to rewrite them for any serious application of this and fix the disk space problem. Also to be really useful it should probably find a way to automatically fetch the debuginfos (I cheated and just installed them in advance). If nobody else does it I can probably do the rewrite myself again at some point. My hope at some point was that desktops would support it in their builtin crash reporters, but at least the KDE people I talked too seemed to be happy with their user space only solution. Alan sayeth: I don't believe that piping as such as neccessarily the right model, but the ability to intercept and processes core dumps from user space is asked for by many enterprise users as well. They want to know about, capture, analyse and process core dumps, often centrally and in automated form. [akpm@osdl.org: loff_t != unsigned long] Signed-off-by: Andi Kleen Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 4 ++++ kernel/sysctl.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 5c63c53014a9..f8121b95183f 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -35,6 +35,7 @@ #include #include #include +#include #include extern int max_threads; @@ -158,6 +159,9 @@ static int ____call_usermodehelper(void *data) FD_SET(0, fdt->open_fds); FD_CLR(0, fdt->close_on_exec); spin_unlock(&f->file_lock); + + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0}; } /* We can run anywhere, unlike our parent keventd(). */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c57c4532e296..ba42694f0453 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -294,7 +294,7 @@ static ctl_table kern_table[] = { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", .data = core_pattern, - .maxlen = 64, + .maxlen = 128, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, -- cgit v1.2.3 From 2bc2d61a9638dab670d8361e928d1a5a291173ef Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Oct 2006 02:17:02 -0700 Subject: [PATCH] list module taint flags in Oops/panic When listing loaded modules during an oops or panic, also list each module's Tainted flags if non-zero (P: Proprietary or F: Forced load only). If a module is did not taint the kernel, it is just listed like usbcore but if it did taint the kernel, it is listed like wizmodem(PF) Example: [ 3260.121718] Unable to handle kernel NULL pointer dereference at 0000000000000000 RIP: [ 3260.121729] [] :dump_test:proc_dump_test+0x99/0xc8 [ 3260.121742] PGD fe8d067 PUD 264a6067 PMD 0 [ 3260.121748] Oops: 0002 [1] SMP [ 3260.121753] CPU 1 [ 3260.121756] Modules linked in: dump_test(P) snd_pcm_oss snd_mixer_oss snd_seq snd_seq_device ide_cd generic ohci1394 snd_hda_intel snd_hda_codec snd_pcm snd_timer snd ieee1394 snd_page_alloc piix ide_core arcmsr aic79xx scsi_transport_spi usblp [ 3260.121785] Pid: 5556, comm: bash Tainted: P 2.6.18-git10 #1 [Alternatively, I can look into listing tainted flags with 'lsmod', but that won't help in oopsen/panics so much.] [akpm@osdl.org: cleanup] Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 05625d5dc758..7c77a0a9275c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -851,6 +851,7 @@ static int check_version(Elf_Shdr *sechdrs, printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; } return 1; } @@ -1339,6 +1340,7 @@ static void set_license(struct module *mod, const char *license) printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", mod->name, license); add_taint(TAINT_PROPRIETARY_MODULE); + mod->taints |= TAINT_PROPRIETARY_MODULE; } } @@ -1618,6 +1620,7 @@ static struct module *load_module(void __user *umod, /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", mod->name); } else if (!same_magic(modmagic, vermagic)) { @@ -1711,10 +1714,14 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); - if (strcmp(mod->name, "ndiswrapper") == 0) + if (strcmp(mod->name, "ndiswrapper") == 0) { add_taint(TAINT_PROPRIETARY_MODULE); - if (strcmp(mod->name, "driverloader") == 0) + mod->taints |= TAINT_PROPRIETARY_MODULE; + } + if (strcmp(mod->name, "driverloader") == 0) { add_taint(TAINT_PROPRIETARY_MODULE); + mod->taints |= TAINT_PROPRIETARY_MODULE; + } /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); @@ -1760,6 +1767,7 @@ static struct module *load_module(void __user *umod, printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; } #endif @@ -2226,14 +2234,37 @@ struct module *module_text_address(unsigned long addr) return mod; } +static char *taint_flags(unsigned int taints, char *buf) +{ + *buf = '\0'; + if (taints) { + int bx; + + buf[0] = '('; + bx = 1; + if (taints & TAINT_PROPRIETARY_MODULE) + buf[bx++] = 'P'; + if (taints & TAINT_FORCED_MODULE) + buf[bx++] = 'F'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + buf[bx] = ')'; + } + return buf; +} + /* Don't grab lock, we're oopsing. */ void print_modules(void) { struct module *mod; + char buf[8]; printk("Modules linked in:"); list_for_each_entry(mod, &modules, list) - printk(" %s", mod->name); + printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); printk("\n"); } -- cgit v1.2.3 From 0804ef4b0de7121261f77c565b20a11ac694e877 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 2 Oct 2006 02:17:04 -0700 Subject: [PATCH] proc: readdir race fix (take 3) The problem: An opendir, readdir, closedir sequence can fail to report process ids that are continually in use throughout the sequence of system calls. For this race to trigger the process that proc_pid_readdir stops at must exit before readdir is called again. This can cause ps to fail to report processes, and it is in violation of posix guarantees and normal application expectations with respect to readdir. Currently there is no way to work around this problem in user space short of providing a gargantuan buffer to user space so the directory read all happens in on system call. This patch implements the normal directory semantics for proc, that guarantee that a directory entry that is neither created nor destroyed while reading the directory entry will be returned. For directory that are either created or destroyed during the readdir you may or may not see them. Furthermore you may seek to a directory offset you have previously seen. These are the guarantee that ext[23] provides and that posix requires, and more importantly that user space expects. Plus it is a simple semantic to implement reliable service. It is just a matter of calling readdir a second time if you are wondering if something new has show up. These better semantics are implemented by scanning through the pids in numerical order and by making the file offset a pid plus a fixed offset. The pid scan happens on the pid bitmap, which when you look at it is remarkably efficient for a brute force algorithm. Given that a typical cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There are only 40 cache lines for the entire 32K pid space. A typical system will have 100 pids or more so this is actually fewer cache lines we have to look at to scan a linked list, and the worst case of having to scan the entire pid bitmap is pretty reasonable. If we need something more efficient we can go to a more efficient data structure for indexing the pids, but for now what we have should be sufficient. In addition this takes no additional locks and is actually less code than what we are doing now. Also another very subtle bug in this area has been fixed. It is possible to catch a task in the middle of de_thread where a thread is assuming the thread of it's thread group leader. This patch carefully handles that case so if we hit it we don't fail to return the pid, that is undergoing the de_thread dance. Thanks to KAMEZAWA Hiroyuki for providing the first fix, pointing this out and working on it. [oleg@tv-sign.ru: fix it] Signed-off-by: Eric W. Biederman Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Oleg Nesterov Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 8387e8c68193..ed89a732432c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -145,6 +145,23 @@ static int alloc_pidmap(void) return -1; } +static int next_pidmap(int last) +{ + int offset; + pidmap_t *map; + + offset = (last + 1) & BITS_PER_PAGE_MASK; + map = &pidmap_array[(last + 1)/BITS_PER_PAGE]; + for (; map < &pidmap_array[PIDMAP_ENTRIES]; map++, offset = 0) { + if (unlikely(!map->page)) + continue; + offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); + if (offset < BITS_PER_PAGE) + return mk_pid(map, offset); + } + return -1; +} + fastcall void put_pid(struct pid *pid) { if (!pid) @@ -302,6 +319,25 @@ struct pid *find_get_pid(pid_t nr) return pid; } +/* + * Used by proc to find the first pid that is greater then or equal to nr. + * + * If there is a pid at nr this function is exactly the same as find_pid. + */ +struct pid *find_ge_pid(int nr) +{ + struct pid *pid; + + do { + pid = find_pid(nr); + if (pid) + break; + nr = next_pidmap(nr); + } while (nr > 0); + + return pid; +} + /* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or -- cgit v1.2.3 From c4b92fc112f7be5cce308128236ff75cc98535c3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 2 Oct 2006 02:17:10 -0700 Subject: [PATCH] pid: implement signal functions that take a struct pid * Currently the signal functions all either take a task or a pid_t argument. This patch implements variants that take a struct pid *. After all of the users have been update it is my intention to remove the variants that take a pid_t as using pid_t can be more work (an extra hash table lookup) and difficult to get right in the presence of multiple pid namespaces. There are two kinds of functions introduced in this patch. The are the general use functions kill_pgrp and kill_pid which take a priv argument that is ultimately used to create the appropriate siginfo information, Then there are _kill_pgrp_info, kill_pgrp_info, kill_pid_info the internal implementation helpers that take an explicit siginfo. The distinction is made because filling out an explcit siginfo is tricky, and will be even more tricky when pid namespaces are introduced. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index fb5da6d19f14..5230ddcb1757 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1055,28 +1055,44 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) } /* - * kill_pg_info() sends a signal to a process group: this is what the tty + * kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) */ -int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; int retval, success; - if (pgrp <= 0) - return -EINVAL; - success = 0; retval = -ESRCH; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p); success |= !err; retval = err; - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return success ? 0 : retval; } +int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) +{ + int retval; + + read_lock(&tasklist_lock); + retval = __kill_pgrp_info(sig, info, pgrp); + read_unlock(&tasklist_lock); + + return retval; +} + +int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +{ + if (pgrp <= 0) + return -EINVAL; + + return __kill_pgrp_info(sig, info, find_pid(pgrp)); +} + int kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) { @@ -1089,8 +1105,7 @@ kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) return retval; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) { int error; int acquired_tasklist_lock = 0; @@ -1101,7 +1116,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) read_lock(&tasklist_lock); acquired_tasklist_lock = 1; } - p = find_task_by_pid(pid); + p = pid_task(pid, PIDTYPE_PID); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); @@ -1111,6 +1126,16 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } +int +kill_proc_info(int sig, struct siginfo *info, pid_t pid) +{ + int error; + rcu_read_lock(); + error = kill_pid_info(sig, info, find_pid(pid)); + rcu_read_unlock(); + return error; +} + /* like kill_proc_info(), but doesn't use uid/euid of "current" */ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, uid_t uid, uid_t euid, u32 secid) @@ -1264,6 +1289,18 @@ force_sigsegv(int sig, struct task_struct *p) return 0; } +int kill_pgrp(struct pid *pid, int sig, int priv) +{ + return kill_pgrp_info(sig, __si_special(priv), pid); +} +EXPORT_SYMBOL(kill_pgrp); + +int kill_pid(struct pid *pid, int sig, int priv) +{ + return kill_pid_info(sig, __si_special(priv), pid); +} +EXPORT_SYMBOL(kill_pid); + int kill_pg(pid_t pgrp, int sig, int priv) { -- cgit v1.2.3 From bbf73147e2d46611fbdcbc126f887c614c32350b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 2 Oct 2006 02:17:11 -0700 Subject: [PATCH] pid: export the symbols needed to use struct pid * pids aren't something that drivers should care about. However there are a lot of helper layers in the kernel that do care, and are built as modules. Before I can convert them to using struct pid instead of pid_t I need to export the appropriate symbols so they can continue to be built. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index ed89a732432c..5c9037ba42f5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -170,6 +170,7 @@ fastcall void put_pid(struct pid *pid) atomic_dec_and_test(&pid->count)) kmem_cache_free(pid_cachep, pid); } +EXPORT_SYMBOL_GPL(put_pid); static void delayed_put_pid(struct rcu_head *rhp) { @@ -234,6 +235,7 @@ struct pid * fastcall find_pid(int nr) } return NULL; } +EXPORT_SYMBOL_GPL(find_pid); int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) { @@ -337,6 +339,7 @@ struct pid *find_ge_pid(int nr) return pid; } +EXPORT_SYMBOL_GPL(find_get_pid); /* * The pid hash table is scaled according to the amount of memory in the -- cgit v1.2.3 From 609d7fa9565c754428d2520cac2accc9052e1245 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 2 Oct 2006 02:17:15 -0700 Subject: [PATCH] file: modify struct fown_struct to use a struct pid File handles can be requested to send sigio and sigurg to processes. By tracking the destination processes using struct pid instead of pid_t we make the interface safe from all potential pid wrap around problems. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4b6770e9806d..4aaf91951a43 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1527,7 +1527,7 @@ static int futex_fd(u32 __user *uaddr, int signal) filp->f_mapping = filp->f_dentry->d_inode->i_mapping; if (signal) { - err = f_setown(filp, current->pid, 1); + err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); if (err < 0) { goto error; } -- cgit v1.2.3 From 6a1f3b84557774a46af68747c92d8f36382027ae Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Mon, 2 Oct 2006 02:17:20 -0700 Subject: [PATCH] pids: coding style: use struct pidmap Use struct pidmap instead of pidmap_t. Its a subset of Eric Biederman's patch http://lkml.org/lkml/2006/2/6/271. Signed-off-by: Eric W. Biederman Signed-off-by: Sukadev Bhattiprolu Cc: Dave Hansen Cc: Serge Hallyn Cc: Cedric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 5c9037ba42f5..0a45de2918e2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -53,12 +53,12 @@ int pid_max_max = PID_MAX_LIMIT; * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ -typedef struct pidmap { +struct pidmap { atomic_t nr_free; void *page; -} pidmap_t; +}; -static pidmap_t pidmap_array[PIDMAP_ENTRIES] = +static struct pidmap pidmap_array[PIDMAP_ENTRIES] = { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; /* @@ -78,7 +78,7 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); static fastcall void free_pidmap(int pid) { - pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; + struct pidmap *map = pidmap_array + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; clear_bit(offset, map->page); @@ -88,7 +88,7 @@ static fastcall void free_pidmap(int pid) static int alloc_pidmap(void) { int i, offset, max_scan, pid, last = last_pid; - pidmap_t *map; + struct pidmap *map; pid = last + 1; if (pid >= pid_max) -- cgit v1.2.3 From c88be3eb2e01bbb21c9ccdc3805f0d3546c1898c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 2 Oct 2006 02:17:21 -0700 Subject: [PATCH] pids coding style use struct pidmap in next_pidmap Use struct pidmap instead of pidmap_t. This updates my proc: readdir race fix (take 3) patch to account for the changes made by: Sukadev Bhattiprolu to kill pidmap_t. Signed-off-by: Eric W. Biederman Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 0a45de2918e2..373639a10d84 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -148,7 +148,7 @@ static int alloc_pidmap(void) static int next_pidmap(int last) { int offset; - pidmap_t *map; + struct pidmap *map; offset = (last + 1) & BITS_PER_PAGE_MASK; map = &pidmap_array[(last + 1)/BITS_PER_PAGE]; -- cgit v1.2.3 From aa5a6662f93f52605b6c447ba6f7291e92f515c5 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Mon, 2 Oct 2006 02:17:23 -0700 Subject: [PATCH] Move pidmap to pspace.h Move struct pidmap and PIDMAP_ENTRIES to a new file, include/linux/pspace.h where it will be used in subsequent patches to define pid spaces. Its a subset of Eric Biederman's patch http://lkml.org/lkml/2006/2/6/285 [akpm@osdl.org: cleanups] Signed-off-by: Eric W. Biederman Signed-off-by: Sukadev Bhattiprolu Cc: Dave Hansen Cc: Serge Hallyn Cc: Cedric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 373639a10d84..8234bd08a3cb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -26,6 +26,7 @@ #include #include #include +#include #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; @@ -40,7 +41,6 @@ int last_pid; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8) #define BITS_PER_PAGE (PAGE_SIZE*8) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) #define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off)) @@ -53,11 +53,6 @@ int pid_max_max = PID_MAX_LIMIT; * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ -struct pidmap { - atomic_t nr_free; - void *page; -}; - static struct pidmap pidmap_array[PIDMAP_ENTRIES] = { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; -- cgit v1.2.3 From 3fbc96486459324e182717b03c50c90c880be6ec Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Mon, 2 Oct 2006 02:17:24 -0700 Subject: [PATCH] Define struct pspace Define a per-container pid space object. And create one instance of this object, init_pspace, to define the entire pid space. Subsequent patches will provide/use interfaces to create/destroy pid spaces. Its a subset/rework of Eric Biederman's patch http://lkml.org/lkml/2006/2/6/285 . Signed-off-by: Eric Biederman Signed-off-by: Sukadev Bhattiprolu Cc: Dave Hansen Cc: Serge Hallyn Cc: Cedric Le Goater Cc: Kirill Korotaev Cc: Andrey Savochkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 8234bd08a3cb..89107b7481af 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -34,7 +34,6 @@ static int pidhash_shift; static kmem_cache_t *pid_cachep; int pid_max = PID_MAX_DEFAULT; -int last_pid; #define RESERVED_PIDS 300 @@ -43,7 +42,12 @@ int pid_max_max = PID_MAX_LIMIT; #define BITS_PER_PAGE (PAGE_SIZE*8) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off)) + +static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) +{ + return (map - pspace->pidmap)*BITS_PER_PAGE + off; +} + #define find_next_offset(map, off) \ find_next_zero_bit((map)->page, BITS_PER_PAGE, off) @@ -53,8 +57,12 @@ int pid_max_max = PID_MAX_LIMIT; * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ -static struct pidmap pidmap_array[PIDMAP_ENTRIES] = - { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; +struct pspace init_pspace = { + .pidmap = { + [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } + }, + .last_pid = 0 +}; /* * Note: disable interrupts while the pidmap_lock is held as an @@ -69,40 +77,41 @@ static struct pidmap pidmap_array[PIDMAP_ENTRIES] = * irq handlers that take it we can leave the interrupts enabled. * For now it is easier to be safe than to prove it can't happen. */ + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(int pid) +static fastcall void free_pidmap(struct pspace *pspace, int pid) { - struct pidmap *map = pidmap_array + pid / BITS_PER_PAGE; + struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; clear_bit(offset, map->page); atomic_inc(&map->nr_free); } -static int alloc_pidmap(void) +static int alloc_pidmap(struct pspace *pspace) { - int i, offset, max_scan, pid, last = last_pid; + int i, offset, max_scan, pid, last = pspace->last_pid; struct pidmap *map; pid = last + 1; if (pid >= pid_max) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; - map = &pidmap_array[pid/BITS_PER_PAGE]; + map = &pspace->pidmap[pid/BITS_PER_PAGE]; max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { - unsigned long page = get_zeroed_page(GFP_KERNEL); + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* * Free the page if someone raced with us * installing it: */ spin_lock_irq(&pidmap_lock); if (map->page) - free_page(page); + kfree(page); else - map->page = (void *)page; + map->page = page; spin_unlock_irq(&pidmap_lock); if (unlikely(!map->page)) break; @@ -111,11 +120,11 @@ static int alloc_pidmap(void) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - last_pid = pid; + pspace->last_pid = pid; return pid; } offset = find_next_offset(map, offset); - pid = mk_pid(map, offset); + pid = mk_pid(pspace, map, offset); /* * find_next_offset() found a bit, the pid from it * is in-bounds, and if we fell back to the last @@ -126,16 +135,16 @@ static int alloc_pidmap(void) (i != max_scan || pid < last || !((last+1) & BITS_PER_PAGE_MASK))); } - if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) { + if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; offset = 0; } else { - map = &pidmap_array[0]; + map = &pspace->pidmap[0]; offset = RESERVED_PIDS; if (unlikely(last == offset)) break; } - pid = mk_pid(map, offset); + pid = mk_pid(pspace, map, offset); } return -1; } @@ -182,7 +191,7 @@ fastcall void free_pid(struct pid *pid) hlist_del_rcu(&pid->pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - free_pidmap(pid->nr); + free_pidmap(&init_pspace, pid->nr); call_rcu(&pid->rcu, delayed_put_pid); } @@ -196,7 +205,7 @@ struct pid *alloc_pid(void) if (!pid) goto out; - nr = alloc_pidmap(); + nr = alloc_pidmap(&init_pspace); if (nr < 0) goto out_free; @@ -363,10 +372,10 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); + init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, pidmap_array->page); - atomic_dec(&pidmap_array->nr_free); + set_bit(0, init_pspace.pidmap[0].page); + atomic_dec(&init_pspace.pidmap[0].nr_free); pid_cachep = kmem_cache_create("pid", sizeof(struct pid), __alignof__(struct pid), -- cgit v1.2.3 From f40f50d3bb33b52dfd550ca80be7daaddad21883 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 2 Oct 2006 02:17:25 -0700 Subject: [PATCH] Use struct pspace in next_pidmap and find_ge_pid This updates my proc: readdir race fix (take 3) patch to account for the changes made by: Sukadev Bhattiprolu to introduce struct pspace. Signed-off-by: Eric W. Biederman Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 89107b7481af..e4779bbb2058 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -149,19 +149,20 @@ static int alloc_pidmap(struct pspace *pspace) return -1; } -static int next_pidmap(int last) +static int next_pidmap(struct pspace *pspace, int last) { int offset; - struct pidmap *map; + struct pidmap *map, *end; offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pidmap_array[(last + 1)/BITS_PER_PAGE]; - for (; map < &pidmap_array[PIDMAP_ENTRIES]; map++, offset = 0) { + map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; + end = &pspace->pidmap[PIDMAP_ENTRIES]; + for (; map < end; map++, offset = 0) { if (unlikely(!map->page)) continue; offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); if (offset < BITS_PER_PAGE) - return mk_pid(map, offset); + return mk_pid(pspace, map, offset); } return -1; } @@ -338,7 +339,7 @@ struct pid *find_ge_pid(int nr) pid = find_pid(nr); if (pid) break; - nr = next_pidmap(nr); + nr = next_pidmap(&init_pspace, nr); } while (nr > 0); return pid; -- cgit v1.2.3 From 2425c08b37244005ff221efe4957d8aaff18609c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 2 Oct 2006 02:17:28 -0700 Subject: [PATCH] usb: fixup usb so it uses struct pid The problem with remembering a user space process by its pid is that it is possible that the process will exit, pid wrap around will occur. Converting to a struct pid avoid that problem, and paves the way for implementing a pid namespace. Also since usb is the only user of kill_proc_info_as_uid rename kill_proc_info_as_uid to kill_pid_info_as_uid and have the new version take a struct pid. Signed-off-by: Eric W. Biederman Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5230ddcb1757..7ed8d5304bec 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1136,8 +1136,8 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } -/* like kill_proc_info(), but doesn't use uid/euid of "current" */ -int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, +/* like kill_pid_info(), but doesn't use uid/euid of "current" */ +int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, uid_t uid, uid_t euid, u32 secid) { int ret = -EINVAL; @@ -1147,7 +1147,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, return ret; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; goto out_unlock; @@ -1171,7 +1171,7 @@ out_unlock: read_unlock(&tasklist_lock); return ret; } -EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); +EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); /* * kill_something_info() interprets pid in interesting ways just like kill(2). -- cgit v1.2.3 From 3a872d89baae821a0f6e2c1055d4b47650661137 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Mon, 2 Oct 2006 02:17:30 -0700 Subject: [PATCH] Kprobes: Make kprobe modules more portable In an effort to make kprobe modules more portable, here is a patch that: o Introduces the "symbol_name" field to struct kprobe. The symbol->address resolution now happens in the kernel in an architecture agnostic manner. 64-bit powerpc users no longer have to specify the ".symbols" o Introduces the "offset" field to struct kprobe to allow a user to specify an offset into a symbol. o The legacy mechanism of specifying the kprobe.addr is still supported. However, if both the kprobe.addr and kprobe.symbol_name are specified, probe registration fails with an -EINVAL. o The symbol resolution code uses kallsyms_lookup_name(). So CONFIG_KPROBES now depends on CONFIG_KALLSYMS o Apparantly kprobe modules were the only legitimate out-of-tree user of the kallsyms_lookup_name() EXPORT. Now that the symbol resolution happens in-kernel, remove the EXPORT as suggested by Christoph Hellwig o Modify tcp_probe.c that uses the kprobe interface so as to make it work on multiple platforms (in its earlier form, the code wouldn't work, say, on powerpc) Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 1 - kernel/kprobes.c | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ab16a5a4cfe9..342bca62c496 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -154,7 +154,6 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } -EXPORT_SYMBOL_GPL(kallsyms_lookup_name); /* * Lookup an address diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3f57dfdc8f92..f66b8e681b4d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,16 @@ #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + +/* + * Some oddball architectures like 64bit powerpc have function descriptors + * so this must be overridable. + */ +#ifndef kprobe_lookup_name +#define kprobe_lookup_name(name, addr) \ + addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) +#endif + static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static atomic_t kprobe_count; @@ -447,6 +458,21 @@ static int __kprobes __register_kprobe(struct kprobe *p, struct kprobe *old_p; struct module *probed_mod; + /* + * If we have a symbol_name argument look it up, + * and add it to the address. That way the addr + * field can either be global or relative to a symbol. + */ + if (p->symbol_name) { + if (p->addr) + return -EINVAL; + kprobe_lookup_name(p->symbol_name, p->addr); + } + + if (!p->addr) + return -EINVAL; + p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); + if ((!kernel_text_address((unsigned long) p->addr)) || in_kprobes_functions((unsigned long) p->addr)) return -EINVAL; -- cgit v1.2.3 From 62c27be0dd8144e11bd3ed054a0fb890579925f8 Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Mon, 2 Oct 2006 02:17:33 -0700 Subject: [PATCH] kprobe whitespace cleanup Whitespace is used to indent, this patch cleans up these sentences by kernel coding style. Signed-off-by: bibo, mao Signed-off-by: Ananth N Mavinakayanahalli Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f66b8e681b4d..41dfda50e22a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -347,17 +347,17 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) */ void __kprobes kprobe_flush_task(struct task_struct *tk) { - struct kretprobe_instance *ri; - struct hlist_head *head; + struct kretprobe_instance *ri; + struct hlist_head *head; struct hlist_node *node, *tmp; unsigned long flags = 0; spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(tk); - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task == tk) - recycle_rp_inst(ri); - } + head = kretprobe_inst_table_head(tk); + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task == tk) + recycle_rp_inst(ri); + } spin_unlock_irqrestore(&kretprobe_lock, flags); } @@ -514,7 +514,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, (ARCH_INACTIVE_KPROBE_COUNT + 1)) register_page_fault_notifier(&kprobe_page_fault_nb); - arch_arm_kprobe(p); + arch_arm_kprobe(p); out: mutex_unlock(&kprobe_mutex); -- cgit v1.2.3 From f2aa85a0ccd90110e76c6375535adc3ae358f971 Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Mon, 2 Oct 2006 02:17:34 -0700 Subject: [PATCH] disallow kprobes on notifier_call_chain When kprobe is re-entered, the re-entered kprobe kernel path will will call atomic_notifier_call_chain function, if this function is kprobed that will incur numerous kprobe recursive fault. This patch disallows kprobes on atomic_notifier_call_chain function. Signed-off-by: bibo, mao Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 2460581c928c..398d57923f95 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -221,7 +221,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * of the last notifier function called. */ -int atomic_notifier_call_chain(struct atomic_notifier_head *nh, +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { int ret; -- cgit v1.2.3 From 99219a3fbc2dcf2eaa954f7b2ac27299fd7894cd Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Mon, 2 Oct 2006 02:17:35 -0700 Subject: [PATCH] kretprobe spinlock deadlock patch kprobe_flush_task() possibly calls kfree function during holding kretprobe_lock spinlock, if kfree function is probed by kretprobe that will incur spinlock deadlock. This patch moves kfree function out scope of kretprobe_lock. Signed-off-by: bibo, mao Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 41dfda50e22a..610c837ad9e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -319,7 +319,8 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri) } /* Called with kretprobe_lock held */ -void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) +void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, + struct hlist_head *head) { /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); @@ -331,7 +332,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) hlist_add_head(&ri->uflist, &ri->rp->free_instances); } else /* Unregistering */ - kfree(ri); + hlist_add_head(&ri->hlist, head); } struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) @@ -348,17 +349,23 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) void __kprobes kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; - struct hlist_head *head; + struct hlist_head *head, empty_rp; struct hlist_node *node, *tmp; unsigned long flags = 0; + INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(tk); hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { if (ri->task == tk) - recycle_rp_inst(ri); + recycle_rp_inst(ri, &empty_rp); } spin_unlock_irqrestore(&kretprobe_lock, flags); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } } static inline void free_rp_inst(struct kretprobe *rp) -- cgit v1.2.3 From e16b38f71322efd8a221f64b6ddc0748d21d2e1a Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Mon, 2 Oct 2006 02:17:40 -0700 Subject: [PATCH] cpumask: export cpu_online_map and cpu_possible_map consistently cpumask: ensure that the cpu_online_map and cpu_possible_map bitmasks, and hence all the macros in that require them, are available to modules for all supported combinations of architecture and CONFIG_SMP. Signed-off-by: Greg Banks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2bbd948f0169..e4e54e86f4a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4384,7 +4384,10 @@ EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_online_map); + cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); #endif long sched_getaffinity(pid_t pid, cpumask_t *mask) -- cgit v1.2.3 From f5dd3d6fadf98a53b35d20427ca198fda42f1251 Mon Sep 17 00:00:00 2001 From: Sam Vilain Date: Mon, 2 Oct 2006 02:18:04 -0700 Subject: [PATCH] proc: sysctl: add _proc_do_string helper The logic in proc_do_string is worth re-using without passing in a ctl_table structure (say, we want to calculate a pointer and pass that in instead); pass in the two fields it uses from that structure as explicit arguments. Signed-off-by: Sam Vilain Cc: Serge E. Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 65 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba42694f0453..8b5f4bb8ad2f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1624,32 +1624,14 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf, return do_rw_proc(1, file, (char __user *) buf, count, ppos); } -/** - * proc_dostring - read a string sysctl - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes a string from/to the user buffer. If the kernel - * buffer provided is not large enough to hold the string, the - * string is truncated. The copied string is %NULL-terminated. - * If the string is being read by the user process, it is copied - * and a newline '\n' is added. It is truncated if the buffer is - * not large enough. - * - * Returns 0 on success. - */ -int proc_dostring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +int _proc_do_string(void* data, int maxlen, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; char __user *p; char c; - if (!table->data || !table->maxlen || !*lenp || + if (!data || !maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; @@ -1665,20 +1647,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, break; len++; } - if (len >= table->maxlen) - len = table->maxlen-1; - if(copy_from_user(table->data, buffer, len)) + if (len >= maxlen) + len = maxlen-1; + if(copy_from_user(data, buffer, len)) return -EFAULT; - ((char *) table->data)[len] = 0; + ((char *) data)[len] = 0; *ppos += *lenp; } else { - len = strlen(table->data); - if (len > table->maxlen) - len = table->maxlen; + len = strlen(data); + if (len > maxlen) + len = maxlen; if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, table->data, len)) + if(copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { if(put_user('\n', ((char __user *) buffer) + len)) @@ -1691,6 +1673,31 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return 0; } +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return _proc_do_string(table->data, table->maxlen, write, filp, + buffer, lenp, ppos); +} + /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? -- cgit v1.2.3 From b1ba4ddde0cf67991d89f039365eaaeda61aa027 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 2 Oct 2006 02:18:05 -0700 Subject: [PATCH] make kernel/sysctl.c:_proc_do_string() static This patch makes the needlessly global _proc_do_string() static. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8b5f4bb8ad2f..8f3f00d5ee52 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1624,8 +1624,9 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf, return do_rw_proc(1, file, (char __user *) buf, count, ppos); } -int _proc_do_string(void* data, int maxlen, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int _proc_do_string(void* data, int maxlen, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) { size_t len; char __user *p; -- cgit v1.2.3 From ab516013ad9ca47f1d3a936fa81303bfbf734d52 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 2 Oct 2006 02:18:06 -0700 Subject: [PATCH] namespaces: add nsproxy This patch adds a nsproxy structure to the task struct. Later patches will move the fs namespace pointer into this structure, and introduce a new utsname namespace into the nsproxy. The vserver and openvz functionality, then, would be implemented in large part by virtualizing/isolating more and more resources into namespaces, each contained in the nsproxy. [akpm@osdl.org: build fix] Signed-off-by: Serge Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/exit.c | 7 ++++++ kernel/fork.c | 18 ++++++++++++- kernel/nsproxy.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 kernel/nsproxy.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index aacaafb28b9d..6ec53009b866 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o + hrtimer.o rwsem.o latency.o nsproxy.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/exit.c b/kernel/exit.c index 3b47f26985f2..1d0e9ea1fa05 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -397,9 +398,14 @@ void daemonize(const char *name, ...) fs = init_task.fs; current->fs = fs; atomic_inc(&fs->count); + exit_namespace(current); + exit_task_namespaces(current); current->namespace = init_task.namespace; + current->nsproxy = init_task.nsproxy; get_namespace(current->namespace); + get_task_namespaces(current); + exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -918,6 +924,7 @@ fastcall NORET_TYPE void do_exit(long code) __exit_files(tsk); __exit_fs(tsk); exit_namespace(tsk); + exit_task_namespaces(tsk); exit_thread(); cpuset_exit(tsk); exit_keys(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 89f666491d1f..c9e660ae47aa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1116,8 +1117,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_signal; if ((retval = copy_keys(clone_flags, p))) goto bad_fork_cleanup_mm; - if ((retval = copy_namespace(clone_flags, p))) + if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; + if ((retval = copy_namespace(clone_flags, p))) + goto bad_fork_cleanup_namespaces; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_namespace; @@ -1262,6 +1265,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cleanup_namespace: exit_namespace(p); +bad_fork_cleanup_namespaces: + exit_task_namespaces(p); bad_fork_cleanup_keys: exit_keys(p); bad_fork_cleanup_mm: @@ -1606,6 +1611,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; + struct nsproxy *new_nsproxy, *old_nsproxy; check_unshare_flags(&unshare_flags); @@ -1632,7 +1638,15 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { + old_nsproxy = current->nsproxy; + new_nsproxy = dup_namespaces(old_nsproxy); + if (!new_nsproxy) { + err = -ENOMEM; + goto bad_unshare_cleanup_semundo; + } + task_lock(current); + current->nsproxy = new_nsproxy; if (new_fs) { fs = current->fs; @@ -1668,8 +1682,10 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } task_unlock(current); + put_nsproxy(old_nsproxy); } +bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c new file mode 100644 index 000000000000..ad9508865473 --- /dev/null +++ b/kernel/nsproxy.c @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2006 IBM Corporation + * + * Author: Serge Hallyn + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include +#include +#include + +static inline void get_nsproxy(struct nsproxy *ns) +{ + atomic_inc(&ns->count); +} + +void get_task_namespaces(struct task_struct *tsk) +{ + struct nsproxy *ns = tsk->nsproxy; + if (ns) { + get_nsproxy(ns); + } +} + +/* + * creates a copy of "orig" with refcount 1. + * This does not grab references to the contained namespaces, + * so that needs to be done by dup_namespaces. + */ +static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns; + + ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL); + if (ns) { + memcpy(ns, orig, sizeof(struct nsproxy)); + atomic_set(&ns->count, 1); + } + return ns; +} + +/* + * copies the nsproxy, setting refcount to 1, and grabbing a + * reference to all contained namespaces. Called from + * sys_unshare() + */ +struct nsproxy *dup_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns = clone_namespaces(orig); + + return ns; +} + +/* + * called from clone. This now handles copy for nsproxy and all + * namespaces therein. + */ +int copy_namespaces(int flags, struct task_struct *tsk) +{ + struct nsproxy *old_ns = tsk->nsproxy; + + if (!old_ns) + return 0; + + get_nsproxy(old_ns); + + return 0; +} + +void free_nsproxy(struct nsproxy *ns) +{ + kfree(ns); +} -- cgit v1.2.3 From 0437eb594e6e5e699248f865482e61034be846d0 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 2 Oct 2006 02:18:07 -0700 Subject: [PATCH] nsproxy: move init_nsproxy into kernel/nsproxy.c Move the init_nsproxy definition out of arch/ into kernel/nsproxy.c. This avoids all arches having to be updated. Compiles and boots on s390. Signed-off-by: Serge E. Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ad9508865473..a3612f82f187 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -12,6 +12,9 @@ #include #include #include +#include + +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); static inline void get_nsproxy(struct nsproxy *ns) { -- cgit v1.2.3 From 1651e14e28a2d9f446018ef522882e0709a2ce4f Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 2 Oct 2006 02:18:08 -0700 Subject: [PATCH] namespaces: incorporate fs namespace into nsproxy This moves the mount namespace into the nsproxy. The mount namespace count now refers to the number of nsproxies point to it, rather than the number of tasks. As a result, the unshare_namespace() function in kernel/fork.c no longer checks whether it is being shared. Signed-off-by: Serge Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ---- kernel/fork.c | 17 ++++++----------- kernel/nsproxy.c | 32 +++++++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1d0e9ea1fa05..741bbe42dfe8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -399,11 +399,8 @@ void daemonize(const char *name, ...) current->fs = fs; atomic_inc(&fs->count); - exit_namespace(current); exit_task_namespaces(current); - current->namespace = init_task.namespace; current->nsproxy = init_task.nsproxy; - get_namespace(current->namespace); get_task_namespaces(current); exit_files(current); @@ -923,7 +920,6 @@ fastcall NORET_TYPE void do_exit(long code) exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); - exit_namespace(tsk); exit_task_namespaces(tsk); exit_thread(); cpuset_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index c9e660ae47aa..33fcf0733ca6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1119,11 +1119,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_mm; if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; - if ((retval = copy_namespace(clone_flags, p))) - goto bad_fork_cleanup_namespaces; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* @@ -1215,7 +1213,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; } if (clone_flags & CLONE_THREAD) { @@ -1263,8 +1261,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); return p; -bad_fork_cleanup_namespace: - exit_namespace(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_keys: @@ -1519,10 +1515,9 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) */ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->namespace; + struct namespace *ns = current->nsproxy->namespace; - if ((unshare_flags & CLONE_NEWNS) && - (ns && atomic_read(&ns->count) > 1)) { + if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1655,8 +1650,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } if (new_ns) { - ns = current->namespace; - current->namespace = new_ns; + ns = current->nsproxy->namespace; + current->nsproxy->namespace = new_ns; new_ns = ns; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a3612f82f187..e10385c17f73 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -13,6 +13,7 @@ #include #include #include +#include struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); @@ -55,6 +56,11 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) { struct nsproxy *ns = clone_namespaces(orig); + if (ns) { + if (ns->namespace) + get_namespace(ns->namespace); + } + return ns; } @@ -65,16 +71,40 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) int copy_namespaces(int flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; + struct nsproxy *new_ns; + int err = 0; if (!old_ns) return 0; get_nsproxy(old_ns); - return 0; + if (!(flags & CLONE_NEWNS)) + return 0; + + new_ns = clone_namespaces(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->nsproxy = new_ns; + + err = copy_namespace(flags, tsk); + if (err) { + tsk->nsproxy = old_ns; + put_nsproxy(new_ns); + goto out; + } + +out: + put_nsproxy(old_ns); + return err; } void free_nsproxy(struct nsproxy *ns) { + if (ns->namespace) + put_namespace(ns->namespace); kfree(ns); } -- cgit v1.2.3 From fab413a334a7b3dd2688c5cd5d4718476e430ea4 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Mon, 2 Oct 2006 02:18:09 -0700 Subject: [PATCH] namespaces: exit_task_namespaces() invalidates nsproxy exit_task_namespaces() has replaced the former exit_namespace(). It invalidates task->nsproxy and associated namespaces. This is an issue for the (futur) pid namespace which is required to be valid in exit_notify(). This patch moves exit_task_namespaces() after exit_notify() to keep nsproxy valid. Signed-off-by: Cedric Le Goater Cc: Serge E. Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 741bbe42dfe8..f250a5e3e281 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -920,7 +920,6 @@ fastcall NORET_TYPE void do_exit(long code) exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); - exit_task_namespaces(tsk); exit_thread(); cpuset_exit(tsk); exit_keys(tsk); @@ -935,6 +934,7 @@ fastcall NORET_TYPE void do_exit(long code) tsk->exit_code = code; proc_exit_connector(tsk); exit_notify(tsk); + exit_task_namespaces(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; -- cgit v1.2.3 From e9ff3990f08e9a0c2839cc22808b01732ea5b3e4 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 2 Oct 2006 02:18:11 -0700 Subject: [PATCH] namespaces: utsname: switch to using uts namespaces Replace references to system_utsname to the per-process uts namespace where appropriate. This includes things like uname. Changes: Per Eric Biederman's comments, use the per-process uts namespace for ELF_PLATFORM, sunrpc, and parts of net/ipv4/ipconfig.c [jdike@addtoit.com: UML fix] [clg@fr.ibm.com: cleanup] [akpm@osdl.org: build fix] Signed-off-by: Serge E. Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Signed-off-by: Cedric Le Goater Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 398d57923f95..3a4776e8f16e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1655,7 +1655,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name) int errno = 0; down_read(&uts_sem); - if (copy_to_user(name,&system_utsname,sizeof *name)) + if (copy_to_user(name, utsname(), sizeof *name)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1673,8 +1673,8 @@ asmlinkage long sys_sethostname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.nodename, tmp, len); - system_utsname.nodename[len] = 0; + memcpy(utsname()->nodename, tmp, len); + utsname()->nodename[len] = 0; errno = 0; } up_write(&uts_sem); @@ -1690,11 +1690,11 @@ asmlinkage long sys_gethostname(char __user *name, int len) if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(system_utsname.nodename); + i = 1 + strlen(utsname()->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, system_utsname.nodename, i)) + if (copy_to_user(name, utsname()->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1719,8 +1719,8 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.domainname, tmp, len); - system_utsname.domainname[len] = 0; + memcpy(utsname()->domainname, tmp, len); + utsname()->domainname[len] = 0; errno = 0; } up_write(&uts_sem); -- cgit v1.2.3 From 96b644bdec977b97a45133e5b4466ba47a7a5e65 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 2 Oct 2006 02:18:13 -0700 Subject: [PATCH] namespaces: utsname: use init_utsname when appropriate In some places, particularly drivers and __init code, the init utsns is the appropriate one to use. This patch replaces those with a the init_utsname helper. Changes: Removed several uses of init_utsname(). Hope I picked all the right ones in net/ipv4/ipconfig.c. These are now changed to utsname() (the per-process namespace utsname) in the previous patch (2/7) [akpm@osdl.org: CIFS fix] Signed-off-by: Serge E. Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 6 +++--- kernel/power/snapshot.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e596525669ed..4c0553461000 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -518,9 +518,9 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) static void print_kernel_version(void) { - printk("%s %.*s\n", system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + printk("%s %.*s\n", init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); } /* diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1b84313cbab5..99f9b7d177d6 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -906,7 +906,7 @@ static void init_header(struct swsusp_info *info) memset(info, 0, sizeof(struct swsusp_info)); info->version_code = LINUX_VERSION_CODE; info->num_physpages = num_physpages; - memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); info->cpus = num_online_cpus(); info->image_pages = nr_copy_pages; info->pages = nr_copy_pages + nr_meta_pages + 1; @@ -1050,13 +1050,13 @@ static inline int check_header(struct swsusp_info *info) reason = "kernel version"; if (info->num_physpages != num_physpages) reason = "memory size"; - if (strcmp(info->uts.sysname,system_utsname.sysname)) + if (strcmp(info->uts.sysname,init_utsname()->sysname)) reason = "system type"; - if (strcmp(info->uts.release,system_utsname.release)) + if (strcmp(info->uts.release,init_utsname()->release)) reason = "kernel release"; - if (strcmp(info->uts.version,system_utsname.version)) + if (strcmp(info->uts.version,init_utsname()->version)) reason = "version"; - if (strcmp(info->uts.machine,system_utsname.machine)) + if (strcmp(info->uts.machine,init_utsname()->machine)) reason = "machine"; if (reason) { printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); -- cgit v1.2.3 From 4865ecf1315b450ab3317a745a6678c04d311e40 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 2 Oct 2006 02:18:14 -0700 Subject: [PATCH] namespaces: utsname: implement utsname namespaces This patch defines the uts namespace and some manipulators. Adds the uts namespace to task_struct, and initializes a system-wide init namespace. It leaves a #define for system_utsname so sysctl will compile. This define will be removed in a separate patch. [akpm@osdl.org: build fix, cleanup] Signed-off-by: Serge Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/nsproxy.c | 14 ++++++++++++++ kernel/utsname.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 kernel/utsname.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 6ec53009b866..d948ca12acf0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index e10385c17f73..47c19280c55b 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -14,6 +14,7 @@ #include #include #include +#include struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); @@ -59,6 +60,8 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) if (ns) { if (ns->namespace) get_namespace(ns->namespace); + if (ns->uts_ns) + get_uts_ns(ns->uts_ns); } return ns; @@ -97,6 +100,15 @@ int copy_namespaces(int flags, struct task_struct *tsk) goto out; } + err = copy_utsname(flags, tsk); + if (err) { + if (new_ns->namespace) + put_namespace(new_ns->namespace); + tsk->nsproxy = old_ns; + put_nsproxy(new_ns); + goto out; + } + out: put_nsproxy(old_ns); return err; @@ -106,5 +118,7 @@ void free_nsproxy(struct nsproxy *ns) { if (ns->namespace) put_namespace(ns->namespace); + if (ns->uts_ns) + put_uts_ns(ns->uts_ns); kfree(ns); } diff --git a/kernel/utsname.c b/kernel/utsname.c new file mode 100644 index 000000000000..1824384ecfa3 --- /dev/null +++ b/kernel/utsname.c @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2004 IBM Corporation + * + * Author: Serge Hallyn + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include +#include +#include +#include + +/* + * Copy task tsk's utsname namespace, or clone it if flags + * specifies CLONE_NEWUTS. In latter case, changes to the + * utsname of this process won't be seen by parent, and vice + * versa. + */ +int copy_utsname(int flags, struct task_struct *tsk) +{ + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_uts_ns(old_ns); + + return err; +} + +void free_uts_ns(struct kref *kref) +{ + struct uts_namespace *ns; + + ns = container_of(kref, struct uts_namespace, kref); + kfree(ns); +} -- cgit v1.2.3 From 8218c74c02a7bdb5db2e40a2100534bdeb83475b Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 2 Oct 2006 02:18:15 -0700 Subject: [PATCH] namespaces: utsname: sysctl Sysctl uts patch. This will need to be done another way, but since sysctl itself needs to be container aware, 'the right thing' is a separate patchset. [akpm@osdl.org: ia64 build fix] [sam.vilain@catalyst.net.nz: cleanup] [sam.vilain@catalyst.net.nz: add proc_do_utsns_string] Signed-off-by: Serge E. Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 119 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8f3f00d5ee52..476ac2522f18 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -139,7 +139,7 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); #endif -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); static ctl_table root_table[]; @@ -229,51 +229,100 @@ static ctl_table root_table[] = { }; static ctl_table kern_table[] = { +#ifndef CONFIG_UTS_NS { .ctl_name = KERN_OSTYPE, .procname = "ostype", - .data = system_utsname.sysname, - .maxlen = sizeof(system_utsname.sysname), + .data = init_uts_ns.name.sysname, + .maxlen = sizeof(init_uts_ns.name.sysname), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_OSRELEASE, .procname = "osrelease", - .data = system_utsname.release, - .maxlen = sizeof(system_utsname.release), + .data = init_uts_ns.name.release, + .maxlen = sizeof(init_uts_ns.name.release), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_VERSION, .procname = "version", - .data = system_utsname.version, - .maxlen = sizeof(system_utsname.version), + .data = init_uts_ns.name.version, + .maxlen = sizeof(init_uts_ns.name.version), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_NODENAME, .procname = "hostname", - .data = system_utsname.nodename, - .maxlen = sizeof(system_utsname.nodename), + .data = init_uts_ns.name.nodename, + .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_DOMAINNAME, .procname = "domainname", - .data = system_utsname.domainname, - .maxlen = sizeof(system_utsname.domainname), + .data = init_uts_ns.name.domainname, + .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, +#else /* !CONFIG_UTS_NS */ + { + .ctl_name = KERN_OSTYPE, + .procname = "ostype", + .data = NULL, + /* could maybe use __NEW_UTS_LEN here? */ + .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_OSRELEASE, + .procname = "osrelease", + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, release), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_VERSION, + .procname = "version", + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, version), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_NODENAME, + .procname = "hostname", + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_DOMAINNAME, + .procname = "domainname", + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, +#endif /* !CONFIG_UTS_NS */ { .ctl_name = KERN_PANIC, .procname = "panic", @@ -1704,7 +1753,8 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, * to observe. Should this be in kernel/sys.c ???? */ -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +#ifndef CONFIG_UTS_NS +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int r; @@ -1720,6 +1770,48 @@ static int proc_doutsstring(ctl_table *table, int write, struct file *filp, } return r; } +#else /* !CONFIG_UTS_NS */ +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int r; + struct uts_namespace* uts_ns = current->nsproxy->uts_ns; + char* which; + + switch (table->ctl_name) { + case KERN_OSTYPE: + which = uts_ns->name.sysname; + break; + case KERN_NODENAME: + which = uts_ns->name.nodename; + break; + case KERN_OSRELEASE: + which = uts_ns->name.release; + break; + case KERN_VERSION: + which = uts_ns->name.version; + break; + case KERN_DOMAINNAME: + which = uts_ns->name.domainname; + break; + default: + r = -EINVAL; + goto out; + } + + if (!write) { + down_read(&uts_sem); + r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); + up_read(&uts_sem); + } else { + down_write(&uts_sem); + r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); + up_write(&uts_sem); + } + out: + return r; +} +#endif /* !CONFIG_UTS_NS */ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, int *valp, @@ -2283,12 +2375,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return -ENOSYS; } -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } +#ifdef CONFIG_SYSVIPC +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif + int proc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { -- cgit v1.2.3 From 071df104f808b8195c40643dcb4d060681742e29 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 2 Oct 2006 02:18:17 -0700 Subject: [PATCH] namespaces: utsname: implement CLONE_NEWUTS flag Implement a CLONE_NEWUTS flag, and use it at clone and sys_unshare. [clg@fr.ibm.com: IPC unshare fix] [bunk@stusta.de: cleanup] Signed-off-by: Serge Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Signed-off-by: Adrian Bunk Signed-off-by: Cedric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 20 +++++++++++++++++--- kernel/nsproxy.c | 2 +- kernel/utsname.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 33fcf0733ca6..c08cf05dd59b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1607,13 +1607,14 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; struct nsproxy *new_nsproxy, *old_nsproxy; + struct uts_namespace *uts, *new_uts = NULL; check_unshare_flags(&unshare_flags); /* Return -EINVAL for all unsupported flags */ err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|CLONE_NEWUTS)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) @@ -1630,14 +1631,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; + if ((err = unshare_utsname(unshare_flags, &new_uts))) + goto bad_unshare_cleanup_semundo; - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { + if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || + new_uts) { old_nsproxy = current->nsproxy; new_nsproxy = dup_namespaces(old_nsproxy); if (!new_nsproxy) { err = -ENOMEM; - goto bad_unshare_cleanup_semundo; + goto bad_unshare_cleanup_uts; } task_lock(current); @@ -1676,10 +1680,20 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fd = fd; } + if (new_uts) { + uts = current->nsproxy->uts_ns; + current->nsproxy->uts_ns = new_uts; + new_uts = uts; + } + task_unlock(current); put_nsproxy(old_nsproxy); } +bad_unshare_cleanup_uts: + if (new_uts) + put_uts_ns(new_uts); + bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 47c19280c55b..8246813335cc 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -82,7 +82,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) get_nsproxy(old_ns); - if (!(flags & CLONE_NEWNS)) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS))) return 0; new_ns = clone_namespaces(old_ns); diff --git a/kernel/utsname.c b/kernel/utsname.c index 1824384ecfa3..c859164a6993 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,6 +14,41 @@ #include #include +/* + * Clone a new ns copying an original utsname, setting refcount to 1 + * @old_ns: namespace to clone + * Return NULL on error (failure to kmalloc), new ns otherwise + */ +static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +{ + struct uts_namespace *ns; + + ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + if (ns) { + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + kref_init(&ns->kref); + } + return ns; +} + +/* + * unshare the current process' utsname namespace. + * called only in sys_unshare() + */ +int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) +{ + if (unshare_flags & CLONE_NEWUTS) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + *new_uts = clone_uts_ns(current->nsproxy->uts_ns); + if (!*new_uts) + return -ENOMEM; + } + + return 0; +} + /* * Copy task tsk's utsname namespace, or clone it if flags * specifies CLONE_NEWUTS. In latter case, changes to the @@ -23,6 +58,7 @@ int copy_utsname(int flags, struct task_struct *tsk) { struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; + struct uts_namespace *new_ns; int err = 0; if (!old_ns) @@ -30,6 +66,23 @@ int copy_utsname(int flags, struct task_struct *tsk) get_uts_ns(old_ns); + if (!(flags & CLONE_NEWUTS)) + return 0; + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + new_ns = clone_uts_ns(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + tsk->nsproxy->uts_ns = new_ns; + +out: + put_uts_ns(old_ns); return err; } -- cgit v1.2.3 From c0b2fc316599d6cd875b6b8cafa67f03b9512b4d Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Mon, 2 Oct 2006 02:18:18 -0700 Subject: [PATCH] uts: copy nsproxy only when needed The nsproxy was being copied in unshare() when anything was being unshared, even if it was something not referenced from nsproxy. This should end up in some cases with far more memory usage than necessary. Signed-off-by: Serge Hallyn Cc: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Herbert Poetzl Cc: Andrey Savochkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c08cf05dd59b..208dd99f13bc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1606,7 +1606,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; - struct nsproxy *new_nsproxy, *old_nsproxy; + struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; struct uts_namespace *uts, *new_uts = NULL; check_unshare_flags(&unshare_flags); @@ -1634,18 +1634,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) if ((err = unshare_utsname(unshare_flags, &new_uts))) goto bad_unshare_cleanup_semundo; - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || - new_uts) { - + if (new_ns || new_uts) { old_nsproxy = current->nsproxy; new_nsproxy = dup_namespaces(old_nsproxy); if (!new_nsproxy) { err = -ENOMEM; goto bad_unshare_cleanup_uts; } + } + + if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || + new_uts) { task_lock(current); - current->nsproxy = new_nsproxy; + + if (new_nsproxy) { + current->nsproxy = new_nsproxy; + new_nsproxy = old_nsproxy; + } if (new_fs) { fs = current->fs; @@ -1687,9 +1693,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } task_unlock(current); - put_nsproxy(old_nsproxy); } + if (new_nsproxy) + put_nsproxy(new_nsproxy); + bad_unshare_cleanup_uts: if (new_uts) put_uts_ns(new_uts); -- cgit v1.2.3 From 25b21cb2f6d69b0475b134e0a3e8e269137270fa Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Mon, 2 Oct 2006 02:18:19 -0700 Subject: [PATCH] IPC namespace core This patch set allows to unshare IPCs and have a private set of IPC objects (sem, shm, msg) inside namespace. Basically, it is another building block of containers functionality. This patch implements core IPC namespace changes: - ipc_namespace structure - new config option CONFIG_IPC_NS - adds CLONE_NEWIPC flag - unshare support [clg@fr.ibm.com: small fix for unshare of ipc namespace] [akpm@osdl.org: build fix] Signed-off-by: Pavel Emelianov Signed-off-by: Kirill Korotaev Signed-off-by: Cedric Le Goater Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 22 ++++++++++++++++++---- kernel/nsproxy.c | 41 ++++++++++++++++++++++++++++------------- 2 files changed, 46 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 208dd99f13bc..d6cc56558507 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1608,13 +1608,15 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct sem_undo_list *new_ulist = NULL; struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; struct uts_namespace *uts, *new_uts = NULL; + struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); /* Return -EINVAL for all unsupported flags */ err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|CLONE_NEWUTS)) + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) @@ -1633,18 +1635,20 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_fd; if ((err = unshare_utsname(unshare_flags, &new_uts))) goto bad_unshare_cleanup_semundo; + if ((err = unshare_ipcs(unshare_flags, &new_ipc))) + goto bad_unshare_cleanup_uts; - if (new_ns || new_uts) { + if (new_ns || new_uts || new_ipc) { old_nsproxy = current->nsproxy; new_nsproxy = dup_namespaces(old_nsproxy); if (!new_nsproxy) { err = -ENOMEM; - goto bad_unshare_cleanup_uts; + goto bad_unshare_cleanup_ipc; } } if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || - new_uts) { + new_uts || new_ipc) { task_lock(current); @@ -1692,12 +1696,22 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_uts = uts; } + if (new_ipc) { + ipc = current->nsproxy->ipc_ns; + current->nsproxy->ipc_ns = new_ipc; + new_ipc = ipc; + } + task_unlock(current); } if (new_nsproxy) put_nsproxy(new_nsproxy); +bad_unshare_cleanup_ipc: + if (new_ipc) + put_ipc_ns(new_ipc); + bad_unshare_cleanup_uts: if (new_uts) put_uts_ns(new_uts); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8246813335cc..8d6c852dc51e 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -7,6 +7,10 @@ * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation, version 2 of the * License. + * + * Jun 2006 - namespaces support + * OpenVZ, SWsoft Inc. + * Pavel Emelianov */ #include @@ -62,6 +66,8 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) get_namespace(ns->namespace); if (ns->uts_ns) get_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + get_ipc_ns(ns->ipc_ns); } return ns; @@ -82,7 +88,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) get_nsproxy(old_ns); - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) return 0; new_ns = clone_namespaces(old_ns); @@ -94,24 +100,31 @@ int copy_namespaces(int flags, struct task_struct *tsk) tsk->nsproxy = new_ns; err = copy_namespace(flags, tsk); - if (err) { - tsk->nsproxy = old_ns; - put_nsproxy(new_ns); - goto out; - } + if (err) + goto out_ns; err = copy_utsname(flags, tsk); - if (err) { - if (new_ns->namespace) - put_namespace(new_ns->namespace); - tsk->nsproxy = old_ns; - put_nsproxy(new_ns); - goto out; - } + if (err) + goto out_uts; + + err = copy_ipcs(flags, tsk); + if (err) + goto out_ipc; out: put_nsproxy(old_ns); return err; + +out_ipc: + if (new_ns->uts_ns) + put_uts_ns(new_ns->uts_ns); +out_uts: + if (new_ns->namespace) + put_namespace(new_ns->namespace); +out_ns: + tsk->nsproxy = old_ns; + put_nsproxy(new_ns); + goto out; } void free_nsproxy(struct nsproxy *ns) @@ -120,5 +133,7 @@ void free_nsproxy(struct nsproxy *ns) put_namespace(ns->namespace); if (ns->uts_ns) put_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + put_ipc_ns(ns->ipc_ns); kfree(ns); } -- cgit v1.2.3 From 73ea41302bab5e02c9e86ab15c509494a550f1db Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Mon, 2 Oct 2006 02:18:20 -0700 Subject: [PATCH] IPC namespace - utils This patch adds basic IPC namespace functionality to IPC utils: - init_ipc_ns - copy/clone/unshare/free IPC ns - /proc preparations Signed-off-by: Pavel Emelianov Signed-off-by: Kirill Korotaev Cc: "Eric W. Biederman" Cc: Cedric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d6cc56558507..7dc6140baac6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1589,6 +1589,16 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n return 0; } +#ifndef CONFIG_IPC_NS +static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) +{ + if (flags & CLONE_NEWIPC) + return -EINVAL; + + return 0; +} +#endif + /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* -- cgit v1.2.3 From fcfbd547b1209aae9d880fe5db33464413925cc8 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Mon, 2 Oct 2006 02:18:23 -0700 Subject: [PATCH] IPC namespace - sysctls Sysctl tweaks for IPC namespace Signed-off-by: Pavel Emelianiov Signed-off-by: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 116 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 88 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 476ac2522f18..a79ccf9d113b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -92,13 +92,8 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif #ifdef CONFIG_SYSVIPC -extern size_t shm_ctlmax; -extern size_t shm_ctlall; -extern int shm_ctlmni; -extern int msg_ctlmax; -extern int msg_ctlmnb; -extern int msg_ctlmni; -extern int sem_ctls[]; +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif #ifdef __sparc__ @@ -481,58 +476,58 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_SHMMAX, .procname = "shmmax", - .data = &shm_ctlmax, + .data = NULL, .maxlen = sizeof (size_t), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SHMALL, .procname = "shmall", - .data = &shm_ctlall, + .data = NULL, .maxlen = sizeof (size_t), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SHMMNI, .procname = "shmmni", - .data = &shm_ctlmni, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMAX, .procname = "msgmax", - .data = &msg_ctlmax, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMNI, .procname = "msgmni", - .data = &msg_ctlmni, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMNB, .procname = "msgmnb", - .data = &msg_ctlmnb, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SEM, .procname = "sem", - .data = &sem_ctls, + .data = NULL, .maxlen = 4*sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -1832,8 +1827,9 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos, +static int __do_proc_dointvec(void *tbl_data, ctl_table *table, + int write, struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -1846,13 +1842,13 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (int *) table->data; + i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; @@ -1941,6 +1937,16 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, #undef TMPBUFLEN } +static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + return __do_proc_dointvec(table->data, table, write, filp, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -2074,7 +2080,7 @@ int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, do_proc_dointvec_minmax_conv, ¶m); } -static int do_proc_doulongvec_minmax(ctl_table *table, int write, +static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, @@ -2088,13 +2094,13 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (unsigned long *) table->data; + i = (unsigned long *) data; min = (unsigned long *) table->extra1; max = (unsigned long *) table->extra2; vleft = table->maxlen / sizeof(unsigned long); @@ -2179,6 +2185,17 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, #undef TMPBUFLEN } +static int do_proc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ + return __do_proc_doulongvec_minmax(table->data, table, write, + filp, buffer, lenp, ppos, convmul, convdiv); +} + /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table @@ -2367,6 +2384,49 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, do_proc_dointvec_ms_jiffies_conv, NULL); } +#ifdef CONFIG_SYSVIPC +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + void *data; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; + + switch (table->ctl_name) { + case KERN_SHMMAX: + data = &ns->shm_ctlmax; + goto proc_minmax; + case KERN_SHMALL: + data = &ns->shm_ctlall; + goto proc_minmax; + case KERN_SHMMNI: + data = &ns->shm_ctlmni; + break; + case KERN_MSGMAX: + data = &ns->msg_ctlmax; + break; + case KERN_MSGMNI: + data = &ns->msg_ctlmni; + break; + case KERN_MSGMNB: + data = &ns->msg_ctlmnb; + break; + case KERN_SEM: + data = &ns->sem_ctls; + break; + default: + return -EINVAL; + } + + return __do_proc_dointvec(data, table, write, filp, buffer, + lenp, ppos, NULL, NULL); +proc_minmax: + return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, + lenp, ppos, 1l, 1l); +} +#endif + #else /* CONFIG_PROC_FS */ int proc_dostring(ctl_table *table, int write, struct file *filp, -- cgit v1.2.3 From 5d124e99c2fee1c8f3020ecb0dff8d5617ee7991 Mon Sep 17 00:00:00 2001 From: Pavel Date: Mon, 2 Oct 2006 02:18:24 -0700 Subject: [PATCH] nsproxy cloning error path fix This patch fixes copy_namespaces()'s error path. when new nsproxy (new_ns) is created pointers to namespaces (ipc, uts) are copied from the old nsproxy. Later in copy_utsname, copy_ipcs, etc. according namespaces are get-ed. On error path needed namespaces are put-ed, so there's no need to put new nsproxy itelf as it woud cause putting namespaces for the second time. Found when incorporating namespaces into OpenVZ kernel. Signed-off-by: Pavel Emelianov Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8d6c852dc51e..6ebdb82a0ce4 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -123,7 +123,7 @@ out_uts: put_namespace(new_ns->namespace); out_ns: tsk->nsproxy = old_ns; - put_nsproxy(new_ns); + kfree(new_ns); goto out; } -- cgit v1.2.3 From 6760856791c6e527da678021ee6a67896549d4da Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 2 Oct 2006 02:18:26 -0700 Subject: [PATCH] introduce kernel_execve The use of execve() in the kernel is dubious, since it relies on the __KERNEL_SYSCALLS__ mechanism that stores the result in a global errno variable. As a first step of getting rid of this, change all users to a global kernel_execve function that returns a proper error code. This function is a terrible hack, and a later patch removes it again after the kernel syscalls are gone. Signed-off-by: Arnd Bergmann Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Mikael Starvik Cc: David Howells Cc: Yoshinori Sato Cc: Hirokazu Takata Cc: Ralf Baechle Cc: Kyle McMartin Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Richard Curnow Cc: William Lee Irwin III Cc: "David S. Miller" Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Miles Bader Cc: Chris Zankel Cc: "Luck, Tony" Cc: Geert Uytterhoeven Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index f8121b95183f..bb4e29d924e4 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -18,8 +18,6 @@ call_usermodehelper wait flag, and remove exec_usermodehelper. Rusty Russell Jan 2003 */ -#define __KERNEL_SYSCALLS__ - #include #include #include @@ -169,7 +167,8 @@ static int ____call_usermodehelper(void *data) retval = -EPERM; if (current->fs->root) - retval = execve(sub_info->path, sub_info->argv, sub_info->envp); + retval = kernel_execve(sub_info->path, + sub_info->argv, sub_info->envp); /* Exec failed? */ sub_info->retval = retval; -- cgit v1.2.3 From 1a657f78dcc8ea7c53eaa1f2a45ea2315738c15f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 2 Oct 2006 02:18:59 -0700 Subject: [PATCH] introduce get_task_pid() to fix unsafe get_pid() proc_pid_make_inode: ei->pid = get_pid(task_pid(task)); I think this is not safe. get_pid() can be preempted after checking "pid != NULL". Then the task exits, does detach_pid(), and RCU frees the pid. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index e4779bbb2058..b914392085f9 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -304,6 +304,15 @@ struct task_struct *find_task_by_pid_type(int type, int nr) EXPORT_SYMBOL(find_task_by_pid_type); +struct pid *get_task_pid(struct task_struct *task, enum pid_type type) +{ + struct pid *pid; + rcu_read_lock(); + pid = get_pid(task->pids[type].pid); + rcu_read_unlock(); + return pid; +} + struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; -- cgit v1.2.3 From 9ec52099e4b8678a60e9f93e41ad87885d64f3e6 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Mon, 2 Oct 2006 02:19:00 -0700 Subject: [PATCH] replace cad_pid by a struct pid There are a few places in the kernel where the init task is signaled. The ctrl+alt+del sequence is one them. It kills a task, usually init, using a cached pid (cad_pid). This patch replaces the pid_t by a struct pid to avoid pid wrap around problem. The struct pid is initialized at boot time in init() and can be modified through systctl with /proc/sys/kernel/cad_pid [ I haven't found any distro using it ? ] It also introduces a small helper routine kill_cad_pid() which is used where it seemed ok to use cad_pid instead of pid 1. [akpm@osdl.org: cleanups, build fix] Signed-off-by: Cedric Le Goater Cc: Eric W. Biederman Cc: Martin Schwidefsky Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 +++--- kernel/sysctl.c | 30 +++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 3a4776e8f16e..2314867ae34f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -92,7 +92,8 @@ EXPORT_SYMBOL(fs_overflowgid); */ int C_A_D = 1; -int cad_pid = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); /* * Notifier list for kernel code which wants to be called @@ -773,10 +774,9 @@ void ctrl_alt_del(void) if (C_A_D) schedule_work(&cad_work); else - kill_proc(cad_pid, SIGINT, 1); + kill_cad_pid(SIGINT, 1); } - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a79ccf9d113b..8020fb273c4f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,7 +68,6 @@ extern int sysrq_enabled; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; -extern int cad_pid; extern int pid_max; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; @@ -137,6 +136,9 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + static ctl_table root_table[]; static struct ctl_table_header root_table_header = { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; @@ -543,10 +545,10 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_CADPID, .procname = "cad_pid", - .data = &cad_pid, + .data = NULL, .maxlen = sizeof (int), .mode = 0600, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_cad_pid, }, { .ctl_name = KERN_MAX_THREADS, @@ -2427,6 +2429,28 @@ proc_minmax: } #endif +static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp; + int r; + + tmp = pid_nr(cad_pid); + + r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + lenp, ppos, NULL, NULL); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + #else /* CONFIG_PROC_FS */ int proc_dostring(ctl_table *table, int write, struct file *filp, -- cgit v1.2.3 From 3f2e05e90e0846c42626e3d272454f26be34a1bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 2 Oct 2006 14:12:31 +0100 Subject: [PATCH] BLOCK: Revert patch to hack around undeclared sigset_t in linux/compat.h Revert Andrew Morton's patch to temporarily hack around the lack of a declaration of sigset_t in linux/compat.h to make the block-disablement patches build on IA64. This got accidentally pushed to Linus and should be fixed in a different manner. Also make linux/compat.h #include asm/signal.h to gain a definition of sigset_t so that it can externally declare sigset_from_compat(). This has been compile-tested for i386, x86_64, ia64, mips, mips64, frv, ppc and ppc64 and run-tested on frv. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/compat.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index b4fbd838cd77..75573e5d27b0 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,8 +26,6 @@ #include -extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); - int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || -- cgit v1.2.3 From ffc5089196446c08d9a005cf0dd7cab18d119606 Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Tue, 3 Oct 2006 01:13:48 -0700 Subject: [PATCH] Create kallsyms_lookup_size_offset() Some uses of kallsyms_lookup() do not need to find out the name of a symbol and its module's name it belongs. This is specially true in arch specific code, which needs to unwind the stack to show the back trace during oops (mips is an example). In this specific case, we just need to retreive the function's size and the offset of the active intruction inside it. Adds a new entry "kallsyms_lookup_size_offset()" This new entry does exactly the same as kallsyms_lookup() but does not require any buffers to store any names. It returns 0 if it fails otherwise 1. Signed-off-by: Franck Bui-Huu Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 123 +++++++++++++++++++++++++++++++++++------------------- kernel/module.c | 3 +- 2 files changed, 82 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 342bca62c496..eeac3e313b2b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -69,6 +69,15 @@ static inline int is_kernel(unsigned long addr) return in_gate_area_no_task(addr); } +static int is_ksym_addr(unsigned long addr) +{ + if (all_var) + return is_kernel(addr); + + return is_kernel_text(addr) || is_kernel_inittext(addr) || + is_kernel_extratext(addr); +} + /* expand a compressed symbol data into the resulting uncompressed string, given the offset to where the symbol is in the compressed stream */ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) @@ -155,6 +164,73 @@ unsigned long kallsyms_lookup_name(const char *name) return module_kallsyms_lookup_name(name); } +static unsigned long get_symbol_pos(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset) +{ + unsigned long symbol_start = 0, symbol_end = 0; + unsigned long i, low, high, mid; + + /* This kernel should never had been booted. */ + BUG_ON(!kallsyms_addresses); + + /* do a binary search on the sorted kallsyms_addresses array */ + low = 0; + high = kallsyms_num_syms; + + while (high - low > 1) { + mid = (low + high) / 2; + if (kallsyms_addresses[mid] <= addr) + low = mid; + else + high = mid; + } + + /* + * search for the first aliased symbol. Aliased + * symbols are symbols with the same address + */ + while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) + --low; + + symbol_start = kallsyms_addresses[low]; + + /* Search for next non-aliased symbol */ + for (i = low + 1; i < kallsyms_num_syms; i++) { + if (kallsyms_addresses[i] > symbol_start) { + symbol_end = kallsyms_addresses[i]; + break; + } + } + + /* if we found no next symbol, we use the end of the section */ + if (!symbol_end) { + if (is_kernel_inittext(addr)) + symbol_end = (unsigned long)_einittext; + else if (all_var) + symbol_end = (unsigned long)_end; + else + symbol_end = (unsigned long)_etext; + } + + *symbolsize = symbol_end - symbol_start; + *offset = addr - symbol_start; + + return low; +} + +/* + * Lookup an address but don't bother to find any names. + */ +int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, + unsigned long *offset) +{ + if (is_ksym_addr(addr)) + return !!get_symbol_pos(addr, symbolsize, offset); + + return !!module_address_lookup(addr, symbolsize, offset, NULL); +} + /* * Lookup an address * - modname is set to NULL if it's in the kernel @@ -167,57 +243,18 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { - unsigned long i, low, high, mid; const char *msym; - /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); - namebuf[KSYM_NAME_LEN] = 0; namebuf[0] = 0; - if ((all_var && is_kernel(addr)) || - (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) || - is_kernel_extratext(addr)))) { - unsigned long symbol_end = 0; - - /* do a binary search on the sorted kallsyms_addresses array */ - low = 0; - high = kallsyms_num_syms; - - while (high-low > 1) { - mid = (low + high) / 2; - if (kallsyms_addresses[mid] <= addr) low = mid; - else high = mid; - } - - /* search for the first aliased symbol. Aliased symbols are - symbols with the same address */ - while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low]) - --low; + if (is_ksym_addr(addr)) { + unsigned long pos; + pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(low), namebuf); - - /* Search for next non-aliased symbol */ - for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > kallsyms_addresses[low]) { - symbol_end = kallsyms_addresses[i]; - break; - } - } - - /* if we found no next symbol, we use the end of the section */ - if (!symbol_end) { - if (is_kernel_inittext(addr)) - symbol_end = (unsigned long)_einittext; - else - symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext; - } - - *symbolsize = symbol_end - kallsyms_addresses[low]; + kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); *modname = NULL; - *offset = addr - kallsyms_addresses[low]; return namebuf; } diff --git a/kernel/module.c b/kernel/module.c index 7c77a0a9275c..7f60e782de1e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2040,7 +2040,8 @@ const char *module_address_lookup(unsigned long addr, list_for_each_entry(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { - *modname = mod->name; + if (modname) + *modname = mod->name; return get_ksymbol(mod, addr, size, offset); } } -- cgit v1.2.3 From eed34d0fc5e4b89269053ed855ef714edbcf4518 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 3 Oct 2006 01:13:50 -0700 Subject: [PATCH] kernel-doc for kernel/dma.c Add kernel-doc function headers in kernel/dma.c and use it in DocBook. Clean up kernel-doc in mca_dma.h (the colon (':') represents a section header). Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma.c b/kernel/dma.c index aef0a45b7893..2020644c938a 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -62,6 +62,11 @@ static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = { }; +/** + * request_dma - request and reserve a system DMA channel + * @dmanr: DMA channel number + * @device_id: reserving device ID string, used in /proc/dma + */ int request_dma(unsigned int dmanr, const char * device_id) { if (dmanr >= MAX_DMA_CHANNELS) @@ -76,7 +81,10 @@ int request_dma(unsigned int dmanr, const char * device_id) return 0; } /* request_dma */ - +/** + * free_dma - free a reserved system DMA channel + * @dmanr: DMA channel number + */ void free_dma(unsigned int dmanr) { if (dmanr >= MAX_DMA_CHANNELS) { -- cgit v1.2.3 From e1ca66d1b990b23e7753c729332c0ada61f4f38d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 3 Oct 2006 01:13:51 -0700 Subject: [PATCH] kernel-doc for kernel/resource.c Add kernel-doc function headers in kernel/resource.c and use them in DocBook. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 9db38a1a7520..6de60c12143e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -193,6 +193,13 @@ static int __release_resource(struct resource *old) return -EINVAL; } +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ int request_resource(struct resource *root, struct resource *new) { struct resource *conflict; @@ -205,6 +212,15 @@ int request_resource(struct resource *root, struct resource *new) EXPORT_SYMBOL(request_resource); +/** + * ____request_resource - reserve a resource, with resource conflict returned + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns: + * On success, NULL is returned. + * On error, a pointer to the conflicting resource is returned. + */ struct resource *____request_resource(struct resource *root, struct resource *new) { struct resource *conflict; @@ -217,6 +233,10 @@ struct resource *____request_resource(struct resource *root, struct resource *ne EXPORT_SYMBOL(____request_resource); +/** + * release_resource - release a previously reserved resource + * @old: resource pointer + */ int release_resource(struct resource *old) { int retval; @@ -315,8 +335,16 @@ static int find_resource(struct resource *root, struct resource *new, return -EBUSY; } -/* - * Allocate empty slot in the resource tree given range and alignment. +/** + * allocate_resource - allocate empty slot in the resource tree given range & alignment + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * @size: requested resource region size + * @min: minimum size to allocate + * @max: maximum size to allocate + * @align: alignment requested, in bytes + * @alignf: alignment function, optional, called if not NULL + * @alignf_data: arbitrary data to pass to the @alignf function */ int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, @@ -407,10 +435,15 @@ int insert_resource(struct resource *parent, struct resource *new) return result; } -/* +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * * Given an existing resource, change its start and size to match the - * arguments. Returns -EBUSY if it can't fit. Existing children of - * the resource are assumed to be immutable. + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. */ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { @@ -456,11 +489,19 @@ EXPORT_SYMBOL(adjust_resource); * Note how this, unlike the above, knows about * the IO flag meanings (busy etc). * - * Request-region creates a new busy region. + * request_region creates a new busy region. * - * Check-region returns non-zero if the area is already busy + * check_region returns non-zero if the area is already busy. * - * Release-region releases a matching busy region. + * release_region releases a matching busy region. + */ + +/** + * __request_region - create a new busy resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * @name: reserving caller's ID string */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, @@ -497,9 +538,23 @@ struct resource * __request_region(struct resource *parent, } return res; } - EXPORT_SYMBOL(__request_region); +/** + * __check_region - check if a resource region is busy or free + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * + * Returns 0 if the region is free at the moment it is checked, + * returns %-EBUSY if the region is busy. + * + * NOTE: + * This function is deprecated because its use is racy. + * Even if it returns 0, a subsequent call to request_region() + * may fail because another driver etc. just allocated the region. + * Do NOT use it. It will be removed from the kernel. + */ int __check_region(struct resource *parent, resource_size_t start, resource_size_t n) { @@ -513,9 +568,16 @@ int __check_region(struct resource *parent, resource_size_t start, kfree(res); return 0; } - EXPORT_SYMBOL(__check_region); +/** + * __release_region - release a previously reserved resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * + * The described resource region must match a currently busy region. + */ void __release_region(struct resource *parent, resource_size_t start, resource_size_t n) { @@ -553,7 +615,6 @@ void __release_region(struct resource *parent, resource_size_t start, "<%016llx-%016llx>\n", (unsigned long long)start, (unsigned long long)end); } - EXPORT_SYMBOL(__release_region); /* -- cgit v1.2.3 From 5c1e176781f43bc902a51e5832f789756bff911b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 3 Oct 2006 01:14:04 -0700 Subject: [PATCH] sched: force /sbin/init off isolated cpus Force /sbin/init off isolated cpus (unless every CPU is specified as an isolcpu). Users seem to think that the isolated CPUs shouldn't have much running on them to begin with. That's fair enough: intuitive, I guess. It also means that the cpu affinity masks of tasks will not include isolcpus by default, which is also more intuitive, perhaps. /sbin/init is spawned from the boot CPU's idle thread, and /sbin/init starts the rest of userspace. So if the boot CPU is specified to be an isolcpu, then prior to this patch, all of userspace will be run there. (throw in a couple of plausible devinit -> cpuinit conversions I spotted while we're here). Signed-off-by: Nick Piggin Cc: Dimitri Sivanich Acked-by: Paul Jackson Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e4e54e86f4a2..ddf418810c39 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4817,7 +4817,7 @@ void show_state(void) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __devinit init_idle(struct task_struct *idle, int cpu) +void __cpuinit init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -5461,7 +5461,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; +static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -6747,11 +6747,20 @@ static int update_sched_domains(struct notifier_block *nfb, void __init sched_init_smp(void) { + cpumask_t non_isolated_cpus; + lock_cpu_hotplug(); arch_init_sched_domains(&cpu_online_map); + cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map); + if (cpus_empty(non_isolated_cpus)) + cpu_set(smp_processor_id(), non_isolated_cpus); unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed(current, non_isolated_cpus) < 0) + BUG(); } #else void __init sched_init_smp(void) -- cgit v1.2.3 From a616058b7815aafb2163fc795e02a055b0dbc5e2 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Tue, 3 Oct 2006 01:14:06 -0700 Subject: [PATCH] sched: remove unnecessary sched group allocations Remove dynamic sched group allocations for MC and SMP domains. These allocations can easily fail on big systems(1024 or so CPUs) and we can live with out these dynamic allocations. [akpm@osdl.org: build fix] Signed-off-by: Suresh Siddha Acked-by: Ingo Molnar Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 104 +++++++++++++++++++++------------------------------------ 1 file changed, 38 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ddf418810c39..6b956bd9b49a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5489,15 +5489,17 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, - int (*group_fn)(int cpu)) +static void +init_sched_build_groups(struct sched_group groups[], cpumask_t span, + const cpumask_t *cpu_map, + int (*group_fn)(int cpu, const cpumask_t *cpu_map)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; int i; for_each_cpu_mask(i, span) { - int group = group_fn(i); + int group = group_fn(i, cpu_map); struct sched_group *sg = &groups[group]; int j; @@ -5508,7 +5510,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, sg->cpu_power = 0; for_each_cpu_mask(j, span) { - if (group_fn(j) != group) + if (group_fn(j, cpu_map) != group) continue; cpu_set(j, covered); @@ -6084,7 +6086,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -static int cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) { return cpu; } @@ -6095,31 +6097,36 @@ static int cpu_to_cpu_group(int cpu) */ #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); -static struct sched_group *sched_group_core_bycpu[NR_CPUS]; +static struct sched_group sched_group_core[NR_CPUS]; #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) -static int cpu_to_core_group(int cpu) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) { - return first_cpu(cpu_sibling_map[cpu]); + cpumask_t mask = cpu_sibling_map[cpu]; + cpus_and(mask, mask, *cpu_map); + return first_cpu(mask); } #elif defined(CONFIG_SCHED_MC) -static int cpu_to_core_group(int cpu) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) { return cpu; } #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; -static int cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) { #ifdef CONFIG_SCHED_MC cpumask_t mask = cpu_coregroup_map(cpu); + cpus_and(mask, mask, *cpu_map); return first_cpu(mask); #elif defined(CONFIG_SCHED_SMT) - return first_cpu(cpu_sibling_map[cpu]); + cpumask_t mask = cpu_sibling_map[cpu]; + cpus_and(mask, mask, *cpu_map); + return first_cpu(mask); #else return cpu; #endif @@ -6137,7 +6144,7 @@ static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; -static int cpu_to_allnodes_group(int cpu) +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) { return cpu_to_node(cpu); } @@ -6169,12 +6176,11 @@ next_sg: } #endif +#ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ static void free_sched_groups(const cpumask_t *cpu_map) { - int cpu; -#ifdef CONFIG_NUMA - int i; + int cpu, i; for_each_cpu_mask(cpu, *cpu_map) { struct sched_group *sched_group_allnodes @@ -6211,20 +6217,12 @@ next_sg: kfree(sched_group_nodes); sched_group_nodes_bycpu[cpu] = NULL; } -#endif - for_each_cpu_mask(cpu, *cpu_map) { - if (sched_group_phys_bycpu[cpu]) { - kfree(sched_group_phys_bycpu[cpu]); - sched_group_phys_bycpu[cpu] = NULL; - } -#ifdef CONFIG_SCHED_MC - if (sched_group_core_bycpu[cpu]) { - kfree(sched_group_core_bycpu[cpu]); - sched_group_core_bycpu[cpu] = NULL; - } -#endif - } } +#else +static void free_sched_groups(const cpumask_t *cpu_map) +{ +} +#endif /* * Build sched domains for a given set of cpus and attach the sched domains @@ -6233,10 +6231,6 @@ next_sg: static int build_sched_domains(const cpumask_t *cpu_map) { int i; - struct sched_group *sched_group_phys = NULL; -#ifdef CONFIG_SCHED_MC - struct sched_group *sched_group_core = NULL; -#endif #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; struct sched_group *sched_group_allnodes = NULL; @@ -6282,7 +6276,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; - group = cpu_to_allnodes_group(i); + group = cpu_to_allnodes_group(i, cpu_map); sd->groups = &sched_group_allnodes[group]; p = sd; } else @@ -6295,42 +6289,18 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(sd->span, sd->span, *cpu_map); #endif - if (!sched_group_phys) { - sched_group_phys - = kmalloc(sizeof(struct sched_group) * NR_CPUS, - GFP_KERNEL); - if (!sched_group_phys) { - printk (KERN_WARNING "Can not alloc phys sched" - "group\n"); - goto error; - } - sched_group_phys_bycpu[i] = sched_group_phys; - } - p = sd; sd = &per_cpu(phys_domains, i); - group = cpu_to_phys_group(i); + group = cpu_to_phys_group(i, cpu_map); *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; sd->groups = &sched_group_phys[group]; #ifdef CONFIG_SCHED_MC - if (!sched_group_core) { - sched_group_core - = kmalloc(sizeof(struct sched_group) * NR_CPUS, - GFP_KERNEL); - if (!sched_group_core) { - printk (KERN_WARNING "Can not alloc core sched" - "group\n"); - goto error; - } - sched_group_core_bycpu[i] = sched_group_core; - } - p = sd; sd = &per_cpu(core_domains, i); - group = cpu_to_core_group(i); + group = cpu_to_core_group(i, cpu_map); *sd = SD_MC_INIT; sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); @@ -6341,7 +6311,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - group = cpu_to_cpu_group(i); + group = cpu_to_cpu_group(i, cpu_map); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); @@ -6359,7 +6329,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) continue; init_sched_build_groups(sched_group_cpus, this_sibling_map, - &cpu_to_cpu_group); + cpu_map, &cpu_to_cpu_group); } #endif @@ -6371,7 +6341,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (i != first_cpu(this_core_map)) continue; init_sched_build_groups(sched_group_core, this_core_map, - &cpu_to_core_group); + cpu_map, &cpu_to_core_group); } #endif @@ -6385,14 +6355,14 @@ static int build_sched_domains(const cpumask_t *cpu_map) continue; init_sched_build_groups(sched_group_phys, nodemask, - &cpu_to_phys_group); + cpu_map, &cpu_to_phys_group); } #ifdef CONFIG_NUMA /* Set up node groups */ if (sched_group_allnodes) init_sched_build_groups(sched_group_allnodes, *cpu_map, - &cpu_to_allnodes_group); + cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -6537,7 +6507,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) init_numa_sched_groups_power(sched_group_nodes[i]); if (sched_group_allnodes) { - int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); + int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); struct sched_group *sg = &sched_group_allnodes[group]; init_numa_sched_groups_power(sg); @@ -6563,9 +6533,11 @@ static int build_sched_domains(const cpumask_t *cpu_map) return 0; +#ifdef CONFIG_NUMA error: free_sched_groups(cpu_map); return -ENOMEM; +#endif } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. -- cgit v1.2.3 From 74732646431a1bb7e23e6b564127a8881cfef900 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 3 Oct 2006 01:14:07 -0700 Subject: [PATCH] sched: don't print migration cost when only 1 CPU If only a single CPU is present, printing this doesn't make much sense. Signed-off-by: Dave Jones Acked-by: Ingo Molnar Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6b956bd9b49a..6d7bf55ec33d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5977,13 +5977,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) #endif ); if (system_state == SYSTEM_BOOTING) { - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); + if (num_online_cpus() > 1) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); + } + printk("\n"); } - printk("\n"); } j1 = jiffies; if (migration_debug) -- cgit v1.2.3 From 1a84887080dc15f048db7c3a643e98f1435790d6 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Tue, 3 Oct 2006 01:14:08 -0700 Subject: [PATCH] sched: introduce child field in sched_domain Introduce the child field in sched_domain struct and use it in sched_balance_self(). We will also use this field in cleaning up the sched group cpu_power setup(done in a different patch) code. Signed-off-by: Suresh Siddha Acked-by: Ingo Molnar Acked-by: Nick Piggin Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6d7bf55ec33d..0feeacb91497 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1286,21 +1286,29 @@ static int sched_balance_self(int cpu, int flag) while (sd) { cpumask_t span; struct sched_group *group; - int new_cpu; - int weight; + int new_cpu, weight; + + if (!(sd->flags & flag)) { + sd = sd->child; + continue; + } span = sd->span; group = find_idlest_group(sd, t, cpu); - if (!group) - goto nextlevel; + if (!group) { + sd = sd->child; + continue; + } new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) - goto nextlevel; + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } - /* Now try balancing at a lower domain level */ + /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; -nextlevel: sd = NULL; weight = cpus_weight(span); for_each_domain(cpu, tmp) { @@ -5448,12 +5456,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) struct sched_domain *parent = tmp->parent; if (!parent) break; - if (sd_parent_degenerate(tmp, parent)) + if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + } } - if (sd && sd_degenerate(sd)) + if (sd && sd_degenerate(sd)) { sd = sd->parent; + if (sd) + sd->child = NULL; + } sched_domain_debug(sd, cpu); @@ -6288,6 +6302,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; + if (p) + p->child = sd; cpus_and(sd->span, sd->span, *cpu_map); #endif @@ -6297,6 +6313,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; + if (p) + p->child = sd; sd->groups = &sched_group_phys[group]; #ifdef CONFIG_SCHED_MC @@ -6307,6 +6325,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; + p->child = sd; sd->groups = &sched_group_core[group]; #endif @@ -6318,6 +6337,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; + p->child = sd; sd->groups = &sched_group_cpus[group]; #endif } -- cgit v1.2.3 From 89c4710ee9bbbefe6a4d469d9f36266a92c275c5 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Tue, 3 Oct 2006 01:14:09 -0700 Subject: [PATCH] sched: cleanup sched_group cpu_power setup Up to now sched group's cpu_power for each sched domain is initialized independently. This made the setup code ugly as the new sched domains are getting added. Make the sched group cpu_power setup code generic, by using domain child field and new domain flag in sched_domain. For most of the sched domains(except NUMA), sched group's cpu_power is now computed generically using the domain properties of itself and of the child domain. sched groups in NUMA domains are setup little differently and hence they don't use this generic mechanism. Signed-off-by: Suresh Siddha Acked-by: Ingo Molnar Acked-by: Nick Piggin Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 145 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 82 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0feeacb91497..0a5e814cc618 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2541,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; cpumask_t cpus = CPU_MASK_ALL; + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. + */ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); @@ -2638,7 +2644,7 @@ redo: } if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; return nr_moved; @@ -2654,7 +2660,7 @@ out_one_pinned: sd->balance_interval *= 2; if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; return 0; } @@ -2676,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int sd_idle = 0; cpumask_t cpus = CPU_MASK_ALL; - if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. + */ + if (sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); @@ -2717,7 +2730,8 @@ redo: if (!nr_moved) { schedstat_inc(sd, lb_failed[NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; } else sd->nr_balance_failed = 0; @@ -2727,7 +2741,7 @@ redo: out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; sd->nr_balance_failed = 0; @@ -5400,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd) if (sd->flags & (SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | - SD_BALANCE_EXEC)) { + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES)) { if (sd->groups != sd->groups->next) return 0; } @@ -5434,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) pflags &= ~(SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | - SD_BALANCE_EXEC); + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES); } if (~cflags & pflags) return 0; @@ -6240,6 +6258,58 @@ static void free_sched_groups(const cpumask_t *cpu_map) } #endif +/* + * Initialize sched groups cpu_power. + * + * cpu_power indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_power for all the groups in a sched domain will be same unless + * there are asymmetries in the topology. If there are asymmetries, group + * having more cpu_power will pickup more load compared to the group having + * less cpu_power. + * + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents + * the maximum number of tasks a group can handle in the presence of other idle + * or lightly loaded groups in the same sched domain. + */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ + struct sched_domain *child; + struct sched_group *group; + + WARN_ON(!sd || !sd->groups); + + if (cpu != first_cpu(sd->groups->cpumask)) + return; + + child = sd->child; + + /* + * For perf policy, if the groups in child domain share resources + * (for example cores sharing some portions of the cache hierarchy + * or SMT), then set this domain groups cpu_power such that each group + * can handle only one task, when there are other idle groups in the + * same sched domain. + */ + if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && + (child->flags & + (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { + sd->groups->cpu_power = SCHED_LOAD_SCALE; + return; + } + + sd->groups->cpu_power = 0; + + /* + * add cpu_power of each child group to this groups cpu_power + */ + group = child->groups; + do { + sd->groups->cpu_power += group->cpu_power; + group = group->next; + } while (group != child->groups); +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -6247,6 +6317,7 @@ static void free_sched_groups(const cpumask_t *cpu_map) static int build_sched_domains(const cpumask_t *cpu_map) { int i; + struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; struct sched_group *sched_group_allnodes = NULL; @@ -6456,72 +6527,20 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; sd = &per_cpu(cpu_domains, i); - sd->groups->cpu_power = SCHED_LOAD_SCALE; + init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu_mask(i, *cpu_map) { - int power; - struct sched_domain *sd; sd = &per_cpu(core_domains, i); - if (sched_smt_power_savings) - power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); - else - power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) - * SCHED_LOAD_SCALE / 10; - sd->groups->cpu_power = power; + init_sched_groups_power(i, sd); } #endif for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; -#ifdef CONFIG_SCHED_MC - sd = &per_cpu(phys_domains, i); - if (i != first_cpu(sd->groups->cpumask)) - continue; - - sd->groups->cpu_power = 0; - if (sched_mc_power_savings || sched_smt_power_savings) { - int j; - - for_each_cpu_mask(j, sd->groups->cpumask) { - struct sched_domain *sd1; - sd1 = &per_cpu(core_domains, j); - /* - * for each core we will add once - * to the group in physical domain - */ - if (j != first_cpu(sd1->groups->cpumask)) - continue; - - if (sched_smt_power_savings) - sd->groups->cpu_power += sd1->groups->cpu_power; - else - sd->groups->cpu_power += SCHED_LOAD_SCALE; - } - } else - /* - * This has to be < 2 * SCHED_LOAD_SCALE - * Lets keep it SCHED_LOAD_SCALE, so that - * while calculating NUMA group's cpu_power - * we can simply do - * numa_group->cpu_power += phys_group->cpu_power; - * - * See "only add power once for each physical pkg" - * comment below - */ - sd->groups->cpu_power = SCHED_LOAD_SCALE; -#else - int power; sd = &per_cpu(phys_domains, i); - if (sched_smt_power_savings) - power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); - else - power = SCHED_LOAD_SCALE; - sd->groups->cpu_power = power; -#endif + init_sched_groups_power(i, sd); } #ifdef CONFIG_NUMA -- cgit v1.2.3 From 0feaece97795c4c775a3c732c045706eda28d0e5 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Tue, 3 Oct 2006 01:14:10 -0700 Subject: [PATCH] sched: fixing wrong comment for find_idlest_cpu() Fixing wrong comment for find_idlest_cpu(). Signed-off-by: Satoru Takeuchi Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0a5e814cc618..fec97e4e196d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1232,7 +1232,7 @@ nextgroup: } /* - * find_idlest_queue - find the idlest runqueue among the cpus in group. + * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) -- cgit v1.2.3 From ce164428c4cabfd284ca81913415cacd889aac33 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 3 Oct 2006 01:14:11 -0700 Subject: [PATCH] scheduler: NUMA aware placement of sched_group_allnodes When the per cpu sched domains are build then they also need to be placed on the node where the cpu resides otherwise we will have frequent off node accesses which will slow down the system. Signed-off-by: Christoph Lameter Acked-by: Ingo Molnar Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index fec97e4e196d..53608a59d6e3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6349,9 +6349,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { if (!sched_group_allnodes) { sched_group_allnodes - = kmalloc(sizeof(struct sched_group) - * MAX_NUMNODES, - GFP_KERNEL); + = kmalloc_node(sizeof(struct sched_group) + * MAX_NUMNODES, + GFP_KERNEL, + cpu_to_node(i)); if (!sched_group_allnodes) { printk(KERN_WARNING "Can not alloc allnodes sched group\n"); -- cgit v1.2.3 From f30c2269544bffc7bf1b0d7c0abe5be1be83b8cb Mon Sep 17 00:00:00 2001 From: Uwe Zeisberger Date: Tue, 3 Oct 2006 23:01:26 +0200 Subject: fix file specification in comments Many files include the filename at the beginning, serveral used a wrong one. Signed-off-by: Uwe Zeisberger Signed-off-by: Adrian Bunk --- kernel/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e5ebcc1ec3a0..9cbb5d1be06f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1,5 +1,5 @@ /* - * linux/kernel/posix_timers.c + * linux/kernel/posix-timers.c * * * 2002-10-15 Posix Clocks & timers -- cgit v1.2.3 From 9f5d785e93f7c68c37582ddea848de23689fdd76 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Tue, 3 Oct 2006 23:07:31 +0200 Subject: remove duplicate "until" from kernel/workqueue.c s/until until/until/ Signed-off-by: Rolf Eike Beer Signed-off-by: Adrian Bunk --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 835fe28b87a8..cfc737bffe6d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -34,7 +34,7 @@ * possible cpu). * * The sequence counters are for flush_scheduled_work(). It wants to wait - * until until all currently-scheduled works are completed, but it doesn't + * until all currently-scheduled works are completed, but it doesn't * want to be livelocked by new, incoming ones. So it waits until * remove_sequence is >= the insert_sequence which pertained when * flush_scheduled_work() was called. -- cgit v1.2.3 From 4802211cfd68e44c8401a8fe3657e9c2522ec517 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 3 Oct 2006 23:26:16 +0200 Subject: rcutorture: Fix incorrect description of default for nreaders parameter The comment for the nreaders parameter of rcutorture gives the default as 4*ncpus, but the value actually defaults to 2*ncpus; fix the comment. Signed-off-by: Josh Triplett Acked-by: Paul E. McKenney Signed-off-by: Adrian Bunk --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 4f2c4272d59c..23446e91cded 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -47,7 +47,7 @@ MODULE_LICENSE("GPL"); -static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ +static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ static int verbose; /* Print more debug info. */ -- cgit v1.2.3 From 038b0a6d8d32db934bba6a24e74e76e4e327a94f Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 4 Oct 2006 03:38:54 -0400 Subject: Remove all inclusions of kbuild explicitly includes this at build time. Signed-off-by: Dave Jones --- kernel/rtmutex-debug.c | 1 - kernel/rtmutex-tester.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 0c1faa950af7..da8d6bf46457 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -16,7 +16,6 @@ * * See rt.c in preempt-rt for proper credits and further information */ -#include #include #include #include diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 948bd8f643e2..6dcea9dd8c94 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -6,7 +6,6 @@ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner * */ -#include #include #include #include -- cgit v1.2.3 From 4b8a311bb161a3bd2ab44311f42c526b6dc76270 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 28 Sep 2006 17:46:21 -0400 Subject: [PATCH] arch filter lists with < or > should not be accepted Currently the kernel audit system represents arch's as numbers and will gladly accept comparisons between archs using >, <, >=, <= when the only thing that makes sense is = or !=. I'm told that the next revision of auditctl will do this checking but this will provide enforcement in the kernel even for old userspace. A simple command to show the issue would be to run auditctl -d entry,always -F arch>i686 -S chmod with this patch the kernel will reject this with -EINVAL Please comment/ack/nak as soon as possible. -Eric kernel/auditfilter.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) Signed-off-by: Al Viro --- kernel/auditfilter.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 1a58a81fb09d..4f40d923af8e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -411,7 +411,6 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_FSGID: case AUDIT_LOGINUID: case AUDIT_PERS: - case AUDIT_ARCH: case AUDIT_MSGTYPE: case AUDIT_PPID: case AUDIT_DEVMAJOR: @@ -423,6 +422,14 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_ARG2: case AUDIT_ARG3: break; + /* arch is only allowed to be = or != */ + case AUDIT_ARCH: + if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) + && (f->op != AUDIT_NEGATE) && (f->op)) { + err = -EINVAL; + goto exit_free; + } + break; case AUDIT_PERM: if (f->val & ~15) goto exit_free; -- cgit v1.2.3 From 419c58f11fb732cc8bd1335fa43e0decb34e0be3 Mon Sep 17 00:00:00 2001 From: Alexander Viro Date: Fri, 29 Sep 2006 00:08:50 -0400 Subject: [PATCH] PPID filtering fix On Thu, Sep 28, 2006 at 04:03:06PM -0400, Eric Paris wrote: > After some looking I did not see a way to get into audit_log_exit > without having set the ppid. So I am dropping the set from there and > only doing it at the beginning. > > Please comment/ack/nak as soon as possible. Ehh... That's one hell of an overhead to be had ;-/ Let's be lazy. Signed-off-by: Al Viro --- kernel/auditsc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 105147631753..b61c0191f3da 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -278,8 +278,11 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(tsk->pid, f->op, f->val); break; case AUDIT_PPID: - if (ctx) + if (ctx) { + if (!ctx->ppid) + ctx->ppid = sys_getppid(); result = audit_comparator(ctx->ppid, f->op, f->val); + } break; case AUDIT_UID: result = audit_comparator(tsk->uid, f->op, f->val); @@ -795,7 +798,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts /* tsk == current */ context->pid = tsk->pid; - context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ + if (!context->ppid) + context->ppid = sys_getppid(); context->uid = tsk->uid; context->gid = tsk->gid; context->euid = tsk->euid; @@ -1137,6 +1141,7 @@ void audit_syscall_entry(int arch, int major, context->ctime = CURRENT_TIME; context->in_syscall = 1; context->auditable = !!(state == AUDIT_RECORD_CONTEXT); + context->ppid = 0; } /** -- cgit v1.2.3 From ac9910ce017ff5f86f3a25e969b2c4f5d6ac438f Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Thu, 28 Sep 2006 14:31:32 -0400 Subject: [PATCH] name_count array overrun Hi, This patch removes the rdev logging from the previous patch The below patch closes an unbounded use of name_count. This can lead to oopses in some new file systems. Signed-off-by: Steve Grubb Signed-off-by: Al Viro --- kernel/auditsc.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b61c0191f3da..42f2f1179711 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1357,7 +1357,13 @@ void __audit_inode_child(const char *dname, const struct inode *inode, } update_context: - idx = context->name_count++; + idx = context->name_count; + if (context->name_count == AUDIT_NAMES) { + printk(KERN_DEBUG "name_count maxed and losing %s\n", + found_name ?: "(null)"); + return; + } + context->name_count++; #if AUDIT_DEBUG context->ino_count++; #endif @@ -1375,7 +1381,16 @@ update_context: /* A parent was not found in audit_names, so copy the inode data for the * provided parent. */ if (!found_name) { - idx = context->name_count++; + idx = context->name_count; + if (context->name_count == AUDIT_NAMES) { + printk(KERN_DEBUG + "name_count maxed and losing parent inode data: dev=%02x:%02x, inode=%lu", + MAJOR(parent->i_sb->s_dev), + MINOR(parent->i_sb->s_dev), + parent->i_ino); + return; + } + context->name_count++; #if AUDIT_DEBUG context->ino_count++; #endif -- cgit v1.2.3 From a24ceab4f44f21749aa0b6bd38bee37c775e036f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Oct 2006 02:16:27 -0700 Subject: [PATCH] genirq: irq: convert the move_irq flag from a 32bit word to a single bit The primary aim of this patchset is to remove maintenances problems caused by the irq infrastructure. The two big issues I address are an artificially small cap on the number of irqs, and that MSI assumes vector == irq. My primary focus is on x86_64 but I have touched other architectures where necessary to keep them from breaking. - To increase the number of irqs I modify the code to look at the (cpu, vector) pair instead of just looking at the vector. With a large number of irqs available systems with a large irq count no longer need to compress their irq numbers to fit. Removing a lot of brittle special cases. For acpi guys the result is that irq == gsi. - Addressing the fact that MSI assumes irq == vector takes a few more patches. But suffice it to say when I am done none of the generic irq code even knows what a vector is. In quick testing on a large Unisys x86_64 machine we stumbled over at least one driver that assumed that NR_IRQS could always fit into an 8 bit number. This driver is clearly buggy today. But this has become a class of bugs that it is now much easier to hit. This patch: This is a minor space optimization. In practice I don't think this has any affect because of our alignment constraints and the other fields but there is not point in chewing up an uncessary word and since we already read the flag field this should improve the cache hit ratio of the irq handler. Signed-off-by: Eric W. Biederman Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: Rajesh Shah Cc: Andi Kleen Cc: "Protasevich, Natalie" Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/migration.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index a57ebe9fa6f6..9b234df810d0 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,7 +7,7 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) unsigned long flags; spin_lock_irqsave(&desc->lock, flags); - desc->move_irq = 1; + desc->status |= IRQ_MOVE_PENDING; irq_desc[irq].pending_mask = mask; spin_unlock_irqrestore(&desc->lock, flags); } @@ -17,7 +17,7 @@ void move_native_irq(int irq) struct irq_desc *desc = irq_desc + irq; cpumask_t tmp; - if (likely(!desc->move_irq)) + if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; /* @@ -28,7 +28,7 @@ void move_native_irq(int irq) return; } - desc->move_irq = 0; + desc->status &= ~IRQ_MOVE_PENDING; if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) return; -- cgit v1.2.3 From e7b946e98a456077dd6897f726f3d6197bd7e3b9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Oct 2006 02:16:29 -0700 Subject: [PATCH] genirq: irq: add moved_masked_irq Currently move_native_irq disables and renables the irq we are migrating to ensure we don't take that irq when we are actually doing the migration operation. Disabling the irq needs to happen but sometimes doing the work is move_native_irq is too late. On x86 with ioapics the irq move sequences needs to be: edge_triggered: mask irq. move irq. unmask irq. ack irq. level_triggered: mask irq. ack irq. move irq. unmask irq. We can easily perform the edge triggered sequence, with the current defintion of move_native_irq. However the level triggered case does not map well. For that I have added move_masked_irq, to allow me to disable the irqs around both the ack and the move. Q: Why have we not seen this problem earlier? A: The only symptom I have been able to reproduce is that if we change the vector before acknowleding an irq the wrong irq is acknowledged. Since we currently are not reprogramming the irq vector during migration no problems show up. We have to mask the irq before we acknowledge the irq or else we could hit a window where an irq is asserted just before we acknowledge it. Edge triggered irqs do not have this problem because acknowledgements do not propogate in the same way. Signed-off-by: Eric W. Biederman Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: Rajesh Shah Cc: Andi Kleen Cc: "Protasevich, Natalie" Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/migration.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 9b234df810d0..4baa3bbcd25a 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -12,7 +12,7 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) spin_unlock_irqrestore(&desc->lock, flags); } -void move_native_irq(int irq) +void move_masked_irq(int irq) { struct irq_desc *desc = irq_desc + irq; cpumask_t tmp; @@ -48,15 +48,29 @@ void move_native_irq(int irq) * when an active trigger is comming in. This could * cause some ioapics to mal-function. * Being paranoid i guess! + * + * For correct operation this depends on the caller + * masking the irqs. */ if (likely(!cpus_empty(tmp))) { - if (likely(!(desc->status & IRQ_DISABLED))) - desc->chip->disable(irq); - desc->chip->set_affinity(irq,tmp); - - if (likely(!(desc->status & IRQ_DISABLED))) - desc->chip->enable(irq); } cpus_clear(irq_desc[irq].pending_mask); } + +void move_native_irq(int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + if (likely(!(desc->status & IRQ_MOVE_PENDING))) + return; + + if (likely(!(desc->status & IRQ_DISABLED))) + desc->chip->disable(irq); + + move_masked_irq(irq); + + if (likely(!(desc->status & IRQ_DISABLED))) + desc->chip->enable(irq); +} + -- cgit v1.2.3 From 3a16d713626735f3016da0521b7bf251cd78e836 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Oct 2006 02:16:37 -0700 Subject: [PATCH] genirq: irq: add a dynamic irq creation API With the msi support comes a new concept in irq handling, irqs that are created dynamically at run time. Currently the msi code allocates irqs backwards. First it allocates a platform dependent routing value for an interrupt the ``vector'' and then it figures out from the vector which irq you are on. This msi backwards allocator suffers from two basic problems. The allocator suffers because it is trying to do something that is architecture specific in a generic way making it brittle, inflexible, and tied to tightly to the architecture implementation. The alloctor also suffers from it's very backwards nature as it has tied things together that should have no dependencies. To solve the basic dynamic irq allocation problem two new architecture specific functions are added: create_irq and destroy_irq. create_irq takes no input and returns an unused irq number, that won't be reused until it is returned to the free poll with destroy_irq. The irq then can be used for any purpose although the only initial consumer is the msi code. destroy_irq takes an irq number allocated with create_irq and returns it to the free pool. Making this functionality per architecture increases the simplicity of the irq allocation code and increases it's flexibility. dynamic_irq_init() and dynamic_irq_cleanup() are added to automate the irq_desc initializtion that should happen for dynamic irqs. Signed-off-by: Eric W. Biederman Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: Rajesh Shah Cc: Andi Kleen Cc: "Protasevich, Natalie" Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 736cb0bd498f..0dc24386dd99 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -17,6 +17,62 @@ #include "internals.h" +/** + * dynamic_irq_init - initialize a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_init(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); + WARN_ON(1); + return; + } + + /* Ensure we don't have left over values from a previous use of this irq */ + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + desc->status = IRQ_DISABLED; + desc->chip = &no_irq_chip; + desc->handle_irq = handle_bad_irq; + desc->depth = 1; + desc->handler_data = NULL; + desc->chip_data = NULL; + desc->action = NULL; + desc->irq_count = 0; + desc->irqs_unhandled = 0; +#ifdef CONFIG_SMP + desc->affinity = CPU_MASK_ALL; +#endif + spin_unlock_irqrestore(&desc->lock, flags); +} + +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); + WARN_ON(1); + return; + } + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + desc->handle_irq = handle_bad_irq; + desc->chip = &no_irq_chip; + spin_unlock_irqrestore(&desc->lock, flags); +} + + /** * set_irq_chip - set the irq chip for an irq * @irq: irq number -- cgit v1.2.3 From 1f80025e624bb14fefadfef7e80fbfb9740d4714 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Oct 2006 02:16:56 -0700 Subject: [PATCH] msi: simplify msi sanity checks by adding with generic irq code Currently msi.c is doing sanity checks that make certain before an irq is destroyed it has no more users. By adding irq_has_action I can perform the test is a generic way, instead of relying on a msi specific data structure. By performing the core check in dynamic_irq_cleanup I ensure every user of dynamic irqs has a test present and we don't free resources that are in use. In msi.c this allows me to kill the attrib.state member of msi_desc and all of the assciated code to maintain it. To keep from freeing data structures when irq cleanup code is called to soon changing dyanamic_irq_cleanup is insufficient because there are msi specific data structures that are also not safe to free. Signed-off-by: Eric W. Biederman Cc: Ingo Molnar Cc: Tony Luck Cc: Andi Kleen Cc: Thomas Gleixner Cc: Greg KH Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0dc24386dd99..4cf65f5c6a74 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -67,6 +67,13 @@ void dynamic_irq_cleanup(unsigned int irq) desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); + if (desc->action) { + spin_unlock_irqrestore(&desc->lock, flags); + printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n", + irq); + WARN_ON(1); + return; + } desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3 From 621934ee7ed5b073c7fd638b347e632c53572761 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Oct 2006 02:17:02 -0700 Subject: [PATCH] srcu-3: RCU variant permitting read-side blocking Updated patch adding a variant of RCU that permits sleeping in read-side critical sections. SRCU is as follows: o Each use of SRCU creates its own srcu_struct, and each srcu_struct has its own set of grace periods. This is critical, as it prevents one subsystem with a blocking reader from holding up SRCU grace periods for other subsystems. o The SRCU primitives (srcu_read_lock(), srcu_read_unlock(), and synchronize_srcu()) all take a pointer to a srcu_struct. o The SRCU primitives must be called from process context. o srcu_read_lock() returns an int that must be passed to the matching srcu_read_unlock(). Realtime RCU avoids the need for this by storing the state in the task struct, but SRCU needs to allow a given code path to pass through multiple SRCU domains -- storing state in the task struct would therefore require either arbitrary space in the task struct or arbitrary limits on SRCU nesting. So I kicked the state-storage problem up to the caller. Of course, it is not permitted to call synchronize_srcu() while in an SRCU read-side critical section. o There is no call_srcu(). It would not be hard to implement one, but it seems like too easy a way to OOM the system. (Hey, we have enough trouble with call_rcu(), which does -not- permit readers to sleep!!!) So, if you want it, please tell me why... [josht@us.ibm.com: sparse notation] Signed-off-by: Paul E. McKenney Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/srcu.c | 257 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 kernel/srcu.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index d948ca12acf0..5e3f3b75563a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/srcu.c b/kernel/srcu.c new file mode 100644 index 000000000000..7e1979f624ba --- /dev/null +++ b/kernel/srcu.c @@ -0,0 +1,257 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Paul McKenney + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +void init_srcu_struct(struct srcu_struct *sp) +{ + sp->completed = 0; + sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + mutex_init(&sp->mutex); +} + +/* + * srcu_readers_active_idx -- returns approximate number of readers + * active on the specified rank of per-CPU counters. + */ + +static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + int sum; + + sum = 0; + for_each_possible_cpu(cpu) + sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; + return sum; +} + +/** + * srcu_readers_active - returns approximate number of readers. + * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time. + */ +int srcu_readers_active(struct srcu_struct *sp) +{ + return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); +} + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + int sum; + + sum = srcu_readers_active(sp); + WARN_ON(sum); /* Leakage unless caller handles error. */ + if (sum != 0) + return; + free_percpu(sp->per_cpu_ref); + sp->per_cpu_ref = NULL; +} + +/** + * srcu_read_lock - register a new reader for an SRCU-protected structure. + * @sp: srcu_struct in which to register the new reader. + * + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + preempt_disable(); + idx = sp->completed & 0x1; + barrier(); /* ensure compiler looks -once- at sp->completed. */ + per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; + srcu_barrier(); /* ensure compiler won't misorder critical section. */ + preempt_enable(); + return idx; +} + +/** + * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. + * @sp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock(). + * + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + * Must be called from process context. + */ +void srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + preempt_disable(); + srcu_barrier(); /* ensure compiler won't misorder critical section. */ + per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; + preempt_enable(); +} + +/** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchornize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + int idx; + + idx = sp->completed; + mutex_lock(&sp->mutex); + + /* + * Check to see if someone else did the work for us while we were + * waiting to acquire the lock. We need -two- advances of + * the counter, not just one. If there was but one, we might have + * shown up -after- our helper's first synchronize_sched(), thus + * having failed to prevent CPU-reordering races with concurrent + * srcu_read_unlock()s on other CPUs (see comment below). So we + * either (1) wait for two or (2) supply the second ourselves. + */ + + if ((sp->completed - idx) >= 2) { + mutex_unlock(&sp->mutex); + return; + } + + synchronize_sched(); /* Force memory barrier on all CPUs. */ + + /* + * The preceding synchronize_sched() ensures that any CPU that + * sees the new value of sp->completed will also see any preceding + * changes to data structures made by this CPU. This prevents + * some other CPU from reordering the accesses in its SRCU + * read-side critical section to precede the corresponding + * srcu_read_lock() -- ensuring that such references will in + * fact be protected. + * + * So it is now safe to do the flip. + */ + + idx = sp->completed & 0x1; + sp->completed++; + + synchronize_sched(); /* Force memory barrier on all CPUs. */ + + /* + * At this point, because of the preceding synchronize_sched(), + * all srcu_read_lock() calls using the old counters have completed. + * Their corresponding critical sections might well be still + * executing, but the srcu_read_lock() primitives themselves + * will have finished executing. + */ + + while (srcu_readers_active_idx(sp, idx)) + schedule_timeout_interruptible(1); + + synchronize_sched(); /* Force memory barrier on all CPUs. */ + + /* + * The preceding synchronize_sched() forces all srcu_read_unlock() + * primitives that were executing concurrently with the preceding + * for_each_possible_cpu() loop to have completed by this point. + * More importantly, it also forces the corresponding SRCU read-side + * critical sections to have also completed, and the corresponding + * references to SRCU-protected data items to be dropped. + * + * Note: + * + * Despite what you might think at first glance, the + * preceding synchronize_sched() -must- be within the + * critical section ended by the following mutex_unlock(). + * Otherwise, a task taking the early exit can race + * with a srcu_read_unlock(), which might have executed + * just before the preceding srcu_readers_active() check, + * and whose CPU might have reordered the srcu_read_unlock() + * with the preceding critical section. In this case, there + * is nothing preventing the synchronize_sched() task that is + * taking the early exit from freeing a data structure that + * is still being referenced (out of order) by the task + * doing the srcu_read_unlock(). + * + * Alternatively, the comparison with "2" on the early exit + * could be changed to "3", but this increases synchronize_srcu() + * latency for bulk loads. So the current code is preferred. + */ + + mutex_unlock(&sp->mutex); +} + +/** + * srcu_batches_completed - return batches completed. + * @sp: srcu_struct on which to report batch completion. + * + * Report the number of batches, correlated with, but not necessarily + * precisely the same as, the number of grace periods that have elapsed. + */ + +long srcu_batches_completed(struct srcu_struct *sp) +{ + return sp->completed; +} + +EXPORT_SYMBOL_GPL(init_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(srcu_read_lock); +EXPORT_SYMBOL_GPL(srcu_read_unlock); +EXPORT_SYMBOL_GPL(synchronize_srcu); +EXPORT_SYMBOL_GPL(srcu_batches_completed); +EXPORT_SYMBOL_GPL(srcu_readers_active); -- cgit v1.2.3 From b2896d2e75c87ea6a842c088db730b03c91db737 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Oct 2006 02:17:03 -0700 Subject: [PATCH] srcu-3: add SRCU operations to rcutorture Adds SRCU operations to rcutorture and updates rcutorture documentation. Also increases the stress imposed by the rcutorture test. [bunk@stusta.de: make needlessly global code static] Signed-off-by: Paul E. McKenney Cc: Paul E. McKenney Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 123 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 23446e91cded..e34d22bf2293 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -44,6 +44,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); @@ -53,7 +54,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ -static char *torture_type = "rcu"; /* What to torture. */ +static char *torture_type = "rcu"; /* What to torture: rcu, srcu. */ module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); @@ -66,7 +67,7 @@ MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); module_param(shuffle_interval, int, 0); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); module_param(torture_type, charp, 0); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ @@ -104,11 +105,11 @@ static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; -atomic_t n_rcu_torture_alloc; -atomic_t n_rcu_torture_alloc_fail; -atomic_t n_rcu_torture_free; -atomic_t n_rcu_torture_mberror; -atomic_t n_rcu_torture_error; +static atomic_t n_rcu_torture_alloc; +static atomic_t n_rcu_torture_alloc_fail; +static atomic_t n_rcu_torture_free; +static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_error; /* * Allocate an element from the rcu_tortures pool. @@ -180,6 +181,7 @@ struct rcu_torture_ops { void (*init)(void); void (*cleanup)(void); int (*readlock)(void); + void (*readdelay)(struct rcu_random_state *rrsp); void (*readunlock)(int idx); int (*completed)(void); void (*deferredfree)(struct rcu_torture *p); @@ -198,6 +200,18 @@ static int rcu_torture_read_lock(void) __acquires(RCU) return 0; } +static void rcu_read_delay(struct rcu_random_state *rrsp) +{ + long delay; + const long longdelay = 200; + + /* We want there to be long-running readers, but not all the time. */ + + delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); + if (!delay) + udelay(longdelay); +} + static void rcu_torture_read_unlock(int idx) __releases(RCU) { rcu_read_unlock(); @@ -239,6 +253,7 @@ static struct rcu_torture_ops rcu_ops = { .init = NULL, .cleanup = NULL, .readlock = rcu_torture_read_lock, + .readdelay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, .deferredfree = rcu_torture_deferred_free, @@ -275,6 +290,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .init = NULL, .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferredfree = rcu_bh_torture_deferred_free, @@ -282,8 +298,105 @@ static struct rcu_torture_ops rcu_bh_ops = { .name = "rcu_bh" }; +/* + * Definitions for srcu torture testing. + */ + +static struct srcu_struct srcu_ctl; +static struct list_head srcu_removed; + +static void srcu_torture_init(void) +{ + init_srcu_struct(&srcu_ctl); + INIT_LIST_HEAD(&srcu_removed); +} + +static void srcu_torture_cleanup(void) +{ + synchronize_srcu(&srcu_ctl); + cleanup_srcu_struct(&srcu_ctl); +} + +static int srcu_torture_read_lock(void) +{ + return srcu_read_lock(&srcu_ctl); +} + +static void srcu_read_delay(struct rcu_random_state *rrsp) +{ + long delay; + const long uspertick = 1000000 / HZ; + const long longdelay = 10; + + /* We want there to be long-running readers, but not all the time. */ + + delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); + if (!delay) + schedule_timeout_interruptible(longdelay); +} + +static void srcu_torture_read_unlock(int idx) +{ + srcu_read_unlock(&srcu_ctl, idx); +} + +static int srcu_torture_completed(void) +{ + return srcu_batches_completed(&srcu_ctl); +} + +static void srcu_torture_deferred_free(struct rcu_torture *p) +{ + int i; + struct rcu_torture *rp; + struct rcu_torture *rp1; + + synchronize_srcu(&srcu_ctl); + list_add(&p->rtort_free, &srcu_removed); + list_for_each_entry_safe(rp, rp1, &srcu_removed, rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } +} + +static int srcu_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + int idx = srcu_ctl.completed & 0x1; + + cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); + for_each_possible_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} + +static struct rcu_torture_ops srcu_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .readdelay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferredfree = srcu_torture_deferred_free, + .stats = srcu_torture_stats, + .name = "srcu" +}; + static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_bh_ops, NULL }; + { &rcu_ops, &rcu_bh_ops, &srcu_ops, NULL }; /* * RCU torture writer kthread. Repeatedly substitutes a new structure @@ -359,7 +472,7 @@ rcu_torture_reader(void *arg) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - udelay(rcu_random(&rand) & 0x7f); + cur_ops->readdelay(&rand); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -483,7 +596,7 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. */ -void rcu_torture_shuffle_tasks(void) +static void rcu_torture_shuffle_tasks(void) { cpumask_t tmp_mask = CPU_MASK_ALL; int i; -- cgit v1.2.3 From eabc069401bcf45bcc3f19e643017bf761780aa8 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 4 Oct 2006 02:17:04 -0700 Subject: [PATCH] Add SRCU-based notifier chains This patch (as751) adds a new type of notifier chain, based on the SRCU (Sleepable Read-Copy Update) primitives recently added to the kernel. An SRCU notifier chain is much like a blocking notifier chain, in that it must be called in process context and its callout routines are allowed to sleep. The difference is that the chain's links are protected by the SRCU mechanism rather than by an rw-semaphore, so calling the chain has extremely low overhead: no memory barriers and no cache-line bouncing. On the other hand, unregistering from the chain is expensive and the chain head requires special runtime initialization (plus cleanup if it is to be deallocated). SRCU notifiers are appropriate for notifiers that will be called very frequently and for which unregistration occurs very seldom. The proposed "task notifier" scheme qualifies, as may some of the network notifiers. Signed-off-by: Alan Stern Acked-by: Paul E. McKenney Acked-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 2314867ae34f..fd5c71006775 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -153,7 +153,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, /* * Atomic notifier chain routines. Registration and unregistration - * use a mutex, and call_chain is synchronized by RCU (no locks). + * use a spinlock, and call_chain is synchronized by RCU (no locks). */ /** @@ -401,6 +401,128 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, EXPORT_SYMBOL_GPL(raw_notifier_call_chain); +/* + * SRCU notifier chain routines. Registration and unregistration + * use a mutex, and call_chain is synchronized by SRCU (no locks). + */ + +/** + * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an SRCU notifier chain. + * Must be called in process context. + * + * Currently always returns zero. + */ + +int srcu_notifier_chain_register(struct srcu_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call mutex_lock(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_register(&nh->head, n); + + mutex_lock(&nh->mutex); + ret = notifier_chain_register(&nh->head, n); + mutex_unlock(&nh->mutex); + return ret; +} + +EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); + +/** + * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from an SRCU notifier chain. + * Must be called from process context. + * + * Returns zero on success or %-ENOENT on failure. + */ +int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call mutex_lock(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_unregister(&nh->head, n); + + mutex_lock(&nh->mutex); + ret = notifier_chain_unregister(&nh->head, n); + mutex_unlock(&nh->mutex); + synchronize_srcu(&nh->srcu); + return ret; +} + +EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); + +/** + * srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in a notifier chain in turn. The functions + * run in a process context, so they are allowed to block. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ + +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) +{ + int ret; + int idx; + + idx = srcu_read_lock(&nh->srcu); + ret = notifier_call_chain(&nh->head, val, v); + srcu_read_unlock(&nh->srcu, idx); + return ret; +} + +EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); + +/** + * srcu_init_notifier_head - Initialize an SRCU notifier head + * @nh: Pointer to head of the srcu notifier chain + * + * Unlike other sorts of notifier heads, SRCU notifier heads require + * dynamic initialization. Be sure to call this routine before + * calling any of the other SRCU notifier routines for this head. + * + * If an SRCU notifier head is deallocated, it must first be cleaned + * up by calling srcu_cleanup_notifier_head(). Otherwise the head's + * per-cpu data (used by the SRCU mechanism) will leak. + */ + +void srcu_init_notifier_head(struct srcu_notifier_head *nh) +{ + mutex_init(&nh->mutex); + init_srcu_struct(&nh->srcu); + nh->head = NULL; +} + +EXPORT_SYMBOL_GPL(srcu_init_notifier_head); + /** * register_reboot_notifier - Register function to be called at reboot time * @nb: Info about notifier function to be called -- cgit v1.2.3 From e6a92013ba458804161c0c5b6d134d82204dc233 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 4 Oct 2006 02:17:05 -0700 Subject: [PATCH] SRCU: report out-of-memory errors Currently the init_srcu_struct() routine has no way to report out-of-memory errors. This patch (as761) makes it return -ENOMEM when the per-cpu data allocation fails. The patch also makes srcu_init_notifier_head() report a BUG if a notifier head can't be initialized. Perhaps it should return -ENOMEM instead, but in the most likely cases where this might occur I don't think any recovery is possible. Notifier chains generally are not created dynamically. [akpm@osdl.org: avoid statement-with-side-effect in macro] Signed-off-by: Alan Stern Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/srcu.c | 5 +++-- kernel/sys.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index 7e1979f624ba..3507cabe963b 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -42,11 +42,12 @@ * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ -void init_srcu_struct(struct srcu_struct *sp) +int init_srcu_struct(struct srcu_struct *sp) { sp->completed = 0; - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); mutex_init(&sp->mutex); + sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + return (sp->per_cpu_ref ? 0 : -ENOMEM); } /* diff --git a/kernel/sys.c b/kernel/sys.c index fd5c71006775..98489d82801b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -517,7 +517,8 @@ EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); void srcu_init_notifier_head(struct srcu_notifier_head *nh) { mutex_init(&nh->mutex); - init_srcu_struct(&nh->srcu); + if (init_srcu_struct(&nh->srcu) < 0) + BUG(); nh->head = NULL; } -- cgit v1.2.3 From ff2c93a5373f12f86f3281705d11278a9f2334e2 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 4 Oct 2006 02:17:09 -0700 Subject: [PATCH] rcu: Add MODULE_AUTHOR to rcutorture module Signed-off-by: Josh Triplett Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e34d22bf2293..7b965341b1dc 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -47,6 +47,7 @@ #include MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int stat_interval; /* Interval between stats, in seconds. */ -- cgit v1.2.3 From 3c29e03d9121e07714fb9e5303d9b026800ffd5a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 4 Oct 2006 02:17:10 -0700 Subject: [PATCH] rcu: Mention rcu_bh in description of rcutorture's torture_type parameter The comment for rcutorture's torture_type parameter only lists the RCU variants rcu and srcu, but not rcu_bh; add rcu_bh to the list. Signed-off-by: Josh Triplett Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 7b965341b1dc..021d3108bb6e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -55,7 +55,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ -static char *torture_type = "rcu"; /* What to torture: rcu, srcu. */ +static char *torture_type = "rcu"; /* What to torture: rcu, rcu_bh, srcu. */ module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -- cgit v1.2.3 From 2860aaba4dc87fa43c08724434b87a8650f3bff5 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 4 Oct 2006 02:17:11 -0700 Subject: [PATCH] rcu: Avoid kthread_stop on invalid pointer if rcutorture reader startup fails rcu_torture_init kmallocs the array of reader threads, then creates each one with kthread_run, cleaning up with rcu_torture_cleanup if this fails. rcu_torture_cleanup calls kthread_stop on any non-NULL pointer in the array; however, any readers after the one that failed to start up will have invalid pointers, not null pointers. Avoid this by using kzalloc instead. Signed-off-by: Josh Triplett Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 021d3108bb6e..42e7f01e8003 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -780,7 +780,7 @@ rcu_torture_init(void) writer_task = NULL; goto unwind; } - reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), + reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_PRINTK_ERRSTRING("out of memory"); -- cgit v1.2.3 From 75cfef32f26d03f5d0a0833572d52f94ad858a36 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 4 Oct 2006 02:17:12 -0700 Subject: [PATCH] rcu: Fix sign bug making rcu_random always return the same sequence rcu_random uses a counter rrs_count to occasionally mix data from get_random_bytes into the state of its pseudorandom generator. However, the rrs_counter gets declared as an unsigned long, and rcu_random checks for --rrs_count < 0, so this code will never mix any real random data into the state, and will thus always return the same sequence of random numbers. Also, change the return value of rcu_random from long to unsigned long, to avoid potential issues caused by the use of the % operator, which can return negative values for negative left operands. Signed-off-by: Josh Triplett Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 42e7f01e8003..43d6d4f9ef09 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -147,7 +147,7 @@ rcu_torture_free(struct rcu_torture *p) struct rcu_random_state { unsigned long rrs_state; - unsigned long rrs_count; + long rrs_count; }; #define RCU_RANDOM_MULT 39916801 /* prime */ @@ -160,7 +160,7 @@ struct rcu_random_state { * Crude but fast random-number generator. Uses a linear congruential * generator, with occasional help from get_random_bytes(). */ -static long +static unsigned long rcu_random(struct rcu_random_state *rrsp) { long refresh; -- cgit v1.2.3 From b772e1dd4b1e60a7a160f7bd4ea08e28394ceb54 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 4 Oct 2006 02:17:13 -0700 Subject: [PATCH] RCU: add fake writers to rcutorture rcutorture currently has one writer and an arbitrary number of readers. To better exercise some of the code paths in RCU implementations, add fake writer threads which call the synchronize function for the RCU variant in a loop, with a delay between calls to arrange for different numbers of writers running in parallel. [bunk@stusta.de: cleanup] Acked-by: Paul McKenney Cc: Dipkanar Sarma Signed-off-by: Josh Triplett Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 104 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 43d6d4f9ef09..e0450210ce4d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -15,9 +15,10 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright (C) IBM Corporation, 2005 + * Copyright (C) IBM Corporation, 2005, 2006 * * Authors: Paul E. McKenney + * Josh Triplett * * See also: Documentation/RCU/torture.txt */ @@ -47,9 +48,11 @@ #include MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney "); +MODULE_AUTHOR("Paul E. McKenney and " + "Josh Triplett "); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ +static int nfakewriters = 4; /* # fake writer threads */ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ static int verbose; /* Print more debug info. */ @@ -59,6 +62,8 @@ static char *torture_type = "rcu"; /* What to torture: rcu, rcu_bh, srcu. */ module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +module_param(nfakewriters, int, 0); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); module_param(stat_interval, int, 0); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); module_param(verbose, bool, 0); @@ -82,6 +87,7 @@ static char printk_buf[4096]; static int nrealreaders; static struct task_struct *writer_task; +static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *shuffler_task; @@ -186,6 +192,7 @@ struct rcu_torture_ops { void (*readunlock)(int idx); int (*completed)(void); void (*deferredfree)(struct rcu_torture *p); + void (*sync)(void); int (*stats)(char *page); char *name; }; @@ -258,6 +265,7 @@ static struct rcu_torture_ops rcu_ops = { .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, .deferredfree = rcu_torture_deferred_free, + .sync = synchronize_rcu, .stats = NULL, .name = "rcu" }; @@ -287,6 +295,28 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); } +struct rcu_bh_torture_synchronize { + struct rcu_head head; + struct completion completion; +}; + +static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) +{ + struct rcu_bh_torture_synchronize *rcu; + + rcu = container_of(head, struct rcu_bh_torture_synchronize, head); + complete(&rcu->completion); +} + +static void rcu_bh_torture_synchronize(void) +{ + struct rcu_bh_torture_synchronize rcu; + + init_completion(&rcu.completion); + call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); + wait_for_completion(&rcu.completion); +} + static struct rcu_torture_ops rcu_bh_ops = { .init = NULL, .cleanup = NULL, @@ -295,6 +325,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferredfree = rcu_bh_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, .stats = NULL, .name = "rcu_bh" }; @@ -367,6 +398,11 @@ static void srcu_torture_deferred_free(struct rcu_torture *p) } } +static void srcu_torture_synchronize(void) +{ + synchronize_srcu(&srcu_ctl); +} + static int srcu_torture_stats(char *page) { int cnt = 0; @@ -392,6 +428,7 @@ static struct rcu_torture_ops srcu_ops = { .readunlock = srcu_torture_read_unlock, .completed = srcu_torture_completed, .deferredfree = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, .stats = srcu_torture_stats, .name = "srcu" }; @@ -443,6 +480,30 @@ rcu_torture_writer(void *arg) return 0; } +/* + * RCU torture fake writer kthread. Repeatedly calls sync, with a random + * delay between calls. + */ +static int +rcu_torture_fakewriter(void *arg) +{ + DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); + udelay(rcu_random(&rand) & 0x3ff); + cur_ops->sync(); + } while (!kthread_should_stop() && !fullstop); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + /* * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -621,6 +682,12 @@ static void rcu_torture_shuffle_tasks(void) set_cpus_allowed(reader_tasks[i], tmp_mask); } + if (fakewriter_tasks != NULL) { + for (i = 0; i < nfakewriters; i++) + if (fakewriter_tasks[i]) + set_cpus_allowed(fakewriter_tasks[i], tmp_mask); + } + if (writer_task) set_cpus_allowed(writer_task, tmp_mask); @@ -654,11 +721,12 @@ rcu_torture_shuffle(void *arg) static inline void rcu_torture_print_module_parms(char *tag) { - printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " + printk(KERN_ALERT "%s" TORTURE_FLAG + "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " "shuffle_interval = %d\n", - torture_type, tag, nrealreaders, stat_interval, verbose, - test_no_idle_hz, shuffle_interval); + torture_type, tag, nrealreaders, nfakewriters, + stat_interval, verbose, test_no_idle_hz, shuffle_interval); } static void @@ -693,6 +761,19 @@ rcu_torture_cleanup(void) } rcu_torture_current = NULL; + if (fakewriter_tasks != NULL) { + for (i = 0; i < nfakewriters; i++) { + if (fakewriter_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_fakewriter task"); + kthread_stop(fakewriter_tasks[i]); + } + fakewriter_tasks[i] = NULL; + } + kfree(fakewriter_tasks); + fakewriter_tasks = NULL; + } + if (stats_task != NULL) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); kthread_stop(stats_task); @@ -780,6 +861,24 @@ rcu_torture_init(void) writer_task = NULL; goto unwind; } + fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nfakewriters; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); + fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, + "rcu_torture_fakewriter"); + if (IS_ERR(fakewriter_tasks[i])) { + firsterr = PTR_ERR(fakewriter_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); + fakewriter_tasks[i] = NULL; + goto unwind; + } + } reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { -- cgit v1.2.3 From e3033736581f125ba5fd6c0760e0d430d54fb5c0 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 4 Oct 2006 02:17:14 -0700 Subject: [PATCH] rcu: refactor srcu_torture_deferred_free to work for any implementation Make srcu_torture_deferred_free use cur_ops->sync() so it will work for any implementation. Move and rename it in preparation for use in the ops of other implementations. Signed-off-by: Josh Triplett Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e0450210ce4d..6e2f0a8344c2 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -117,6 +117,7 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; +static struct list_head rcu_torture_removed; /* * Allocate an element from the rcu_tortures pool. @@ -270,6 +271,32 @@ static struct rcu_torture_ops rcu_ops = { .name = "rcu" }; +static void rcu_sync_torture_deferred_free(struct rcu_torture *p) +{ + int i; + struct rcu_torture *rp; + struct rcu_torture *rp1; + + cur_ops->sync(); + list_add(&p->rtort_free, &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } +} + +static void rcu_sync_torture_init(void) +{ + INIT_LIST_HEAD(&rcu_torture_removed); +} + /* * Definitions for rcu_bh torture testing. */ @@ -335,12 +362,11 @@ static struct rcu_torture_ops rcu_bh_ops = { */ static struct srcu_struct srcu_ctl; -static struct list_head srcu_removed; static void srcu_torture_init(void) { init_srcu_struct(&srcu_ctl); - INIT_LIST_HEAD(&srcu_removed); + rcu_sync_torture_init(); } static void srcu_torture_cleanup(void) @@ -377,27 +403,6 @@ static int srcu_torture_completed(void) return srcu_batches_completed(&srcu_ctl); } -static void srcu_torture_deferred_free(struct rcu_torture *p) -{ - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - - synchronize_srcu(&srcu_ctl); - list_add(&p->rtort_free, &srcu_removed); - list_for_each_entry_safe(rp, rp1, &srcu_removed, rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } -} - static void srcu_torture_synchronize(void) { synchronize_srcu(&srcu_ctl); @@ -427,7 +432,7 @@ static struct rcu_torture_ops srcu_ops = { .readdelay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, .completed = srcu_torture_completed, - .deferredfree = srcu_torture_deferred_free, + .deferredfree = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize, .stats = srcu_torture_stats, .name = "srcu" -- cgit v1.2.3 From 20d2e4283a97665a3db78c60dfa342a0c7c1b180 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 4 Oct 2006 02:17:15 -0700 Subject: [PATCH] rcu: add rcu_sync torture type to rcutorture Use the newly-generic synchronous deferred free function to implement torture testing for RCU using synchronize_rcu rather than the asynchronous call_rcu. Signed-off-by: Josh Triplett Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6e2f0a8344c2..1c329df60bcc 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -58,7 +58,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ -static char *torture_type = "rcu"; /* What to torture: rcu, rcu_bh, srcu. */ +static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); @@ -297,6 +297,19 @@ static void rcu_sync_torture_init(void) INIT_LIST_HEAD(&rcu_torture_removed); } +static struct rcu_torture_ops rcu_sync_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .readdelay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu, + .stats = NULL, + .name = "rcu_sync" +}; + /* * Definitions for rcu_bh torture testing. */ @@ -439,7 +452,7 @@ static struct rcu_torture_ops srcu_ops = { }; static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_bh_ops, &srcu_ops, NULL }; + { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &srcu_ops, NULL }; /* * RCU torture writer kthread. Repeatedly substitutes a new structure -- cgit v1.2.3 From 11a147013e39ff4cb031395cb78a9d307c4799cd Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 4 Oct 2006 02:17:16 -0700 Subject: [PATCH] rcu: add rcu_bh_sync torture type to rcutorture Use the newly-generic synchronous deferred free function to implement torture testing for rcu_bh using synchronize_rcu_bh rather than the asynchronous call_rcu_bh. Signed-off-by: Josh Triplett Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 1c329df60bcc..0f0ff1556b5d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -370,6 +370,19 @@ static struct rcu_torture_ops rcu_bh_ops = { .name = "rcu_bh" }; +static struct rcu_torture_ops rcu_bh_sync_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .stats = NULL, + .name = "rcu_bh_sync" +}; + /* * Definitions for srcu torture testing. */ @@ -452,7 +465,8 @@ static struct rcu_torture_ops srcu_ops = { }; static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &srcu_ops, NULL }; + { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, + NULL }; /* * RCU torture writer kthread. Repeatedly substitutes a new structure -- cgit v1.2.3 From 4b6c2cca6eef9cc4a15350bf1c61839e12e08b84 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 4 Oct 2006 02:17:16 -0700 Subject: [PATCH] rcu: add sched torture type to rcutorture Implement torture testing for the "sched" variant of RCU, which uses preempt_disable, preempt_enable, and synchronize_sched. Signed-off-by: Josh Triplett Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 0f0ff1556b5d..e2bda18f6f42 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -464,9 +464,47 @@ static struct rcu_torture_ops srcu_ops = { .name = "srcu" }; +/* + * Definitions for sched torture testing. + */ + +static int sched_torture_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_torture_read_unlock(int idx) +{ + preempt_enable(); +} + +static int sched_torture_completed(void) +{ + return 0; +} + +static void sched_torture_synchronize(void) +{ + synchronize_sched(); +} + +static struct rcu_torture_ops sched_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = sched_torture_synchronize, + .stats = NULL, + .name = "sched" +}; + static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, - NULL }; + &sched_ops, NULL }; /* * RCU torture writer kthread. Repeatedly substitutes a new structure -- cgit v1.2.3 From 20e9751bd9dd6b832fd84ada27840360f7e877f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Oct 2006 02:17:17 -0700 Subject: [PATCH] rcu: simplify/improve batch tuning Kill a hard-to-calculate 'rsinterval' boot parameter and per-cpu rcu_data.last_rs_qlen. Instead, it adds adds a flag rcu_ctrlblk.signaled, which records the fact that one of CPUs has sent a resched IPI since the last rcu_start_batch(). Roughly speaking, we need two rcu_start_batch()s in order to move callbacks from ->nxtlist to ->donelist. This means that when ->qlen exceeds qhimark and continues to grow, we should send a resched IPI, and then do it again after we gone through a quiescent state. On the other hand, if it was already sent, we don't need to do it again when another CPU detects overflow of the queue. Signed-off-by: Oleg Nesterov Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 523e46483b99..26bb5ffe1ef1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -71,9 +71,6 @@ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int blimit = 10; static int qhimark = 10000; static int qlowmark = 100; -#ifdef CONFIG_SMP -static int rsinterval = 1000; -#endif static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -86,8 +83,8 @@ static void force_quiescent_state(struct rcu_data *rdp, int cpu; cpumask_t cpumask; set_need_resched(); - if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) { - rdp->last_rs_qlen = rdp->qlen; + if (unlikely(!rcp->signaled)) { + rcp->signaled = 1; /* * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. @@ -301,6 +298,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) smp_mb(); cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + rcp->signaled = 0; } } @@ -628,9 +626,6 @@ void synchronize_rcu(void) module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); -#ifdef CONFIG_SMP -module_param(rsinterval, int, 0); -#endif EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); EXPORT_SYMBOL_GPL(call_rcu); -- cgit v1.2.3 From 57a58a9435aef3e0342ba4b2c97e0ddfea6f2c7f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 5 Oct 2006 13:06:34 +0100 Subject: IRQ: Typedef the IRQ flow handler function type Typedef the IRQ flow handler function type. Signed-Off-By: David Howells (cherry picked from 8e973fbdf5716b93a0a8c0365be33a31ca0fa351 commit) --- kernel/irq/chip.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4cf65f5c6a74..53e9dce6c657 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -505,10 +505,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) #endif /* CONFIG_SMP */ void -__set_irq_handler(unsigned int irq, - void fastcall (*handle)(unsigned int, irq_desc_t *, - struct pt_regs *), - int is_chained) +__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained) { struct irq_desc *desc; unsigned long flags; @@ -561,9 +558,7 @@ __set_irq_handler(unsigned int irq, void set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - void fastcall (*handle)(unsigned int, - struct irq_desc *, - struct pt_regs *)) + irq_flow_handler_t handle) { set_irq_chip(irq, chip); __set_irq_handler(irq, handle, 0); @@ -574,8 +569,7 @@ set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, * /proc/interrupts output: */ const char * -handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *, - struct pt_regs *)) +handle_irq_name(irq_flow_handler_t handle) { if (handle == handle_level_irq) return "level "; -- cgit v1.2.3 From da482792a6d1a3fbaaa25fae867b343fb4db3246 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 5 Oct 2006 13:06:34 +0100 Subject: IRQ: Typedef the IRQ handler function type Typedef the IRQ handler function type. Signed-Off-By: David Howells (cherry picked from 1356d1e5fd256997e3d3dce0777ab787d0515c7a commit) --- kernel/irq/manage.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 92be519eff26..6879202afe9a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -427,8 +427,7 @@ EXPORT_SYMBOL(free_irq); * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * */ -int request_irq(unsigned int irq, - irqreturn_t (*handler)(int, void *, struct pt_regs *), +int request_irq(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; @@ -475,4 +474,3 @@ int request_irq(unsigned int irq, return retval; } EXPORT_SYMBOL(request_irq); - -- cgit v1.2.3 From 7d12e780e003f93433d49ce78cfedf4b4c52adc5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 5 Oct 2006 14:55:46 +0100 Subject: IRQ: Maintain regs pointer globally rather than passing to IRQ handlers Maintain a per-CPU global "struct pt_regs *" variable which can be used instead of passing regs around manually through all ~1800 interrupt handlers in the Linux kernel. The regs pointer is used in few places, but it potentially costs both stack space and code to pass it around. On the FRV arch, removing the regs parameter from all the genirq function results in a 20% speed up of the IRQ exit path (ie: from leaving timer_interrupt() to leaving do_IRQ()). Where appropriate, an arch may override the generic storage facility and do something different with the variable. On FRV, for instance, the address is maintained in GR28 at all times inside the kernel as part of general exception handling. Having looked over the code, it appears that the parameter may be handed down through up to twenty or so layers of functions. Consider a USB character device attached to a USB hub, attached to a USB controller that posts its interrupts through a cascaded auxiliary interrupt controller. A character device driver may want to pass regs to the sysrq handler through the input layer which adds another few layers of parameter passing. I've build this code with allyesconfig for x86_64 and i386. I've runtested the main part of the code on FRV and i386, though I can't test most of the drivers. I've also done partial conversion for powerpc and MIPS - these at least compile with minimal configurations. This will affect all archs. Mostly the changes should be relatively easy. Take do_IRQ(), store the regs pointer at the beginning, saving the old one: struct pt_regs *old_regs = set_irq_regs(regs); And put the old one back at the end: set_irq_regs(old_regs); Don't pass regs through to generic_handle_irq() or __do_IRQ(). In timer_interrupt(), this sort of change will be necessary: - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING, regs); + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); I'd like to move update_process_times()'s use of get_irq_regs() into itself, except that i386, alone of the archs, uses something other than user_mode(). Some notes on the interrupt handling in the drivers: (*) input_dev() is now gone entirely. The regs pointer is no longer stored in the input_dev struct. (*) finish_unlinks() in drivers/usb/host/ohci-q.c needs checking. It does something different depending on whether it's been supplied with a regs pointer or not. (*) Various IRQ handler function pointers have been moved to type irq_handler_t. Signed-Off-By: David Howells (cherry picked from 1b16e7ac850969f38b375e511e3fa2f474a33867 commit) --- kernel/irq/chip.c | 36 +++++++++++++++--------------------- kernel/irq/handle.c | 19 ++++++++----------- kernel/irq/spurious.c | 10 +++++----- kernel/power/poweroff.c | 3 +-- kernel/profile.c | 5 ++++- 5 files changed, 33 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 53e9dce6c657..11c99697acfe 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -249,7 +249,6 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) * handle_simple_irq - Simple and software-decoded IRQs. * @irq: the interrupt number * @desc: the interrupt description structure for this irq - * @regs: pointer to a register structure * * Simple interrupts are either sent from a demultiplexing interrupt * handler or come from hardware, where no interrupt hardware control @@ -259,7 +258,7 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) * unmask issues if necessary. */ void fastcall -handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +handle_simple_irq(unsigned int irq, struct irq_desc *desc) { struct irqaction *action; irqreturn_t action_ret; @@ -279,9 +278,9 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) desc->status |= IRQ_INPROGRESS; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, regs, action); + action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; @@ -293,7 +292,6 @@ out_unlock: * handle_level_irq - Level type irq handler * @irq: the interrupt number * @desc: the interrupt description structure for this irq - * @regs: pointer to a register structure * * Level type interrupts are active as long as the hardware line has * the active level. This may require to mask the interrupt and unmask @@ -301,7 +299,7 @@ out_unlock: * interrupt line is back to inactive. */ void fastcall -handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +handle_level_irq(unsigned int irq, struct irq_desc *desc) { unsigned int cpu = smp_processor_id(); struct irqaction *action; @@ -329,9 +327,9 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, regs, action); + action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; @@ -345,7 +343,6 @@ out_unlock: * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number * @desc: the interrupt description structure for this irq - * @regs: pointer to a register structure * * Only a single callback will be issued to the chip: an ->eoi() * call when the interrupt has been serviced. This enables support @@ -353,8 +350,7 @@ out_unlock: * details in hardware, transparently. */ void fastcall -handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc, - struct pt_regs *regs) +handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { unsigned int cpu = smp_processor_id(); struct irqaction *action; @@ -382,9 +378,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc, desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, regs, action); + action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; @@ -398,7 +394,6 @@ out: * handle_edge_irq - edge type IRQ handler * @irq: the interrupt number * @desc: the interrupt description structure for this irq - * @regs: pointer to a register structure * * Interrupt occures on the falling and/or rising edge of a hardware * signal. The occurence is latched into the irq controller hardware @@ -412,7 +407,7 @@ out: * loop is left. */ void fastcall -handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +handle_edge_irq(unsigned int irq, struct irq_desc *desc) { const unsigned int cpu = smp_processor_id(); @@ -463,9 +458,9 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, regs, action); + action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); spin_lock(&desc->lock); } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); @@ -480,12 +475,11 @@ out_unlock: * handle_percpu_IRQ - Per CPU local irq handler * @irq: the interrupt number * @desc: the interrupt description structure for this irq - * @regs: pointer to a register structure * * Per CPU interrupts on SMP machines without locking requirements */ void fastcall -handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; @@ -494,9 +488,9 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) if (desc->chip->ack) desc->chip->ack(irq); - action_ret = handle_IRQ_event(irq, regs, desc->action); + action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); if (desc->chip->eoi) desc->chip->eoi(irq); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 4c6cdbaed661..42aa6f1a3f0f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -27,7 +27,7 @@ * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ void fastcall -handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); kstat_this_cpu.irqs[irq]++; @@ -115,7 +115,7 @@ struct irq_chip dummy_irq_chip = { /* * Special, empty irq handler: */ -irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) +irqreturn_t no_action(int cpl, void *dev_id) { return IRQ_NONE; } @@ -123,13 +123,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number - * @regs: pointer to a register structure * @action: the interrupt action chain for this irq * * Handles the action chain of an irq event */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, - struct irqaction *action) +irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; @@ -140,7 +138,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, local_irq_enable_in_hardirq(); do { - ret = action->handler(irq, action->dev_id, regs); + ret = action->handler(irq, action->dev_id); if (ret == IRQ_HANDLED) status |= action->flags; retval |= ret; @@ -158,7 +156,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, /** * __do_IRQ - original all in one highlevel IRQ handler * @irq: the interrupt number - * @regs: pointer to a register structure * * __do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific @@ -167,7 +164,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, * This is the original x86 implementation which is used for every * interrupt type. */ -fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) +fastcall unsigned int __do_IRQ(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; struct irqaction *action; @@ -182,7 +179,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) */ if (desc->chip->ack) desc->chip->ack(irq); - action_ret = handle_IRQ_event(irq, regs, desc->action); + action_ret = handle_IRQ_event(irq, desc->action); desc->chip->end(irq); return 1; } @@ -233,11 +230,11 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, regs, action); + action_ret = handle_IRQ_event(irq, action); spin_lock(&desc->lock); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); if (likely(!(desc->status & IRQ_PENDING))) break; desc->status &= ~IRQ_PENDING; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 417e98092cf2..543ea2e5ad93 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -16,7 +16,7 @@ static int irqfixup __read_mostly; /* * Recovery handler for misrouted interrupts. */ -static int misrouted_irq(int irq, struct pt_regs *regs) +static int misrouted_irq(int irq) { int i; int ok = 0; @@ -49,7 +49,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs) while (action) { /* Only shared IRQ handlers are safe to call */ if (action->flags & IRQF_SHARED) { - if (action->handler(i, action->dev_id, regs) == + if (action->handler(i, action->dev_id) == IRQ_HANDLED) ok = 1; } @@ -70,7 +70,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs) */ work = 1; spin_unlock(&desc->lock); - handle_IRQ_event(i, regs, action); + handle_IRQ_event(i, action); spin_lock(&desc->lock); desc->status &= ~IRQ_PENDING; } @@ -136,7 +136,7 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) } void note_interrupt(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret, struct pt_regs *regs) + irqreturn_t action_ret) { if (unlikely(action_ret != IRQ_HANDLED)) { desc->irqs_unhandled++; @@ -147,7 +147,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (unlikely(irqfixup)) { /* Don't punish working computers */ if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { - int ok = misrouted_irq(irq, regs); + int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) desc->irqs_unhandled -= ok; } diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 7a4144ba3afd..f1f900ac3164 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -23,8 +23,7 @@ static void do_poweroff(void *dummy) static DECLARE_WORK(poweroff_work, do_poweroff, NULL); -static void handle_poweroff(int key, struct pt_regs *pt_regs, - struct tty_struct *tty) +static void handle_poweroff(int key, struct tty_struct *tty) { schedule_work(&poweroff_work); } diff --git a/kernel/profile.c b/kernel/profile.c index fb660c7d35ba..857300a2afec 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -25,6 +25,7 @@ #include #include #include +#include struct profile_hit { u32 pc, hits; @@ -366,8 +367,10 @@ void profile_hit(int type, void *__pc) } #endif /* !CONFIG_SMP */ -void profile_tick(int type, struct pt_regs *regs) +void profile_tick(int type) { + struct pt_regs *regs = get_irq_regs(); + if (type == CPU_PROFILING && timer_hook) timer_hook(regs); if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) -- cgit v1.2.3 From 4899b8b16b302299cc91289f7b5bac295e9ab387 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Oct 2006 00:43:48 -0700 Subject: [PATCH] kauditd_thread warning fix Squash this warning: kernel/audit.c: In function 'kauditd_thread': kernel/audit.c:367: warning: no return statement in function returning non-void We might as test kthread_should_stop(), although it's not very pointful at present. The code which starts this thread looks racy - the kernel could start multiple threads. Cc: Al Viro Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f9889ee77825..98106f6078b0 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -340,7 +340,7 @@ static int kauditd_thread(void *dummy) { struct sk_buff *skb; - while (1) { + while (!kthread_should_stop()) { skb = skb_dequeue(&audit_skb_queue); wake_up(&audit_backlog_wait); if (skb) { @@ -369,6 +369,7 @@ static int kauditd_thread(void *dummy) remove_wait_queue(&kauditd_wait, &wait); } } + return 0; } int audit_send_list(void *_dest) -- cgit v1.2.3 From e317c8ccaaf900abf39cc3240e4dc5ba82a3cc67 Mon Sep 17 00:00:00 2001 From: Frederik Deweerdt Date: Fri, 6 Oct 2006 18:58:24 +0000 Subject: [PATCH] ixp4xxdefconfig arm fixes With the following patch, the ixp4xxdefconfig builds correctly. I'll test some more configs if I get some time. Signed-off-by: Frederik Deweerdt Signed-off-by: Linus Torvalds --- kernel/irq/resend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 35f10f7ff94a..5bfeaed7e487 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg) clear_bit(irq, irqs_resend); desc = irq_desc + irq; local_irq_disable(); - desc->handle_irq(irq, desc, NULL); + desc->handle_irq(irq, desc); local_irq_enable(); } } -- cgit v1.2.3 From 5c339d4541995df2fd3ca31a84c042e7afe9b3c1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Oct 2006 22:19:44 -0700 Subject: [PATCH] swsusp: Make userland suspend work on SMP again Unfortunately one of the recent changes in swsusp has broken the userland suspend on SMP. Fix it. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 72825c853cd7..93b5dd283dea 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -145,10 +145,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = freeze_processes(); if (error) { thaw_processes(); + enable_nonboot_cpus(); error = -EBUSY; } } - enable_nonboot_cpus(); up(&pm_sem); if (!error) data->frozen = 1; -- cgit v1.2.3 From ba46df984b8e8114c3cf19c51670fab084bd4196 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Oct 2006 22:46:07 +0100 Subject: [PATCH] __user annotations: futex Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/futex.c | 15 ++++++++------- kernel/futex_compat.c | 12 ++++++------ 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4aaf91951a43..b364e0026191 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1612,10 +1612,10 @@ sys_set_robust_list(struct robust_list_head __user *head, * @len_ptr: pointer to a length field, the kernel fills in the header size */ asmlinkage long -sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, +sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, size_t __user *len_ptr) { - struct robust_list_head *head; + struct robust_list_head __user *head; unsigned long ret; if (!pid) @@ -1694,14 +1694,15 @@ retry: * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int fetch_robust_entry(struct robust_list __user **entry, - struct robust_list __user **head, int *pi) + struct robust_list __user * __user *head, + int *pi) { unsigned long uentry; - if (get_user(uentry, (unsigned long *)head)) + if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; - *entry = (void *)(uentry & ~1UL); + *entry = (void __user *)(uentry & ~1UL); *pi = uentry & 1; return 0; @@ -1739,7 +1740,7 @@ void exit_robust_list(struct task_struct *curr) return; if (pending) - handle_futex_death((void *)pending + futex_offset, curr, pip); + handle_futex_death((void __user *)pending + futex_offset, curr, pip); while (entry != &head->list) { /* @@ -1747,7 +1748,7 @@ void exit_robust_list(struct task_struct *curr) * don't process it twice: */ if (entry != pending) - if (handle_futex_death((void *)entry + futex_offset, + if (handle_futex_death((void __user *)entry + futex_offset, curr, pi)) return; /* diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index c5cca3f65cb7..50f24eea6cd0 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -18,7 +18,7 @@ */ static inline int fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t *head, int *pi) + compat_uptr_t __user *head, int *pi) { if (get_user(*uentry, head)) return -EFAULT; @@ -62,7 +62,7 @@ void compat_exit_robust_list(struct task_struct *curr) &head->list_op_pending, &pip)) return; if (upending) - handle_futex_death((void *)pending + futex_offset, curr, pip); + handle_futex_death((void __user *)pending + futex_offset, curr, pip); while (compat_ptr(uentry) != &head->list) { /* @@ -70,7 +70,7 @@ void compat_exit_robust_list(struct task_struct *curr) * dont process it twice: */ if (entry != pending) - if (handle_futex_death((void *)entry + futex_offset, + if (handle_futex_death((void __user *)entry + futex_offset, curr, pi)) return; @@ -78,7 +78,7 @@ void compat_exit_robust_list(struct task_struct *curr) * Fetch the next entry in the list: */ if (fetch_robust_entry(&uentry, &entry, - (compat_uptr_t *)&entry->next, &pi)) + (compat_uptr_t __user *)&entry->next, &pi)) return; /* * Avoid excessively long or circular lists: @@ -103,10 +103,10 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head, } asmlinkage long -compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr, +compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, compat_size_t __user *len_ptr) { - struct compat_robust_list_head *head; + struct compat_robust_list_head __user *head; unsigned long ret; if (!pid) -- cgit v1.2.3 From ba2397efe10ba728c7ff22b179e218c292227c13 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Oct 2006 22:46:57 +0100 Subject: [PATCH] make kernel/relay.c __user-clean Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/relay.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 1d63ecddfa70..f04bbdb56ac2 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -887,7 +887,7 @@ static int subbuf_read_actor(size_t read_start, from = buf->start + read_start; ret = avail; - if (copy_to_user(desc->arg.data, from, avail)) { + if (copy_to_user(desc->arg.buf, from, avail)) { desc->error = -EFAULT; ret = 0; } @@ -946,24 +946,17 @@ typedef int (*subbuf_actor_t) (size_t read_start, */ static inline ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, - size_t count, subbuf_actor_t subbuf_actor, read_actor_t actor, - void *target) + read_descriptor_t *desc) { struct rchan_buf *buf = filp->private_data; size_t read_start, avail; - read_descriptor_t desc; int ret; - if (!count) + if (!desc->count) return 0; - desc.written = 0; - desc.count = count; - desc.arg.data = target; - desc.error = 0; - mutex_lock(&filp->f_dentry->d_inode->i_mutex); do { if (!relay_file_read_avail(buf, *ppos)) @@ -974,19 +967,19 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, if (!avail) break; - avail = min(desc.count, avail); - ret = subbuf_actor(read_start, buf, avail, &desc, actor); - if (desc.error < 0) + avail = min(desc->count, avail); + ret = subbuf_actor(read_start, buf, avail, desc, actor); + if (desc->error < 0) break; if (ret) { relay_file_read_consume(buf, read_start, ret); *ppos = relay_file_read_end_pos(buf, read_start, ret); } - } while (desc.count && ret); + } while (desc->count && ret); mutex_unlock(&filp->f_dentry->d_inode->i_mutex); - return desc.written; + return desc->written; } static ssize_t relay_file_read(struct file *filp, @@ -994,8 +987,13 @@ static ssize_t relay_file_read(struct file *filp, size_t count, loff_t *ppos) { - return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor, - NULL, buffer); + read_descriptor_t desc; + desc.written = 0; + desc.count = count; + desc.arg.buf = buffer; + desc.error = 0; + return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, + NULL, &desc); } static ssize_t relay_file_sendfile(struct file *filp, @@ -1004,8 +1002,13 @@ static ssize_t relay_file_sendfile(struct file *filp, read_actor_t actor, void *target) { - return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor, - actor, target); + read_descriptor_t desc; + desc.written = 0; + desc.count = count; + desc.arg.data = target; + desc.error = 0; + return relay_file_read_subbufs(filp, ppos, subbuf_send_actor, + actor, &desc); } struct file_operations relay_file_operations = { -- cgit v1.2.3 From 1af9892811ad3bf2abb31566bfb2ec0b5070a66a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Oct 2006 22:48:57 +0100 Subject: [PATCH] cpuset ANSI prototype Signed-off-by: Al Viro Acked-by: Paul Jackson Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9d850ae13b1b..6313c38c930e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2137,7 +2137,7 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb, * See also the previous routine cpuset_handle_cpuhp(). */ -void cpuset_track_online_nodes() +void cpuset_track_online_nodes(void) { common_cpu_mem_hotplug_unplug(); } -- cgit v1.2.3 From 4dfbb9d8c6cbfc32faa5c71145bd2a43e1f8237c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2006 01:45:14 -0400 Subject: Lockdep: add lockdep_set_class_and_subclass() and lockdep_set_subclass() This annotation makes it possible to assign a subclass on lock init. This annotation is meant to reduce the _nested() annotations by assigning a default subclass. One could do without this annotation and rely on lockdep_set_class() exclusively, but that would require a manual stack of struct lock_class_key objects. Signed-off-by: Peter Zijlstra Signed-off-by: Dmitry Torokhov --- kernel/lockdep.c | 10 ++++++---- kernel/mutex-debug.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 4c0553461000..ba7156ac70c1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1177,7 +1177,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * itself, so actual lookup of the hash should be once per lock object. */ static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass) +register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; struct list_head *hash_head; @@ -1249,7 +1249,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) out_unlock_set: __raw_spin_unlock(&hash_lock); - if (!subclass) + if (!subclass || force) lock->class_cache = class; DEBUG_LOCKS_WARN_ON(class->subclass != subclass); @@ -1937,7 +1937,7 @@ void trace_softirqs_off(unsigned long ip) * Initialize a lock instance's lock-class mapping info: */ void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key, int subclass) { if (unlikely(!debug_locks)) return; @@ -1957,6 +1957,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; lock->key = key; lock->class_cache = NULL; + if (subclass) + register_lock_class(lock, subclass, 1); } EXPORT_SYMBOL_GPL(lockdep_init_map); @@ -1995,7 +1997,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * Not cached yet or subclass? */ if (unlikely(!class)) { - class = register_lock_class(lock, subclass); + class = register_lock_class(lock, subclass, 0); if (!class) return 0; } diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index e3203c654dda..18651641a7b5 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -91,7 +91,7 @@ void debug_mutex_init(struct mutex *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key); + lockdep_init_map(&lock->dep_map, name, key, 0); #endif lock->owner = NULL; lock->magic = lock; -- cgit v1.2.3 From 97c7801cd5b0bb6a38c16108a496235474dc6310 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 11 Oct 2006 01:20:45 -0700 Subject: [PATCH] swsusp: Use suspend_console Add suspend_console() and resume_console() to the suspend-to-disk code paths so that the users of netconsole can use swsusp with it. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 8 ++++++++ kernel/power/user.c | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d72234942798..d3a158a60312 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "power.h" @@ -119,8 +120,10 @@ int pm_suspend_disk(void) if (error) return error; + suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) { + resume_console(); printk("Some devices failed to suspend\n"); unprepare_processes(); return error; @@ -133,6 +136,7 @@ int pm_suspend_disk(void) if (in_suspend) { device_resume(); + resume_console(); pr_debug("PM: writing image.\n"); error = swsusp_write(); if (!error) @@ -148,6 +152,7 @@ int pm_suspend_disk(void) swsusp_free(); Done: device_resume(); + resume_console(); unprepare_processes(); return error; } @@ -212,7 +217,9 @@ static int software_resume(void) pr_debug("PM: Preparing devices for restore.\n"); + suspend_console(); if ((error = device_suspend(PMSG_PRETHAW))) { + resume_console(); printk("Some devices failed to suspend\n"); swsusp_free(); goto Thaw; @@ -224,6 +231,7 @@ static int software_resume(void) swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); device_resume(); + resume_console(); Thaw: unprepare_processes(); Done: diff --git a/kernel/power/user.c b/kernel/power/user.c index 93b5dd283dea..d991d3b0e5a4 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -173,12 +174,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); if (!error) { + suspend_console(); error = device_suspend(PMSG_FREEZE); if (!error) { in_suspend = 1; error = swsusp_suspend(); device_resume(); } + resume_console(); } up(&pm_sem); if (!error) @@ -196,11 +199,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, snapshot_free_unused_memory(&data->handle); down(&pm_sem); pm_prepare_console(); + suspend_console(); error = device_suspend(PMSG_PRETHAW); if (!error) { error = swsusp_resume(); device_resume(); } + resume_console(); pm_restore_console(); up(&pm_sem); break; @@ -289,6 +294,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, } /* Put devices to sleep */ + suspend_console(); error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "Failed to suspend some devices.\n"); @@ -299,7 +305,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, /* Wake up devices */ device_resume(); } - + resume_console(); if (pm_ops->finish) pm_ops->finish(PM_SUSPEND_MEM); -- cgit v1.2.3 From 469340236a7c9673df3e6a2425f559517f01464e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 11 Oct 2006 01:21:26 -0700 Subject: [PATCH] mm: kevent threads: use MPOL_DEFAULT Switch the memory policy of the kevent threads to MPOL_DEFAULT while leaving the kzalloc of the workqueue structure on interleave. This means that all code executed in the context of the kevent thread is allocating node local. Signed-off-by: Christoph Lameter Cc: Christoph Lameter Cc: Alok Kataria Cc: Andi Kleen Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cfc737bffe6d..3df9bfc7ff78 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * The per-CPU workqueue (if single thread, we always use the first @@ -245,6 +246,12 @@ static int worker_thread(void *__cwq) sigprocmask(SIG_BLOCK, &blocked, NULL); flush_signals(current); + /* + * We inherited MPOL_INTERLEAVE from the booting kernel. + * Set MPOL_DEFAULT to insure node local allocations. + */ + numa_default_policy(); + /* SIG_IGN makes children autoreap: see do_notify_parent(). */ sa.sa.sa_handler = SIG_IGN; sa.sa.sa_flags = 0; -- cgit v1.2.3 From fa3ba2e81ea23416272a22009bba95954c81969c Mon Sep 17 00:00:00 2001 From: Florin Malita Date: Wed, 11 Oct 2006 01:21:48 -0700 Subject: [PATCH] fix Module taint flags listing in Oops/panic Module taint flags listing in Oops/panic has a couple of issues: * taint_flags() doesn't null-terminate the buffer after printing the flags * per-module taints are only set if the kernel is not already tainted (with that particular flag) => only the first offending module gets its taint info correctly updated Some additional changes: * 'license_gplok' is no longer needed - equivalent to !(taints & TAINT_PROPRIETARY_MODULE) - so we can drop it from struct module * exporting module taint info via /proc/module: pwc 88576 0 - Live 0xf8c32000 evilmod 6784 1 pwc, Live 0xf8bbf000 (PF) Signed-off-by: Florin Malita Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 94 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 7f60e782de1e..67009bd56c52 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -87,6 +87,12 @@ static inline int strong_try_module_get(struct module *mod) return try_module_get(mod); } +static inline void add_taint_module(struct module *mod, unsigned flag) +{ + add_taint(flag); + mod->taints |= flag; +} + /* A thread that wants to hold a reference to a module only while it * is running can call ths to safely exit. * nfsd and lockd use this. @@ -847,12 +853,10 @@ static int check_version(Elf_Shdr *sechdrs, return 0; } /* Not in module's version table. OK, but that taints the kernel. */ - if (!(tainted & TAINT_FORCED_MODULE)) { + if (!(tainted & TAINT_FORCED_MODULE)) printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); - add_taint(TAINT_FORCED_MODULE); - mod->taints |= TAINT_FORCED_MODULE; - } + add_taint_module(mod, TAINT_FORCED_MODULE); return 1; } @@ -910,7 +914,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, unsigned long ret; const unsigned long *crc; - ret = __find_symbol(name, &owner, &crc, mod->license_gplok); + ret = __find_symbol(name, &owner, &crc, + !(mod->taints & TAINT_PROPRIETARY_MODULE)); if (ret) { /* use_module can fail due to OOM, or module unloading */ if (!check_version(sechdrs, versindex, name, mod, crc) || @@ -1335,12 +1340,11 @@ static void set_license(struct module *mod, const char *license) if (!license) license = "unspecified"; - mod->license_gplok = license_is_gpl_compatible(license); - if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { - printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", - mod->name, license); - add_taint(TAINT_PROPRIETARY_MODULE); - mod->taints |= TAINT_PROPRIETARY_MODULE; + if (!license_is_gpl_compatible(license)) { + if (!(tainted & TAINT_PROPRIETARY_MODULE)) + printk(KERN_WARNING "%s: module license '%s' taints" + "kernel.\n", mod->name, license); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); } } @@ -1619,8 +1623,7 @@ static struct module *load_module(void __user *umod, modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { - add_taint(TAINT_FORCED_MODULE); - mod->taints |= TAINT_FORCED_MODULE; + add_taint_module(mod, TAINT_FORCED_MODULE); printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", mod->name); } else if (!same_magic(modmagic, vermagic)) { @@ -1714,14 +1717,10 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); - if (strcmp(mod->name, "ndiswrapper") == 0) { - add_taint(TAINT_PROPRIETARY_MODULE); - mod->taints |= TAINT_PROPRIETARY_MODULE; - } - if (strcmp(mod->name, "driverloader") == 0) { - add_taint(TAINT_PROPRIETARY_MODULE); - mod->taints |= TAINT_PROPRIETARY_MODULE; - } + if (strcmp(mod->name, "ndiswrapper") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + if (strcmp(mod->name, "driverloader") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); @@ -1766,8 +1765,7 @@ static struct module *load_module(void __user *umod, (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); - add_taint(TAINT_FORCED_MODULE); - mod->taints |= TAINT_FORCED_MODULE; + add_taint_module(mod, TAINT_FORCED_MODULE); } #endif @@ -2132,9 +2130,33 @@ static void m_stop(struct seq_file *m, void *p) mutex_unlock(&module_mutex); } +static char *taint_flags(unsigned int taints, char *buf) +{ + int bx = 0; + + if (taints) { + buf[bx++] = '('; + if (taints & TAINT_PROPRIETARY_MODULE) + buf[bx++] = 'P'; + if (taints & TAINT_FORCED_MODULE) + buf[bx++] = 'F'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + buf[bx++] = ')'; + } + buf[bx] = '\0'; + + return buf; +} + static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); + char buf[8]; + seq_printf(m, "%s %lu", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); @@ -2147,6 +2169,10 @@ static int m_show(struct seq_file *m, void *p) /* Used by oprofile and other similar tools. */ seq_printf(m, " 0x%p", mod->module_core); + /* Taints info */ + if (mod->taints) + seq_printf(m, " %s", taint_flags(mod->taints, buf)); + seq_printf(m, "\n"); return 0; } @@ -2235,28 +2261,6 @@ struct module *module_text_address(unsigned long addr) return mod; } -static char *taint_flags(unsigned int taints, char *buf) -{ - *buf = '\0'; - if (taints) { - int bx; - - buf[0] = '('; - bx = 1; - if (taints & TAINT_PROPRIETARY_MODULE) - buf[bx++] = 'P'; - if (taints & TAINT_FORCED_MODULE) - buf[bx++] = 'F'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - buf[bx] = ')'; - } - return buf; -} - /* Don't grab lock, we're oopsing. */ void print_modules(void) { -- cgit v1.2.3 From beed33a816204cb402c69266475b6a60a2433ceb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 11 Oct 2006 01:21:52 -0700 Subject: [PATCH] sched: likely profiling This likely profiling is pretty fun. I found a few possible problems in sched.c. This patch may be not measurable, but when I did measure long ago, nooping (un)likely cost a couple of % on scheduler heavy benchmarks, so it all adds up. Tweak some branch hints: - the 2nd 64 bits in the bitmask is likely to be populated, because it contains the first 28 bits (nearly 3/4) of the normal priorities. (ratio of 669669:691 ~= 1000:1). - it isn't unlikely that context switching switches to another process. it might be very rapidly switching to and from the idle process (ratio of 475815:419004 and 471330:423544). Let the branch predictor decide. - preempt_enable seems to be very often called in a nested preempt_disable or with interrupts disabled (ratio of 3567760:87965 ~= 40:1) Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Cc: Daniel Walker Cc: Hua Zhong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 53608a59d6e3..094b5687eef6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1822,14 +1822,14 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; - if (unlikely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; @@ -3491,7 +3491,7 @@ asmlinkage void __sched preempt_schedule(void) * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ - if (unlikely(ti->preempt_count || irqs_disabled())) + if (likely(ti->preempt_count || irqs_disabled())) return; need_resched: -- cgit v1.2.3 From 01a3ee2b203e511e20f98b85a9172fd32c53e87c Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 11 Oct 2006 01:21:55 -0700 Subject: [PATCH] bitmap: parse input from kernel and user buffers lib/bitmap.c:bitmap_parse() is a library function that received as input a user buffer. This seemed to have originated from the way the write_proc function of the /proc filesystem operates. This has been reworked to not use kmalloc and eliminates a lot of get_user() overhead by performing one access_ok before using __get_user(). We need to test if we are in kernel or user space (is_user) and access the buffer differently. We cannot use __get_user() to access kernel addresses in all cases, for example in architectures with separate address space for kernel and user. This function will be useful for other uses as well; for example, taking input for /sysfs instead of /proc, so it was changed to accept kernel buffers. We have this use for the Linux UWB project, as part as the upcoming bandwidth allocator code. Only a few routines used this function and they were changed too. Signed-off-by: Reinette Chatre Signed-off-by: Inaky Perez-Gonzalez Cc: Paul Jackson Cc: Joe Korty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/proc.c | 2 +- kernel/profile.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 607c7809ad01..9a352667007c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -57,7 +57,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) return -EIO; - err = cpumask_parse(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, new_value); if (err) return err; diff --git a/kernel/profile.c b/kernel/profile.c index 857300a2afec..f940b462eec9 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -399,7 +399,7 @@ static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffe unsigned long full_count = count, err; cpumask_t new_value; - err = cpumask_parse(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, new_value); if (err) return err; -- cgit v1.2.3 From 3dc3099a9b2c346b16383597fadaa79a05a52388 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 11 Oct 2006 01:22:06 -0700 Subject: [PATCH] lockdep: use BUILD_BUG_ON Signed-off-by: Alexey Dobriyan Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 4c0553461000..805a322a5655 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1114,8 +1114,6 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } -extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void); - /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1153,8 +1151,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * (or spin_lock_init()) call - which acts as the key. For static * locks we use the lock object itself as the key. */ - if (sizeof(struct lock_class_key) > sizeof(struct lock_class)) - __error_too_big_MAX_LOCKDEP_SUBCLASSES(); + BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); key = lock->key->subkeys + subclass; -- cgit v1.2.3 From 256a6b41365e17cebe5c2fc91ddff716c9aa055a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 11 Oct 2006 01:22:08 -0700 Subject: [PATCH] lockdep: fix printk recursion logic Bug reported and fixed by Tilman Schmidt : if lockdep is enabled then log messages make it to /var/log/messages belatedly. The reason is a missed wakeup of klogd. Initially there was only a lockdep_internal() protection against lockdep recursion within vprintk() - it grew the 'outer' lockdep_off()/on() protection only later on. But that lockdep_off() made the release_console_sem() within vprintk() always happen under the lockdep_internal() condition, causing the bug. The right solution to remove the inner protection against recursion here - the outer one is enough. Signed-off-by: Ingo Molnar Cc: Tilman Schmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 771f5e861bcd..f7d427ef5038 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -820,15 +820,8 @@ void release_console_sem(void) console_locked = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); - if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) { - /* - * If we printk from within the lock dependency code, - * from within the scheduler code, then do not lock - * up due to self-recursion: - */ - if (!lockdep_internal()) - wake_up_interruptible(&log_wait); - } + if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) + wake_up_interruptible(&log_wait); } EXPORT_SYMBOL(release_console_sem); -- cgit v1.2.3 From 39af114377bf80d2a88e47be33d578d1fa9b0775 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Oct 2006 09:01:46 -0700 Subject: [PATCH] fix epoll_pwait when EPOLL=n Fixes http://bugzilla.kernel.org/show_bug.cgi?id=7371 sys_epoll_pwait needs to be listed as a conditional (weak) entry point for CONFIG_EPOLL=n. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7a3b2e75f040..0e53314b14de 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -49,6 +49,7 @@ cond_syscall(compat_sys_get_robust_list); cond_syscall(sys_epoll_create); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); +cond_syscall(sys_epoll_pwait); cond_syscall(sys_semget); cond_syscall(sys_semop); cond_syscall(sys_semtimedop); -- cgit v1.2.3 From ca268c691de95612981b93e58899c1d73fdb6b47 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Oct 2006 00:09:28 -0700 Subject: [PATCH] lockdep: increase max allowed recursion depth In general, lockdep warnings are intended to be non-fatal, so I have put in various practical limits on internal data structure failure modes. We haven't had a /single/ lockdep-internal crash ever since lockdep went upstream [the unwinder crashes are outside of lockdep], and that's largely due to the good internal checks it does. Recursion within the dependency graph is currently limited to 20, that's probably not enough on some many-CPU boxes - this patch doubles it to 40. I have written the lockdep functions to have as small stackframes as possible, so 40 should be OK too. (The practical recursion limit should be somewhere between 100 and 200 entries. If we hit that then I'll change the algorithm to be iteration-based. Graph walking logic is so easy to program via recursion, so i'd like to keep recursion as long as possible.) Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 805a322a5655..d1a3b2cfe4a9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -575,6 +575,8 @@ static noinline int print_circular_bug_tail(void) return 0; } +#define RECURSION_LIMIT 40 + static int noinline print_infinite_recursion_bug(void) { __raw_spin_unlock(&hash_lock); @@ -595,7 +597,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) debug_atomic_inc(&nr_cyclic_check_recursions); if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); /* * Check this lock's dependency list: @@ -645,7 +647,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); debug_atomic_inc(&nr_find_usage_forwards_checks); @@ -684,7 +686,7 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); debug_atomic_inc(&nr_find_usage_backwards_checks); -- cgit v1.2.3 From 3f4a0b917ce72ef47e438d354c433eb645218e87 Mon Sep 17 00:00:00 2001 From: john stultz Date: Tue, 17 Oct 2006 00:09:32 -0700 Subject: [PATCH] i386 Time: Avoid PIT SMP lockups Avoid possible PIT livelock issues seen on SMP systems (and reported by Andi), by not allowing it as a clocksource on SMP boxes. However, since the PIT may no longer be present, we have to properly handle the cases where SMP systems have TSC skew and fall back from the TSC. Since the PIT isn't there, it would "fall back" to the TSC again. So this changes the jiffies rating to 1, and the TSC-bad rating value to 0. Thus you will get the following behavior priority on i386 systems: tsc [if present & stable] hpet [if present] cyclone [if present] acpi_pm [if present] pit [if UP] jiffies Rather then the current more complicated: tsc [if present & stable] hpet [if present] cyclone [if present] acpi_pm [if present] pit [if cpus < 4] tsc [if present & unstable] jiffies Signed-off-by: John Stultz Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/jiffies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 126bb30c4afe..a99b2a6e6a07 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -57,7 +57,7 @@ static cycle_t jiffies_read(void) struct clocksource clocksource_jiffies = { .name = "jiffies", - .rating = 0, /* lowest rating*/ + .rating = 1, /* lowest valid rating*/ .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ -- cgit v1.2.3 From ac08c26492a0ad4d94a25bd47d5630cd38337069 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Oct 2006 00:09:39 -0700 Subject: [PATCH] posix-cpu-timers: prevent signal delivery starvation The integer divisions in the timer accounting code can round the result down to 0. Adding 0 is without effect and the signal delivery stops. Clamp the division result to minimum 1 to avoid this. Problem was reported by Seongbae Park , who provided also an inital patch. Roland sayeth: I have had some more time to think about the problem, and to reproduce it using Toyo's test case. For the record, if my understanding of the problem is correct, this happens only in one very particular case. First, the expiry time has to be so soon that in cputime_t units (usually 1s/HZ ticks) it's < nthreads so the division yields zero. Second, it only affects each thread that is so new that its CPU time accumulation is zero so now+0 is still zero and ->it_*_expires winds up staying zero. For the VIRT and PROF clocks when cputime_t is tick granularity (or the SCHED clock on configurations where sched_clock's value only advances on clock ticks), this is not hard to arrange with new threads starting up and blocking before they accumulate a whole tick of CPU time. That's what happens in Toyo's test case. Note that in general it is fine for that division to round down to zero, and set each thread's expiry time to its "now" time. The problem only arises with thread's whose "now" value is still zero, so that now+0 winds up 0 and is interpreted as "not set" instead of ">= now". So it would be a sufficient and more precise fix to just use max(ticks, 1) inside the loop when setting each it_*_expires value. But, it does no harm to round the division up to one and always advance every thread's expiry time. If the thread didn't already fire timers for the expiry time of "now", there is no expectation that it will do so before the next tick anyway. So I followed Thomas's patch in lifting the max out of the loops. This patch also covers the reload cases, which are harder to write a test for (and I didn't try). I've tested it with Toyo's case and it fixes that. [toyoa@mvista.com: fix: min_t -> max_t] Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Roland McGrath Cc: Daniel Walker Cc: Toyo Abe Cc: john stultz Cc: Roman Zippel Cc: Seongbae Park Cc: Peter Mattis Cc: Rohit Seth Cc: Martin Bligh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 479b16b44f79..7c3e1e6dfb5b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -87,6 +87,19 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, return a; } +/* + * Divide and limit the result to res >= 1 + * + * This is necessary to prevent signal delivery starvation, when the result of + * the division would be rounded down to 0. + */ +static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) +{ + cputime_t res = cputime_div(time, div); + + return max_t(cputime_t, res, 1); +} + /* * Update expiry time from increment, and increase overrun count, * given the current clock sample. @@ -483,8 +496,8 @@ static void process_timer_rebalance(struct task_struct *p, BUG(); break; case CPUCLOCK_PROF: - left = cputime_div(cputime_sub(expires.cpu, val.cpu), - nthreads); + left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(prof_ticks(t), left); @@ -498,8 +511,8 @@ static void process_timer_rebalance(struct task_struct *p, } while (t != p); break; case CPUCLOCK_VIRT: - left = cputime_div(cputime_sub(expires.cpu, val.cpu), - nthreads); + left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(virt_ticks(t), left); @@ -515,6 +528,7 @@ static void process_timer_rebalance(struct task_struct *p, case CPUCLOCK_SCHED: nsleft = expires.sched - val.sched; do_div(nsleft, nthreads); + nsleft = max_t(unsigned long long, nsleft, 1); do { if (likely(!(t->flags & PF_EXITING))) { ns = t->sched_time + nsleft; @@ -1159,12 +1173,13 @@ static void check_process_timers(struct task_struct *tsk, prof_left = cputime_sub(prof_expires, utime); prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div(prof_left, nthreads); + prof_left = cputime_div_non_zero(prof_left, nthreads); virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div(virt_left, nthreads); + virt_left = cputime_div_non_zero(virt_left, nthreads); if (sched_expires) { sched_left = sched_expires - sched_time; do_div(sched_left, nthreads); + sched_left = max_t(unsigned long long, sched_left, 1); } else { sched_left = 0; } -- cgit v1.2.3 From c60099bfe3a5e6fa22a930627689b3769c52153f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 17 Oct 2006 00:09:59 -0700 Subject: [PATCH] swsusp: fix memory leaks My fancy new swsusp IO code had a big memory leak. It's somewhat invisible because the whole mem_map[] gets overwritten after resume, but it can cause us to get low on memory during the actual suspend process. Cc: "Rafael J. Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 9b2ee5344dee..1a3b0dd2c3fc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -425,7 +425,8 @@ static int submit(int rw, pgoff_t page_off, struct page *page, bio_set_pages_dirty(bio); bio_put(bio); } else { - get_page(page); + if (rw == READ) + get_page(page); /* These pages are freed later */ bio->bi_private = *bio_chain; *bio_chain = bio; submit_bio(rw | (1 << BIO_RW_SYNC), bio); -- cgit v1.2.3 From a460e745e8f9c75a0525ff94154a0629f9d3e05d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Oct 2006 00:10:03 -0700 Subject: [PATCH] genirq: clean up irq-flow-type naming Introduce desc->name and eliminate the handle_irq_name() hack. Add set_irq_chip_and_handler_name() to set the flow type and name at once. Signed-off-by: Ingo Molnar Acked-by: Thomas Gleixner Cc: "Eric W. Biederman" Cc: Matthew Wilcox Cc: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 11c99697acfe..2d0dc3efe813 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -499,7 +499,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) #endif /* CONFIG_SMP */ void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained) +__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) { struct irq_desc *desc; unsigned long flags; @@ -540,6 +541,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained) desc->depth = 1; } desc->handle_irq = handle; + desc->name = name; if (handle != handle_bad_irq && is_chained) { desc->status &= ~IRQ_DISABLED; @@ -555,30 +557,13 @@ set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle) { set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0); + __set_irq_handler(irq, handle, 0, NULL); } -/* - * Get a descriptive string for the highlevel handler, for - * /proc/interrupts output: - */ -const char * -handle_irq_name(irq_flow_handler_t handle) +void +set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle, const char *name) { - if (handle == handle_level_irq) - return "level "; - if (handle == handle_fasteoi_irq) - return "fasteoi"; - if (handle == handle_edge_irq) - return "edge "; - if (handle == handle_simple_irq) - return "simple "; -#ifdef CONFIG_SMP - if (handle == handle_percpu_irq) - return "percpu "; -#endif - if (handle == handle_bad_irq) - return "bad "; - - return NULL; + set_irq_chip(irq, chip); + __set_irq_handler(irq, handle, 0, name); } -- cgit v1.2.3 From bea493a031fe3337f4fe5479e8e865513828ea76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Oct 2006 00:10:33 -0700 Subject: [PATCH] rt-mutex: fixup rt-mutex debug code BUG: warning at kernel/rtmutex-debug.c:125/rt_mutex_debug_task_free() (Not tainted) [] show_trace_log_lvl+0x58/0x16a [] show_trace+0xd/0x10 [] dump_stack+0x19/0x1b [] rt_mutex_debug_task_free+0x35/0x6a [] free_task+0x15/0x24 [] copy_process+0x12bd/0x1324 [] do_fork+0x42/0x113 [] sys_fork+0x19/0x1b [] syscall_call+0x7/0xb In copy_process(), dup_task_struct() also duplicates the ->pi_lock, ->pi_waiters and ->pi_blocked_on members. rt_mutex_debug_task_free() called from free_task() validates these members. However free_task() can be invoked before these members are reset for the new task. Move the initialization code before the first bail that can hit free_task(). Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7dc6140baac6..29ebb30850ed 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -984,6 +984,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!p) goto fork_out; + rt_mutex_init_task(p); + #ifdef CONFIG_TRACE_IRQFLAGS DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); @@ -1088,8 +1090,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->lockdep_recursion = 0; #endif - rt_mutex_init_task(p); - #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -- cgit v1.2.3 From bd5349cfd2b9bbb10a3dbcd3fe5cbaabe0b2ab9e Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 17 Oct 2006 00:10:35 -0700 Subject: [PATCH] Convert cpu hotplug notifiers to use raw_notifier instead of blocking_notifier The use of blocking notifier by _cpu_up and _cpu_down in cpu.c has two problem. 1/ An interaction with the workqueue notifier causes lockdep to spit a warning. 2/ A notifier could conceivable be added or removed while _cpu_up or _cpu_down are in process. As each notifier is called twice (prepare then commit/abort) this could be unhealthy. To fix to we simply take cpu_add_remove_lock while adding or removing notifiers to/from the list. This makes the 'blocking' usage unnecessary as all accesses to cpu_chain are now protected by cpu_add_remove_lock. So change "blocking" to "raw" in all relevant places. This fixes 1. Credit: Andrew Morton Cc: Rusty Russell Cc: Michal Piotrowski (reporter) Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 32c96628463e..27dd3ee47099 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -19,7 +19,7 @@ static DEFINE_MUTEX(cpu_add_remove_lock); static DEFINE_MUTEX(cpu_bitmask_lock); -static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); +static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock @@ -68,7 +68,11 @@ EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); /* Need to know about CPUs going up/down? */ int __cpuinit register_cpu_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_register(&cpu_chain, nb); + int ret; + mutex_lock(&cpu_add_remove_lock); + ret = raw_notifier_chain_register(&cpu_chain, nb); + mutex_unlock(&cpu_add_remove_lock); + return ret; } #ifdef CONFIG_HOTPLUG_CPU @@ -77,7 +81,9 @@ EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { - blocking_notifier_chain_unregister(&cpu_chain, nb); + mutex_lock(&cpu_add_remove_lock); + raw_notifier_chain_unregister(&cpu_chain, nb); + mutex_unlock(&cpu_add_remove_lock); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -126,7 +132,7 @@ static int _cpu_down(unsigned int cpu) if (!cpu_online(cpu)) return -EINVAL; - err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, (void *)(long)cpu); if (err == NOTIFY_BAD) { printk("%s: attempt to take down CPU %u failed\n", @@ -146,7 +152,7 @@ static int _cpu_down(unsigned int cpu) if (IS_ERR(p)) { /* CPU didn't die: tell everyone. Can't complain. */ - if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, + if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, (void *)(long)cpu) == NOTIFY_BAD) BUG(); @@ -169,7 +175,7 @@ static int _cpu_down(unsigned int cpu) put_cpu(); /* CPU is completely dead: tell everyone. Too late to complain. */ - if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD, + if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) == NOTIFY_BAD) BUG(); @@ -206,7 +212,7 @@ static int __devinit _cpu_up(unsigned int cpu) if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; - ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); @@ -223,11 +229,11 @@ static int __devinit _cpu_up(unsigned int cpu) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. */ - blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); + raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); out_notify: if (ret != 0) - blocking_notifier_call_chain(&cpu_chain, + raw_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); return ret; -- cgit v1.2.3 From 91fcdd4e0314145d7d4fa52dba2f9c2da25346fd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 19 Oct 2006 23:28:29 -0700 Subject: [PATCH] readjust comments of task_timeslice for kernel doc Signed-off-by: Borislav Petkov Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 094b5687eef6..3399701c680e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -160,15 +160,6 @@ #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) -/* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - */ - #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) @@ -180,6 +171,15 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } +/* + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + */ + static inline unsigned int task_timeslice(struct task_struct *p) { return static_prio_timeslice(p->static_prio); -- cgit v1.2.3 From d6f8ff7381501887233666b508b9eac70143303d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 19 Oct 2006 23:28:34 -0700 Subject: [PATCH] cad_pid sysctl with PROC_FS=n If CONFIG_PROC_FS=n: kernel/sysctl.c:148: warning: 'proc_do_cad_pid' used but never defined kernel/built-in.o:(.data+0x1228): undefined reference to `proc_do_cad_pid' make: *** [.tmp_vmlinux1] Error 1 Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8020fb273c4f..8bff2c18fb5a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,8 +136,10 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif static ctl_table root_table[]; static struct ctl_table_header root_table_header = @@ -542,6 +544,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_PROC_SYSCTL { .ctl_name = KERN_CADPID, .procname = "cad_pid", @@ -550,6 +553,7 @@ static ctl_table kern_table[] = { .mode = 0600, .proc_handler = &proc_do_cad_pid, }, +#endif { .ctl_name = KERN_MAX_THREADS, .procname = "threads-max", -- cgit v1.2.3 From e05d722e4555cd54677b4c8431d9e81fd047ef7a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 19 Oct 2006 23:29:12 -0700 Subject: [PATCH] kernel/nsproxy.c: use kmemdup() Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 6ebdb82a0ce4..674aceb7335a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -44,11 +44,9 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) { struct nsproxy *ns; - ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL); - if (ns) { - memcpy(ns, orig, sizeof(struct nsproxy)); + ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); + if (ns) atomic_set(&ns->count, 1); - } return ns; } -- cgit v1.2.3 From 690a973f48b6ba2954465992c08e65059c8374fe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH] x86-64: Speed up dwarf2 unwinder This changes the dwarf2 unwinder to do a binary search for CIEs instead of a linear work. The linker is unfortunately not able to build a proper lookup table at link time, instead it creates one at runtime as soon as the bootmem allocator is usable (so you'll continue using the linear lookup for the first [hopefully] few calls). The code should be ready to utilize a build-time created table once a fixed linker becomes available. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- kernel/unwind.c | 318 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 279 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index 2e2368607aab..f7e50d16dbf6 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -11,13 +11,15 @@ #include #include -#include +#include +#include #include #include #include #include extern char __start_unwind[], __end_unwind[]; +extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; #define MAX_STACK_DEPTH 8 @@ -100,6 +102,8 @@ static struct unwind_table { } core, init; const void *address; unsigned long size; + const unsigned char *header; + unsigned long hdrsz; struct unwind_table *link; const char *name; } root_table; @@ -145,6 +149,10 @@ static struct unwind_table *find_table(unsigned long pc) return table; } +static unsigned long read_pointer(const u8 **pLoc, + const void *end, + signed ptrType); + static void init_unwind_table(struct unwind_table *table, const char *name, const void *core_start, @@ -152,14 +160,30 @@ static void init_unwind_table(struct unwind_table *table, const void *init_start, unsigned long init_size, const void *table_start, - unsigned long table_size) + unsigned long table_size, + const u8 *header_start, + unsigned long header_size) { + const u8 *ptr = header_start + 4; + const u8 *end = header_start + header_size; + table->core.pc = (unsigned long)core_start; table->core.range = core_size; table->init.pc = (unsigned long)init_start; table->init.range = init_size; table->address = table_start; table->size = table_size; + /* See if the linker provided table looks valid. */ + if (header_size <= 4 + || header_start[0] != 1 + || (void *)read_pointer(&ptr, end, header_start[1]) != table_start + || header_start[2] == DW_EH_PE_omit + || read_pointer(&ptr, end, header_start[2]) <= 0 + || header_start[3] == DW_EH_PE_omit) + header_start = NULL; + table->hdrsz = header_size; + smp_wmb(); + table->header = header_start; table->link = NULL; table->name = name; } @@ -169,7 +193,143 @@ void __init unwind_init(void) init_unwind_table(&root_table, "kernel", _text, _end - _text, NULL, 0, - __start_unwind, __end_unwind - __start_unwind); + __start_unwind, __end_unwind - __start_unwind, + __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr); +} + +static const u32 bad_cie, not_fde; +static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *); +static signed fde_pointer_type(const u32 *cie); + +struct eh_frame_hdr_table_entry { + unsigned long start, fde; +}; + +static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2) +{ + const struct eh_frame_hdr_table_entry *e1 = p1; + const struct eh_frame_hdr_table_entry *e2 = p2; + + return (e1->start > e2->start) - (e1->start < e2->start); +} + +static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size) +{ + struct eh_frame_hdr_table_entry *e1 = p1; + struct eh_frame_hdr_table_entry *e2 = p2; + unsigned long v; + + v = e1->start; + e1->start = e2->start; + e2->start = v; + v = e1->fde; + e1->fde = e2->fde; + e2->fde = v; +} + +static void __init setup_unwind_table(struct unwind_table *table, + void *(*alloc)(unsigned long)) +{ + const u8 *ptr; + unsigned long tableSize = table->size, hdrSize; + unsigned n; + const u32 *fde; + struct { + u8 version; + u8 eh_frame_ptr_enc; + u8 fde_count_enc; + u8 table_enc; + unsigned long eh_frame_ptr; + unsigned int fde_count; + struct eh_frame_hdr_table_entry table[]; + } __attribute__((__packed__)) *header; + + if (table->header) + return; + + if (table->hdrsz) + printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n", + table->name); + + if (tableSize & (sizeof(*fde) - 1)) + return; + + for (fde = table->address, n = 0; + tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { + const u32 *cie = cie_for_fde(fde, table); + signed ptrType; + + if (cie == ¬_fde) + continue; + if (cie == NULL + || cie == &bad_cie + || (ptrType = fde_pointer_type(cie)) < 0) + return; + ptr = (const u8 *)(fde + 2); + if (!read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType)) + return; + ++n; + } + + if (tableSize || !n) + return; + + hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) + + 2 * n * sizeof(unsigned long); + header = alloc(hdrSize); + if (!header) + return; + header->version = 1; + header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native; + header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4; + header->table_enc = DW_EH_PE_abs|DW_EH_PE_native; + put_unaligned((unsigned long)table->address, &header->eh_frame_ptr); + BUILD_BUG_ON(offsetof(typeof(*header), fde_count) + % __alignof(typeof(header->fde_count))); + header->fde_count = n; + + BUILD_BUG_ON(offsetof(typeof(*header), table) + % __alignof(typeof(*header->table))); + for (fde = table->address, tableSize = table->size, n = 0; + tableSize; + tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { + const u32 *cie = fde + 1 - fde[1] / sizeof(*fde); + + if (!fde[1]) + continue; /* this is a CIE */ + ptr = (const u8 *)(fde + 2); + header->table[n].start = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + fde_pointer_type(cie)); + header->table[n].fde = (unsigned long)fde; + ++n; + } + WARN_ON(n != header->fde_count); + + sort(header->table, + n, + sizeof(*header->table), + cmp_eh_frame_hdr_table_entries, + swap_eh_frame_hdr_table_entries); + + table->hdrsz = hdrSize; + smp_wmb(); + table->header = (const void *)header; +} + +static void *__init balloc(unsigned long sz) +{ + return __alloc_bootmem_nopanic(sz, + sizeof(unsigned int), + __pa(MAX_DMA_ADDRESS)); +} + +void __init unwind_setup(void) +{ + setup_unwind_table(&root_table, balloc); } #ifdef CONFIG_MODULES @@ -193,7 +353,8 @@ void *unwind_add_table(struct module *module, init_unwind_table(table, module->name, module->module_core, module->core_size, module->module_init, module->init_size, - table_start, table_size); + table_start, table_size, + NULL, 0); if (last_table) last_table->link = table; @@ -303,6 +464,26 @@ static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) return value; } +static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table) +{ + const u32 *cie; + + if (!*fde || (*fde & (sizeof(*fde) - 1))) + return &bad_cie; + if (!fde[1]) + return ¬_fde; /* this is a CIE */ + if ((fde[1] & (sizeof(*fde) - 1)) + || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address) + return NULL; /* this is not a valid FDE */ + cie = fde + 1 - fde[1] / sizeof(*fde); + if (*cie <= sizeof(*cie) + 4 + || *cie >= fde[1] - sizeof(*fde) + || (*cie & (sizeof(*cie) - 1)) + || cie[1]) + return NULL; /* this is not a (valid) CIE */ + return cie; +} + static unsigned long read_pointer(const u8 **pLoc, const void *end, signed ptrType) @@ -610,49 +791,108 @@ int unwind(struct unwind_frame_info *frame) unsigned i; signed ptrType = -1; uleb128_t retAddrReg = 0; - struct unwind_table *table; + const struct unwind_table *table; struct unwind_state state; if (UNW_PC(frame) == 0) return -EINVAL; if ((table = find_table(pc)) != NULL && !(table->size & (sizeof(*fde) - 1))) { - unsigned long tableSize = table->size; - - for (fde = table->address; - tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; - tableSize -= sizeof(*fde) + *fde, - fde += 1 + *fde / sizeof(*fde)) { - if (!*fde || (*fde & (sizeof(*fde) - 1))) - break; - if (!fde[1]) - continue; /* this is a CIE */ - if ((fde[1] & (sizeof(*fde) - 1)) - || fde[1] > (unsigned long)(fde + 1) - - (unsigned long)table->address) - continue; /* this is not a valid FDE */ - cie = fde + 1 - fde[1] / sizeof(*fde); - if (*cie <= sizeof(*cie) + 4 - || *cie >= fde[1] - sizeof(*fde) - || (*cie & (sizeof(*cie) - 1)) - || cie[1] - || (ptrType = fde_pointer_type(cie)) < 0) { - cie = NULL; /* this is not a (valid) CIE */ - continue; + const u8 *hdr = table->header; + unsigned long tableSize; + + smp_rmb(); + if (hdr && hdr[0] == 1) { + switch(hdr[3] & DW_EH_PE_FORM) { + case DW_EH_PE_native: tableSize = sizeof(unsigned long); break; + case DW_EH_PE_data2: tableSize = 2; break; + case DW_EH_PE_data4: tableSize = 4; break; + case DW_EH_PE_data8: tableSize = 8; break; + default: tableSize = 0; break; + } + ptr = hdr + 4; + end = hdr + table->hdrsz; + if (tableSize + && read_pointer(&ptr, end, hdr[1]) + == (unsigned long)table->address + && (i = read_pointer(&ptr, end, hdr[2])) > 0 + && i == (end - ptr) / (2 * tableSize) + && !((end - ptr) % (2 * tableSize))) { + do { + const u8 *cur = ptr + (i / 2) * (2 * tableSize); + + startLoc = read_pointer(&cur, + cur + tableSize, + hdr[3]); + if (pc < startLoc) + i /= 2; + else { + ptr = cur - tableSize; + i = (i + 1) / 2; + } + } while (startLoc && i > 1); + if (i == 1 + && (startLoc = read_pointer(&ptr, + ptr + tableSize, + hdr[3])) != 0 + && pc >= startLoc) + fde = (void *)read_pointer(&ptr, + ptr + tableSize, + hdr[3]); } + } + + if (fde != NULL) { + cie = cie_for_fde(fde, table); ptr = (const u8 *)(fde + 2); - startLoc = read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType); - endLoc = startLoc - + read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType & DW_EH_PE_indirect - ? ptrType - : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); - if (pc >= startLoc && pc < endLoc) - break; - cie = NULL; + if(cie != NULL + && cie != &bad_cie + && cie != ¬_fde + && (ptrType = fde_pointer_type(cie)) >= 0 + && read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType) == startLoc) { + if (!(ptrType & DW_EH_PE_indirect)) + ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if(pc >= endLoc) + fde = NULL; + } else + fde = NULL; + } + if (fde == NULL) { + for (fde = table->address, tableSize = table->size; + cie = NULL, tableSize > sizeof(*fde) + && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, + fde += 1 + *fde / sizeof(*fde)) { + cie = cie_for_fde(fde, table); + if (cie == &bad_cie) { + cie = NULL; + break; + } + if (cie == NULL + || cie == ¬_fde + || (ptrType = fde_pointer_type(cie)) < 0) + continue; + ptr = (const u8 *)(fde + 2); + startLoc = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if (!startLoc) + continue; + if (!(ptrType & DW_EH_PE_indirect)) + ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if (pc >= startLoc && pc < endLoc) + break; + } } } if (cie != NULL) { -- cgit v1.2.3 From 1d4d262769cd1894a0306b9c57e72f005cd9e75a Mon Sep 17 00:00:00 2001 From: Jan Dittmer Date: Sat, 28 Oct 2006 10:38:38 -0700 Subject: [PATCH] Add missing space in module.c for taintskernel Obvious fix. Signed-off-by: Jan Dittmer Acked-by: Florin Malita Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 67009bd56c52..5072a943fe35 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1342,7 +1342,7 @@ static void set_license(struct module *mod, const char *license) if (!license_is_gpl_compatible(license)) { if (!(tainted & TAINT_PROPRIETARY_MODULE)) - printk(KERN_WARNING "%s: module license '%s' taints" + printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE); } -- cgit v1.2.3 From 5fa3839a64203b2ab727dcb37da9b2d7079fca28 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Sat, 28 Oct 2006 10:38:46 -0700 Subject: [PATCH] Constify compat_get_bitmap argument This means we can call it when the bitmap we want to fetch is declared const. Signed-off-by: Stephen Rothwell Cc: Christoph Lameter Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 75573e5d27b0..d4898aad6cfa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -678,7 +678,7 @@ int get_compat_sigevent(struct sigevent *event, ? -EFAULT : 0; } -long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, +long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size) { int i, j; -- cgit v1.2.3 From fca178c0c6e8d52a1875be36b070f30884ebfae9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:49 -0700 Subject: [PATCH] fill_tgid: fix task_struct leak and possible oops 1. fill_tgid() forgets to do put_task_struct(first). 2. release_task(first) can happen after fill_tgid() drops tasklist_lock, it is unsafe to dereference first->signal. This is a temporary fix, imho the locking should be reworked. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 5d6a8c54ee85..9aeee511a463 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -237,14 +237,17 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, } else get_task_struct(first); - /* Start with stats from dead tasks */ - spin_lock_irqsave(&first->signal->stats_lock, flags); - if (first->signal->stats) - memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->signal->stats_lock, flags); tsk = first; read_lock(&tasklist_lock); + /* Start with stats from dead tasks */ + if (first->signal) { + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + spin_unlock_irqrestore(&first->signal->stats_lock, flags); + } + do { if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) continue; @@ -264,7 +267,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, * Accounting subsytems can also add calls here to modify * fields of taskstats. */ - + put_task_struct(first); return 0; } -- cgit v1.2.3 From 05d5bcd60e8202e5c7b28cf61186043a4d612623 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:50 -0700 Subject: [PATCH] bacct_add_tsk: fix unsafe and wrong parent/group_leader dereference 1. ts = timespec_sub(uptime, current->group_leader->start_time); It is possible that current != tsk. Probably it was supposed to be 'tsk->group_leader->start_time. But why we are reading group_leader's start_time ? This accounting is per thread, not per procees, I changed this to 'tsk->start_time. Please corect me. 2. stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; tsk->parent never == NULL, and it is unsafe to dereference it. Both the task and it's parent may exit after the caller unlocks tasklist_lock, the memory could be unmapped (DEBUG_SLAB). (And we should use ->real_parent->tgid in fact). Q: I don't understand the 'if (thread_group_leader(tsk))' check. Why it is needed ? Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Acked-by: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index db443221ba5b..65a5036a3d95 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -36,7 +36,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) /* calculate task elapsed time in timespec */ do_posix_clock_monotonic_gettime(&uptime); - ts = timespec_sub(uptime, current->group_leader->start_time); + ts = timespec_sub(uptime, tsk->start_time); /* rebase elapsed time to usec */ ac_etime = timespec_to_ns(&ts); do_div(ac_etime, NSEC_PER_USEC); @@ -58,7 +58,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_uid = tsk->uid; stats->ac_gid = tsk->gid; stats->ac_pid = tsk->pid; - stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; + rcu_read_lock(); + stats->ac_ppid = pid_alive(tsk) ? + rcu_dereference(tsk->real_parent)->tgid : 0; + rcu_read_unlock(); stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; stats->ac_minflt = tsk->min_flt; -- cgit v1.2.3 From 093a8e8aecd77b2799934996a55a6838e1e2b8f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:51 -0700 Subject: [PATCH] taskstats_tgid_free: fix usage taskstats_tgid_free() is called on copy_process's error path. This is wrong. IF (clone_flags & CLONE_THREAD) We should not clear ->signal->taskstats, current uses it, it probably has a valid accumulated info. ELSE taskstats_tgid_init() set ->signal->taskstats = NULL, there is nothing to free. Move the callsite to __exit_signal(). We don't need any locking, entire thread group is exiting, nobody should have a reference to soon to be released ->signal. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + kernel/fork.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f250a5e3e281..06de6c4e8ca3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) flush_sigqueue(&tsk->pending); if (sig) { flush_sigqueue(&sig->shared_pending); + taskstats_tgid_free(sig); __cleanup_signal(sig); } } diff --git a/kernel/fork.c b/kernel/fork.c index 29ebb30850ed..213326609bac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -897,7 +897,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); - taskstats_tgid_free(sig); kmem_cache_free(signal_cachep, sig); } -- cgit v1.2.3 From b8534d7bd89df0cd41cd47bcd6733a05ea9a691a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:53 -0700 Subject: [PATCH] taskstats: kill ->taskstats_lock in favor of ->siglock signal_struct is (mostly) protected by ->sighand->siglock, I think we don't need ->taskstats_lock to protect ->stats. This also allows us to simplify the locking in fill_tgid(). Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/taskstats.c | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 213326609bac..3da978eec791 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -830,7 +830,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); - taskstats_tgid_alloc(current->signal); + taskstats_tgid_alloc(current); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 9aeee511a463..b2efda94615a 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -241,11 +241,11 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, tsk = first; read_lock(&tasklist_lock); /* Start with stats from dead tasks */ - if (first->signal) { - spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->sighand) { + spin_lock_irqsave(&first->sighand->siglock, flags); if (first->signal->stats) memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->signal->stats_lock, flags); + spin_unlock_irqrestore(&first->sighand->siglock, flags); } do { @@ -276,7 +276,7 @@ static void fill_tgid_exit(struct task_struct *tsk) { unsigned long flags; - spin_lock_irqsave(&tsk->signal->stats_lock, flags); + spin_lock_irqsave(&tsk->sighand->siglock, flags); if (!tsk->signal->stats) goto ret; @@ -288,7 +288,7 @@ static void fill_tgid_exit(struct task_struct *tsk) */ delayacct_add_tsk(tsk->signal->stats, tsk); ret: - spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); return; } @@ -464,15 +464,10 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, size_t size; int is_thread_group; struct nlattr *na; - unsigned long flags; if (!family_registered || !tidstats) return; - spin_lock_irqsave(&tsk->signal->stats_lock, flags); - is_thread_group = tsk->signal->stats ? 1 : 0; - spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); - rc = 0; /* * Size includes space for nested attributes @@ -480,6 +475,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + is_thread_group = (tsk->signal->stats != NULL); if (is_thread_group) size = 2 * size; /* PID + STATS + TGID + STATS */ -- cgit v1.2.3 From a98b6094261c0112e9c455c96995972181bff049 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:54 -0700 Subject: [PATCH] taskstats: don't use tasklist_lock Remove tasklist_lock from taskstats.c. find_task_by_pid() is rcu-safe. ->siglock allows us to traverse subthread without tasklist. Q: delay accounting looks wrong to me. If sub-thread has already called taskstats_exit_send() but didn't call release_task(self) yet it will be accounted twice. The window is big. No? Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 59 ++++++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b2efda94615a..b724aeea5443 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -174,21 +174,19 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) up_write(&listeners->sem); } -static int fill_pid(pid_t pid, struct task_struct *pidtsk, +static int fill_pid(pid_t pid, struct task_struct *tsk, struct taskstats *stats) { int rc = 0; - struct task_struct *tsk = pidtsk; - if (!pidtsk) { - read_lock(&tasklist_lock); + if (!tsk) { + rcu_read_lock(); tsk = find_task_by_pid(pid); - if (!tsk) { - read_unlock(&tasklist_lock); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) return -ESRCH; - } - get_task_struct(tsk); - read_unlock(&tasklist_lock); } else get_task_struct(tsk); @@ -214,40 +212,28 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, } -static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, +static int fill_tgid(pid_t tgid, struct task_struct *first, struct taskstats *stats) { - struct task_struct *tsk, *first; + struct task_struct *tsk; unsigned long flags; + int rc = -ESRCH; /* * Add additional stats from live tasks except zombie thread group * leaders who are already counted with the dead tasks */ - first = tgidtsk; - if (!first) { - read_lock(&tasklist_lock); + rcu_read_lock(); + if (!first) first = find_task_by_pid(tgid); - if (!first) { - read_unlock(&tasklist_lock); - return -ESRCH; - } - get_task_struct(first); - read_unlock(&tasklist_lock); - } else - get_task_struct(first); + if (!first || !lock_task_sighand(first, &flags)) + goto out; - tsk = first; - read_lock(&tasklist_lock); - /* Start with stats from dead tasks */ - if (first->sighand) { - spin_lock_irqsave(&first->sighand->siglock, flags); - if (first->signal->stats) - memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->sighand->siglock, flags); - } + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + tsk = first; do { if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) continue; @@ -260,15 +246,18 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, delayacct_add_tsk(stats, tsk); } while_each_thread(first, tsk); - read_unlock(&tasklist_lock); - stats->version = TASKSTATS_VERSION; + unlock_task_sighand(first, &flags); + rc = 0; +out: + rcu_read_unlock(); + + stats->version = TASKSTATS_VERSION; /* * Accounting subsytems can also add calls here to modify * fields of taskstats. */ - put_task_struct(first); - return 0; + return rc; } -- cgit v1.2.3 From d7c3f5f231c60d7e6ada5770b536df2b3ec1bd08 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 28 Oct 2006 10:38:54 -0700 Subject: [PATCH] fill_tgid: cleanup delays accounting fill_tgid() should skip not only an already exited group leader. If the task has ->exit_state != 0 it already did exit_notify(), so it also did fill_tgid_exit()->delayacct_add_tsk(->signal->stats) and we should skip it to avoid a double accounting. This patch doesn't close the race completely, but it cleanups the code. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b724aeea5443..8adfb8069c6d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -235,7 +235,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, tsk = first; do { - if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + if (tsk->exit_state) continue; /* * Accounting subsystem can call its functions here to -- cgit v1.2.3 From bb1d860551c4307b1a7ee9a21b120319075e987e Mon Sep 17 00:00:00 2001 From: Jim Houston Date: Sat, 28 Oct 2006 10:38:56 -0700 Subject: [PATCH] time_adjust cleared before use I notice that the code which implements adjtime clears the time_adjust value before using it. The attached patch makes the obvious fix. Acked-by: Roman Zippel Signed-off-by: Jim Houston Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 47195fa0ec4f..3afeaa3a73f9 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -161,9 +161,9 @@ void second_overflow(void) time_adjust += MAX_TICKADJ; tick_length -= MAX_TICKADJ_SCALED; } else { - time_adjust = 0; tick_length += (s64)(time_adjust * NSEC_PER_USEC / HZ) << TICK_LENGTH_SHIFT; + time_adjust = 0; } } } -- cgit v1.2.3 From 8fa1d7d3b2c51594c0f3aa151983dd51f605e07d Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Sat, 28 Oct 2006 10:38:57 -0700 Subject: [PATCH] cpu-hotplug: release `workqueue_mutex' properly on CPU hot-remove _cpu_down() acquires `workqueue_mutex' on its process, but doen't release it if __cpu_disable() fails. Signed-off-by: Satoru Takeuchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 27dd3ee47099..663c920b2234 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -150,18 +150,18 @@ static int _cpu_down(unsigned int cpu) p = __stop_machine_run(take_cpu_down, NULL, cpu); mutex_unlock(&cpu_bitmask_lock); - if (IS_ERR(p)) { + if (IS_ERR(p) || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, (void *)(long)cpu) == NOTIFY_BAD) BUG(); - err = PTR_ERR(p); - goto out_allowed; - } - - if (cpu_online(cpu)) + if (IS_ERR(p)) { + err = PTR_ERR(p); + goto out_allowed; + } goto out_thread; + } /* Wait for it to sleep (leaving idle task). */ while (!idle_cpu(cpu)) -- cgit v1.2.3 From 057647fc47b3a5fbcfa997041db3f483d506603c Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sat, 28 Oct 2006 10:38:58 -0700 Subject: [PATCH] workqueue: update kerneldoc This patch (as812) changes the kerneldoc comments explaining the return values from queue_work(), queue_delayed_work(), and queue_delayed_work_on(). The updated comments explain more accurately the meaning of the return code and avoid suggesting that a 0 value means the routine was unsuccessful. Signed-off-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3df9bfc7ff78..17c2f03d2c27 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -99,7 +99,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, * @wq: workqueue to use * @work: work to queue * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. @@ -138,7 +138,7 @@ static void delayed_work_timer_fn(unsigned long __data) * @work: work to queue * @delay: number of jiffies to wait before queueing * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. */ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) @@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work); * @work: work to queue * @delay: number of jiffies to wait before queueing * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. */ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) -- cgit v1.2.3 From d46a3d0d07ba539aea5b0e1ad30e568f0cb03576 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Oct 2006 16:45:58 +0300 Subject: [PATCH] taskstats: fix sk_buff leak 'return genlmsg_cancel()' in taskstats_user_cmd/taskstats_exit_send potentially leaks a skb. Unless we pass 'rep_skb' to the netlink layer we own sk_buff. This means we should always do kfree_skb() on failure. [ Thomas acked and pointed out missing return value in original version ] Signed-off-by: Oleg Nesterov Acked-by: Thomas Graf Cc: Andrew Morton Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8adfb8069c6d..f3c3e9d43d2c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -411,7 +411,7 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) return send_reply(rep_skb, info->snd_pid); nla_put_failure: - return genlmsg_cancel(rep_skb, reply); + rc = genlmsg_cancel(rep_skb, reply); err: nlmsg_free(rep_skb); return rc; @@ -507,7 +507,6 @@ send: nla_put_failure: genlmsg_cancel(rep_skb, reply); - goto ret; err_skb: nlmsg_free(rep_skb); ret: -- cgit v1.2.3 From 3d8334def5cf831d2ed438aae021696a2faa4ddd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Oct 2006 18:57:16 +0300 Subject: [PATCH] taskstats: fix sk_buff size calculation prepare_reply() adds GENL_HDRLEN to the payload (genlmsg_total_size()), but then it does genlmsg_put()->nlmsg_put(). This means we forget to reserve a room for 'struct nlmsghdr'. Signed-off-by: Oleg Nesterov Cc: Thomas Graf Cc: Andrew Morton Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f3c3e9d43d2c..2039585ec5e1 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -77,7 +77,8 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); + size = nlmsg_total_size(genlmsg_total_size(size)); + skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return -ENOMEM; -- cgit v1.2.3 From 0e2d57fc6e7dabdbfdd4f26c861e7e6c75d5bdcf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 29 Oct 2006 22:46:34 -0800 Subject: [PATCH] ndiswrapper: don't set the module->taints flags For ndiswrapper, don't set the module->taints flags, just set the kernel global tainted flag. This should allow ndiswrapper to continue to use GPL symbols. Signed-off-by: Randy Dunlap Cc: Florin Malita Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5072a943fe35..f0166563c602 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1718,7 +1718,7 @@ static struct module *load_module(void __user *umod, set_license(mod, get_modinfo(sechdrs, infoindex, "license")); if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint(TAINT_PROPRIETARY_MODULE); if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); -- cgit v1.2.3 From f0ec1aaf54caddd21c259aea8b2ecfbde4ee4fb9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Oct 2006 22:46:43 -0800 Subject: [PATCH] xacct_add_tsk: fix pure theoretical ->mm use-after-free Paranoid fix. The task can free its ->mm after the 'if (p->mm)' check. Signed-off-by: Oleg Nesterov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 65a5036a3d95..96f77013d3f0 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -80,13 +80,17 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) */ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { + struct mm_struct *mm; + /* convert pages-jiffies to Mbyte-usec */ stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; - if (p->mm) { + mm = get_task_mm(p); + if (mm) { /* adjust to KB unit */ - stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB; - stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB; + stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; + stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + mmput(mm); } stats->read_char = p->rchar; stats->write_char = p->wchar; -- cgit v1.2.3 From 4a279ff1ea1cf325775ada983035123fcdc8e986 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Oct 2006 22:07:15 -0800 Subject: [PATCH] taskstats: fix sub-threads accounting If there are no listeners, taskstats_exit_send() just returns because taskstats_exit_alloc() didn't allocate *tidstats. This is wrong, each sub-thread should do fill_tgid_exit() on exit, otherwise its ->delays is not recorded in ->signal->stats and lost. Q: We don't send TASKSTATS_TYPE_AGGR_TGID when single-threaded process exits. Is it good? How can the listener figure out that it was actually a process exit, not sub-thread? Signed-off-by: Oleg Nesterov Cc: Balbir Singh Acked-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 2039585ec5e1..f45c5e70773c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -455,10 +455,9 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, int is_thread_group; struct nlattr *na; - if (!family_registered || !tidstats) + if (!family_registered) return; - rc = 0; /* * Size includes space for nested attributes */ @@ -466,8 +465,15 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); is_thread_group = (tsk->signal->stats != NULL); - if (is_thread_group) - size = 2 * size; /* PID + STATS + TGID + STATS */ + if (is_thread_group) { + /* PID + STATS + TGID + STATS */ + size = 2 * size; + /* fill the tsk->signal->stats structure */ + fill_tgid_exit(tsk); + } + + if (!tidstats) + return; rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); if (rc < 0) @@ -487,11 +493,8 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, goto send; /* - * tsk has/had a thread group so fill the tsk->signal->stats structure * Doesn't matter if tsk is the leader or the last group member leaving */ - - fill_tgid_exit(tsk); if (!group_dead) goto send; -- cgit v1.2.3 From f46c483357c2d87606bbefb511321e3efd4baae0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 2 Nov 2006 22:07:16 -0800 Subject: [PATCH] Add printk_timed_ratelimit() printk_ratelimit() has global state which makes it not useful for callers which wish to perform ratelimiting at a particular frequency. Add a printk_timed_ratelimit() which utilises caller-provided state storage to permit more flexibility. This function can in fact be used for things other than printk ratelimiting and is perhaps poorly named. Cc: Ulrich Drepper Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f7d427ef5038..66426552fbfe 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -1101,3 +1102,23 @@ int printk_ratelimit(void) printk_ratelimit_burst); } EXPORT_SYMBOL(printk_ratelimit); + +/** + * printk_timed_ratelimit - caller-controlled printk ratelimiting + * @caller_jiffies: pointer to caller's state + * @interval_msecs: minimum interval between prints + * + * printk_timed_ratelimit() returns true if more than @interval_msecs + * milliseconds have elapsed since the last time printk_timed_ratelimit() + * returned true. + */ +bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msecs) +{ + if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { + *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); + return true; + } + return false; +} +EXPORT_SYMBOL(printk_timed_ratelimit); -- cgit v1.2.3 From 19c6b6ed3f597a583f58e3fc99256cc01ae8c394 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 2 Nov 2006 22:07:17 -0800 Subject: [PATCH] schedule removal of FUTEX_FD Apparently FUTEX_FD is unfixably racy and nothing uses it (or if it does, it shouldn't). Add a warning printk, give any remaining users six months to migrate off it. Cc: Ulrich Drepper Cc: Ingo Molnar Acked-by: Thomas Gleixner Cc: Rusty Russell Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b364e0026191..93ef30ba209f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1507,6 +1507,13 @@ static int futex_fd(u32 __user *uaddr, int signal) struct futex_q *q; struct file *filp; int ret, err; + static unsigned long printk_interval; + + if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { + printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " + "will be removed from the kernel in June 2007\n", + current->comm); + } ret = -EINVAL; if (!valid_signal(signal)) -- cgit v1.2.3 From b918f6e62cd46774f9fc0a3fbba6bd10ad85ee14 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 2 Nov 2006 22:07:19 -0800 Subject: [PATCH] swsusp: debugging Add a swsusp debugging mode. This does everything that's needed for a suspend except for actually suspending. So we can look in the log messages and work out a) what code is being slow and b) which drivers are misbehaving. (1) # echo testproc > /sys/power/disk # echo disk > /sys/power/state This should turn off the non-boot CPU, freeze all processes, wait for 5 seconds and then thaw the processes and the CPU. (2) # echo test > /sys/power/disk # echo disk > /sys/power/state This should turn off the non-boot CPU, freeze all processes, shrink memory, suspend all devices, wait for 5 seconds, resume the devices etc. Cc: Pavel Machek Cc: Stefan Seyfried Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d3a158a60312..b1fb7866b0b3 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -71,7 +71,7 @@ static inline void platform_finish(void) static int prepare_processes(void) { - int error; + int error = 0; pm_prepare_console(); @@ -84,6 +84,12 @@ static int prepare_processes(void) goto thaw; } + if (pm_disk_mode == PM_DISK_TESTPROC) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto thaw; + } + /* Free memory before shutting down devices. */ if (!(error = swsusp_shrink_memory())) return 0; @@ -120,13 +126,21 @@ int pm_suspend_disk(void) if (error) return error; + if (pm_disk_mode == PM_DISK_TESTPROC) + goto Thaw; + suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) { resume_console(); printk("Some devices failed to suspend\n"); - unprepare_processes(); - return error; + goto Thaw; + } + + if (pm_disk_mode == PM_DISK_TEST) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto Done; } pr_debug("PM: snapshotting memory.\n"); @@ -143,16 +157,17 @@ int pm_suspend_disk(void) power_down(pm_disk_mode); else { swsusp_free(); - unprepare_processes(); - return error; + goto Thaw; } - } else + } else { pr_debug("PM: Image restored successfully.\n"); + } swsusp_free(); Done: device_resume(); resume_console(); + Thaw: unprepare_processes(); return error; } @@ -249,6 +264,8 @@ static const char * const pm_disk_modes[] = { [PM_DISK_PLATFORM] = "platform", [PM_DISK_SHUTDOWN] = "shutdown", [PM_DISK_REBOOT] = "reboot", + [PM_DISK_TEST] = "test", + [PM_DISK_TESTPROC] = "testproc", }; /** @@ -303,17 +320,19 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) } } if (mode) { - if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) + if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT || + mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) { pm_disk_mode = mode; - else { + } else { if (pm_ops && pm_ops->enter && (mode == pm_ops->pm_disk_mode)) pm_disk_mode = mode; else error = -EINVAL; } - } else + } else { error = -EINVAL; + } pr_debug("PM: suspend-to-disk mode set to '%s'\n", pm_disk_modes[mode]); -- cgit v1.2.3 From 3fd593979802f81ff6452596ac61e3840f917589 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 2 Nov 2006 22:07:24 -0800 Subject: [PATCH] Create compat_sys_migrate_pages This is needed on bigendian 64bit architectures. Signed-off-by: Stephen Rothwell Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 33 +++++++++++++++++++++++++++++++++ kernel/sys_ni.c | 1 + 2 files changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d4898aad6cfa..6952dd057300 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, } return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } + +asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, + compat_ulong_t maxnode, + const compat_ulong_t __user *old_nodes, + const compat_ulong_t __user *new_nodes) +{ + unsigned long __user *old = NULL; + unsigned long __user *new = NULL; + nodemask_t tmp_mask; + unsigned long nr_bits; + unsigned long size; + + nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); + size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (old_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) + return -EFAULT; + old = compat_alloc_user_space(new_nodes ? size * 2 : size); + if (new_nodes) + new = old + size / sizeof(unsigned long); + if (copy_to_user(old, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + if (new_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) + return -EFAULT; + if (new == NULL) + new = compat_alloc_user_space(size); + if (copy_to_user(new, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + return sys_migrate_pages(pid, nr_bits + 1, old, new); +} #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0e53314b14de..d7306d0f3dfc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,7 @@ cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); cond_syscall(compat_sys_move_pages); +cond_syscall(compat_sys_migrate_pages); /* block-layer dependent */ cond_syscall(sys_bdflush); -- cgit v1.2.3 From 45c18b0bb579b5c1b89f8c99f1b6ffa4c586ba08 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 4 Nov 2006 10:06:02 -0800 Subject: Fix unlikely (but possible) race condition on task->user access There's a possible race condition when doing a "switch_uid()" from one user to another, which could race with another thread doing a signal allocation and looking at the old thread ->user pointer as it is freed. This explains an oops reported by Lukasz Trabinski: http://permalink.gmane.org/gmane.linux.kernel/462241 We fix this by delaying the (reference-counted) freeing of the user structure until the thread signal handler lock has been released, so that we know that the signal allocation has either seen the new value or has properly incremented the reference count of the old one. Race identified by Oleg Nesterov. Cc: Lukasz Trabinski Cc: Oleg Nesterov Cc: Andrew Morton Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/user.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 6408c0424291..220e586127a0 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user) atomic_dec(&old_user->processes); switch_uid_keyring(new_user); current->user = new_user; + + /* + * We need to synchronize with __sigqueue_alloc() + * doing a get_uid(p->user).. If that saw the old + * user value, we need to wait until it has exited + * its critical region before we can free the old + * structure. + */ + smp_mb(); + spin_unlock_wait(¤t->sighand->siglock); + free_uid(old_user); suid_keys(current); } -- cgit v1.2.3 From 10b1fbdb0a0ca91847a534ad26d0bc250c25b74f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 4 Nov 2006 13:03:00 -0800 Subject: Make sure "user->sigpending" count is in sync The previous commit (45c18b0bb579b5c1b89f8c99f1b6ffa4c586ba08, aka "Fix unlikely (but possible) race condition on task->user access") fixed a potential oops due to __sigqueue_alloc() getting its "user" pointer out of sync with switch_user(), and accessing a user pointer that had been de-allocated on another CPU. It still left another (much less serious) problem, where a concurrent __sigqueue_alloc and swich_user could cause sigqueue_alloc to do signal pending reference counting for a _different_ user than the one it then actually ended up using. No oops, but we'd end up with the wrong signal accounting. Another case of Oleg's eagle-eyes picking up the problem. This is trivially fixed by just making sure we load whichever "user" structure we decide to use (it doesn't matter _which_ one we pick, we just need to pick one) just once. Acked-by: Oleg Nesterov Cc: Andrew Morton Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7ed8d5304bec..df18c167a2a7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -267,18 +267,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; + struct user_struct *user; - atomic_inc(&t->user->sigpending); + /* + * In order to avoid problems with "switch_user()", we want to make + * sure that the compiler doesn't re-load "t->user" + */ + user = t->user; + barrier(); + atomic_inc(&user->sigpending); if (override_rlimit || - atomic_read(&t->user->sigpending) <= + atomic_read(&user->sigpending) <= t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) q = kmem_cache_alloc(sigqueue_cachep, flags); if (unlikely(q == NULL)) { - atomic_dec(&t->user->sigpending); + atomic_dec(&user->sigpending); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->user = get_uid(t->user); + q->user = get_uid(user); } return(q); } -- cgit v1.2.3 From 4b96b1a10cb00c867103b21f0f2a6c91b705db11 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Sun, 5 Nov 2006 23:52:04 -0800 Subject: [PATCH] Fix the spurious unlock_cpu_hotplug false warnings Cpu-hotplug locking has a minor race case caused because of setting the variable "recursive" to NULL *after* releasing the cpu_bitmask_lock in the function unlock_cpu_hotplug,instead of doing so before releasing the cpu_bitmask_lock. This was the cause of most of the recent false spurious lock_cpu_unlock warnings. This should fix the problem reported by Martin Lorenz reported in http://lkml.org/lkml/2006/10/29/127. Thanks to Srinivasa DS for pointing it out. Signed-off-by: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 663c920b2234..272254f20d97 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void) recursive_depth--; return; } - mutex_unlock(&cpu_bitmask_lock); recursive = NULL; + mutex_unlock(&cpu_bitmask_lock); } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); -- cgit v1.2.3 From 64efade11cddc4237c1b95ea4ca18af122a7e19e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 5 Nov 2006 23:52:10 -0800 Subject: [PATCH] lockdep: fix delayacct locking bug Make the delayacct lock irqsave; this avoids the possible deadlock where an interrupt is taken while holding the delayacct lock which needs to take the delayacct lock. Signed-off-by: Peter Zijlstra Acked-by: Oleg Nesterov Cc: Balbir Singh Cc: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 36752f124c6a..66a0ea48751d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -66,6 +66,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end, { struct timespec ts; s64 ns; + unsigned long flags; do_posix_clock_monotonic_gettime(end); ts = timespec_sub(*end, *start); @@ -73,10 +74,10 @@ static void delayacct_end(struct timespec *start, struct timespec *end, if (ns < 0) return; - spin_lock(¤t->delays->lock); + spin_lock_irqsave(¤t->delays->lock, flags); *total += ns; (*count)++; - spin_unlock(¤t->delays->lock); + spin_unlock_irqrestore(¤t->delays->lock, flags); } void __delayacct_blkio_start(void) @@ -104,6 +105,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) s64 tmp; struct timespec ts; unsigned long t1,t2,t3; + unsigned long flags; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -136,14 +138,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ - spin_lock(&tsk->delays->lock); + spin_lock_irqsave(&tsk->delays->lock, flags); tmp = d->blkio_delay_total + tsk->delays->blkio_delay; d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; - spin_unlock(&tsk->delays->lock); + spin_unlock_irqrestore(&tsk->delays->lock, flags); done: return 0; @@ -152,11 +154,12 @@ done: __u64 __delayacct_blkio_ticks(struct task_struct *tsk) { __u64 ret; + unsigned long flags; - spin_lock(&tsk->delays->lock); + spin_lock_irqsave(&tsk->delays->lock, flags); ret = nsec_to_clock_t(tsk->delays->blkio_delay + tsk->delays->swapin_delay); - spin_unlock(&tsk->delays->lock); + spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } -- cgit v1.2.3 From 0e009be8a0c2309f3696df70f72ef0075aa34c9c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 5 Nov 2006 23:52:11 -0800 Subject: [PATCH] Improve the removed sysctl warnings Don't warn about libpthread's access to kernel.version. When it receives -ENOSYS it will read /proc/sys/kernel/version. If anything else shows up print the sysctl number string. Signed-off-by: Eric W. Biederman Cc: Cal Peake Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8bff2c18fb5a..0c8e805bbd6f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2680,13 +2680,33 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, asmlinkage long sys_sysctl(struct __sysctl_args __user *args) { static int msg_count; + struct __sysctl_args tmp; + int name[CTL_MAXNAME]; + int i; + + /* Read in the sysctl name for better debug message logging */ + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME) + return -ENOTDIR; + for (i = 0; i < tmp.nlen; i++) + if (get_user(name[i], tmp.name + i)) + return -EFAULT; + + /* Ignore accesses to kernel.version */ + if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION)) + goto out; if (msg_count < 5) { msg_count++; printk(KERN_INFO "warning: process `%s' used the removed sysctl " - "system call\n", current->comm); + "system call with ", current->comm); + for (i = 0; i < tmp.nlen; i++) + printk("%d.", name[i]); + printk("\n"); } +out: return -ENOSYS; } -- cgit v1.2.3 From d99f160ac53e51090f015a8f0617cea25f81a191 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 5 Nov 2006 23:52:12 -0800 Subject: [PATCH] sysctl: allow a zero ctl_name in the middle of a sysctl table Since it is becoming clear that there are just enough users of the binary sysctl interface that completely removing the binary interface from the kernel will not be an option for foreseeable future, we need to find a way to address the sysctl maintenance issues. The basic problem is that sysctl requires one central authority to allocate sysctl numbers, or else conflicts and ABI breakage occur. The proc interface to sysctl does not have that problem, as names are not densely allocated. By not terminating a sysctl table until I have neither a ctl_name nor a procname, it becomes simple to add sysctl entries that don't show up in the binary sysctl interface. Which allows people to avoid allocating a binary sysctl value when not needed. I have audited the kernel code and in my reading I have not found a single sysctl table that wasn't terminated by a completely zero filled entry. So this change in behavior should not affect anything. I think this mechanism eases the pain enough that combined with a little disciple we can solve the reoccurring sysctl ABI breakage. Signed-off-by: Eric W. Biederman Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0c8e805bbd6f..09e569f4792b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1315,7 +1315,9 @@ repeat: return -ENOTDIR; if (get_user(n, name)) return -EFAULT; - for ( ; table->ctl_name; table++) { + for ( ; table->ctl_name || table->procname; table++) { + if (!table->ctl_name) + continue; if (n == table->ctl_name || table->ctl_name == CTL_ANY) { int error; if (table->child) { @@ -1532,7 +1534,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, int len; mode_t mode; - for (; table->ctl_name; table++) { + for (; table->ctl_name || table->procname; table++) { /* Can't do anything without a proc name. */ if (!table->procname) continue; @@ -1579,7 +1581,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) { struct proc_dir_entry *de; - for (; table->ctl_name; table++) { + for (; table->ctl_name || table->procname; table++) { if (!(de = table->de)) continue; if (de->mode & S_IFDIR) { -- cgit v1.2.3 From 0130b0b32ee53dc7add773fcea984f6a26ef1da3 Mon Sep 17 00:00:00 2001 From: Sharyathi Nagesh Date: Fri, 10 Nov 2006 12:27:54 -0800 Subject: [PATCH] fix Data Acess error in dup_fd On running the Stress Test on machine for more than 72 hours following error message was observed. 0:mon> e cpu 0x0: Vector: 300 (Data Access) at [c00000007ce2f7f0] pc: c000000000060d90: .dup_fd+0x240/0x39c lr: c000000000060d6c: .dup_fd+0x21c/0x39c sp: c00000007ce2fa70 msr: 800000000000b032 dar: ffffffff00000028 dsisr: 40000000 current = 0xc000000074950980 paca = 0xc000000000454500 pid = 27330, comm = bash 0:mon> t [c00000007ce2fa70] c000000000060d28 .dup_fd+0x1d8/0x39c (unreliable) [c00000007ce2fb30] c000000000060f48 .copy_files+0x5c/0x88 [c00000007ce2fbd0] c000000000061f5c .copy_process+0x574/0x1520 [c00000007ce2fcd0] c000000000062f88 .do_fork+0x80/0x1c4 [c00000007ce2fdc0] c000000000011790 .sys_clone+0x5c/0x74 [c00000007ce2fe30] c000000000008950 .ppc_clone+0x8/0xc The problem is because of race window. When if(expand) block is executed in dup_fd unlocking of oldf->file_lock give a window for fdtable in oldf to be modified. So actual open_files in oldf may not match with open_files variable. Cc: Vadim Lobanov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3da978eec791..4b4eab2a3161 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -687,6 +687,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) * the latest pointer. */ spin_lock(&oldf->file_lock); + open_files = count_open_files(old_fdt); old_fdt = files_fdtable(oldf); } -- cgit v1.2.3 From f72fa707604c015a6625e80f269506032d5430dc Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Fri, 10 Nov 2006 12:27:56 -0800 Subject: [PATCH] Fix misrouted interrupts deadlocks While testing kernel on machine with "irqpoll" option I've caught such a lockup: __do_IRQ() spin_lock(&desc->lock); desc->chip->ack(); /* IRQ is ACKed */ note_interrupt() misrouted_irq() handle_IRQ_event() if (...) local_irq_enable_in_hardirq(); /* interrupts are enabled from now */ ... __do_IRQ() /* same IRQ we've started from */ spin_lock(&desc->lock); /* LOCKUP */ Looking at misrouted_irq() code I've found that a potential deadlock like this can also take place: 1CPU: __do_IRQ() spin_lock(&desc->lock); /* irq = A */ misrouted_irq() for (i = 1; i < NR_IRQS; i++) { spin_lock(&desc->lock); /* irq = B */ if (desc->status & IRQ_INPROGRESS) { 2CPU: __do_IRQ() spin_lock(&desc->lock); /* irq = B */ misrouted_irq() for (i = 1; i < NR_IRQS; i++) { spin_lock(&desc->lock); /* irq = A */ if (desc->status & IRQ_INPROGRESS) { As the second lock on both CPUs is taken before checking that this irq is being handled in another processor this may cause a deadlock. This issue is only theoretical. I propose the attached patch to fix booth problems: when trying to handle misrouted IRQ active desc->lock may be unlocked. Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/spurious.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 543ea2e5ad93..9c7e2e4c1fe7 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -147,7 +147,11 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (unlikely(irqfixup)) { /* Don't punish working computers */ if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { - int ok = misrouted_irq(irq); + int ok; + + spin_unlock(&desc->lock); + ok = misrouted_irq(irq); + spin_lock(&desc->lock); if (action_ret == IRQ_NONE) desc->irqs_unhandled -= ok; } -- cgit v1.2.3 From 8b126b77536186eef69d408eb7959ce7f558f251 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 14 Nov 2006 02:03:23 -0800 Subject: [PATCH] setup_irq(): better mismatch debugging When we get a mismatch between handlers on the same IRQ, all we get is "IRQ handler type mismatch for IRQ n". Let's print the name of the presently-registered handler with which we got the mismatch. Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6879202afe9a..b385878c6e80 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -216,6 +216,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) { struct irq_desc *desc = irq_desc + irq; struct irqaction *old, **p; + const char *old_name = NULL; unsigned long flags; int shared = 0; @@ -255,8 +256,10 @@ int setup_irq(unsigned int irq, struct irqaction *new) * set the trigger type must match. */ if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { + old_name = old->name; goto mismatch; + } #if defined(CONFIG_IRQ_PER_CPU) /* All handlers must agree on per-cpuness */ @@ -322,11 +325,13 @@ int setup_irq(unsigned int irq, struct irqaction *new) return 0; mismatch: - spin_unlock_irqrestore(&desc->lock, flags); if (!(new->flags & IRQF_PROBE_SHARED)) { printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); + if (old_name) + printk(KERN_ERR "current handler: %s\n", old_name); dump_stack(); } + spin_unlock_irqrestore(&desc->lock, flags); return -EBUSY; } -- cgit v1.2.3 From 9a3a04ac386f44175b6a4142eaeab3d4170a57f3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Nov 2006 15:20:51 -0800 Subject: Revert "[PATCH] fix Data Acess error in dup_fd" This reverts commit 0130b0b32ee53dc7add773fcea984f6a26ef1da3. Sergey Vlasov points out (and Vadim Lobanov concurs) that the bug it was supposed to fix must be some unrelated memory corruption, and the "fix" actually causes more problems: "However, the new code does not look safe in all cases. If some other task has opened more files while dup_fd() released oldf->file_lock, the new code will update open_files to the new larger value. But newf was allocated with the old smaller value of open_files, therefore subsequent accesses to newf may try to write into unallocated memory." so revert it. Cc: Sharyathi Nagesh Cc: Sergey Vlasov Cc: Vadim Lobanov Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4b4eab2a3161..3da978eec791 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -687,7 +687,6 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) * the latest pointer. */ spin_lock(&oldf->file_lock); - open_files = count_open_files(old_fdt); old_fdt = files_fdtable(oldf); } -- cgit v1.2.3 From b86432b42eba5671969a9e6483ee219674b7ee25 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Thu, 16 Nov 2006 01:19:10 -0800 Subject: [PATCH] some irq_chip variables point to NULL I got an oops when booting 2.6.19-rc5-mm1 on my ia64 machine. Below is the log. Oops 11012296146944 [1] Modules linked in: binfmt_misc dm_mirror dm_multipath dm_mod thermal processor f an container button sg eepro100 e100 mii Pid: 0, CPU 0, comm: swapper psr : 0000121008022038 ifs : 800000000000040b ip : [] Not tainted ip is at __do_IRQ+0x371/0x3e0 unat: 0000000000000000 pfs : 000000000000040b rsc : 0000000000000003 rnat: 656960155aa56aa5 bsps: a00000010058b890 pr : 656960155aa55a65 ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c0270033f csd : 0000000000000000 ssd : 0000000000000000 b0 : a0000001000e1390 b6 : a0000001005beac0 b7 : e00000007f01aa00 f6 : 000000000000000000000 f7 : 0ffe69090000000000000 f8 : 1000a9090000000000000 f9 : 0ffff8000000000000000 f10 : 1000a908ffffff6f70000 f11 : 1003e0000000000000909 r1 : a000000100fbbff0 r2 : 0000000000010002 r3 : 0000000000010001 r8 : fffffffffffbffff r9 : a000000100bd8060 r10 : a000000100dd83b8 r11 : fffffffffffeffff r12 : a000000100bcbbb0 r13 : a000000100bc4000 r14 : 0000000000010000 r15 : 0000000000010000 r16 : a000000100c01aa8 r17 : a000000100d2c350 r18 : 0000000000000000 r19 : a000000100d2c300 r20 : a000000100c01a88 r21 : 0000000080010100 r22 : a000000100c01ac0 r23 : a0000001000108e0 r24 : e000000477980004 r25 : 0000000000000000 r26 : 0000000000000000 r27 : e00000000913400c r28 : e0000004799ee51c r29 : e0000004778b87f0 r30 : a000000100d2c300 r31 : a00000010005c7e0 Call Trace: [] show_stack+0x40/0xa0 sp=a000000100bcb760 bsp=a000000100bc4f40 [] show_regs+0x840/0x880 sp=a000000100bcb930 bsp=a000000100bc4ee8 [] die+0x250/0x320 sp=a000000100bcb930 bsp=a000000100bc4ea0 [] ia64_do_page_fault+0x8d0/0xa20 sp=a000000100bcb950 bsp=a000000100bc4e50 [] ia64_leave_kernel+0x0/0x290 sp=a000000100bcb9e0 bsp=a000000100bc4e50 [] __do_IRQ+0x370/0x3e0 sp=a000000100bcbbb0 bsp=a000000100bc4df0 [] ia64_handle_irq+0x170/0x220 sp=a000000100bcbbb0 bsp=a000000100bc4dc0 [] ia64_leave_kernel+0x0/0x290 sp=a000000100bcbbb0 bsp=a000000100bc4dc0 [] ia64_pal_call_static+0x90/0xc0 sp=a000000100bcbd80 bsp=a000000100bc4d78 [] default_idle+0x90/0x160 sp=a000000100bcbd80 bsp=a000000100bc4d58 [] cpu_idle+0x1f0/0x440 sp=a000000100bcbe20 bsp=a000000100bc4d18 [] rest_init+0xc0/0xe0 sp=a000000100bcbe20 bsp=a000000100bc4d00 [] start_kernel+0x6a0/0x6c0 sp=a000000100bcbe20 bsp=a000000100bc4ca0 [] __end_ivt_text+0x6d0/0x6f0 sp=a000000100bcbe30 bsp=a000000100bc4c00 <0>Kernel panic - not syncing: Aiee, killing interrupt handler! The root cause is that some irq_chip variables, especially ia64_msi_chip, initiate their memeber end to point to NULL. __do_IRQ doesn't check if irq_chip->end is null and just calls it after processing the interrupt. As irq_chip->end is called at many places, so I fix it by reinitiating irq_chip->end to dummy_irq_chip.end, e.g., a noop function. Signed-off-by: Zhang Yanmin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2d0dc3efe813..ebfd24a41858 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -233,6 +233,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->shutdown = chip->disable; if (!chip->name) chip->name = chip->typename; + if (!chip->end) + chip->end = dummy_irq_chip.end; } static inline void mask_ack_irq(struct irq_desc *desc, int irq) -- cgit v1.2.3 From 1ff5683043196b9ad628a5de6bf8eeca52ee8bfd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Nov 2006 19:57:22 +0100 Subject: [PATCH] lockdep: fix static keys in module-allocated percpu areas lockdep got confused by certain locks in modules: INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. Call Trace: [] dump_trace+0xaa/0x3f2 [] show_trace+0x3a/0x60 [] dump_stack+0x15/0x17 [] __lock_acquire+0x724/0x9bb [] lock_acquire+0x4d/0x67 [] rt_spin_lock+0x3d/0x41 [] :ip_conntrack:__ip_ct_refresh_acct+0x131/0x174 [] :ip_conntrack:udp_packet+0xbf/0xcf [] :ip_conntrack:ip_conntrack_in+0x394/0x4a7 [] nf_iterate+0x41/0x7f [] nf_hook_slow+0x64/0xd5 [] ip_rcv+0x24e/0x506 [...] Steven Rostedt found the bug: static_obj() check did not take PERCPU_ENOUGH_ROOM into account, so in-module DEFINE_PER_CPU-area locks were triggering this message. Signed-off-by: Ingo Molnar Signed-off-by: Steven Rostedt Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b739be2a6dc9..c9fefdb1a7db 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1081,7 +1081,8 @@ static int static_obj(void *obj) */ for_each_possible_cpu(i) { start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); + end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM + + per_cpu_offset(i); if ((addr >= start) && (addr < end)) return 1; -- cgit v1.2.3 From 52bad64d95bd89e08c49ec5a071fa6dcbe5a1a9c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Nov 2006 14:54:01 +0000 Subject: WorkStruct: Separate delayable and non-delayable events. Separate delayable work items from non-delayable work items be splitting them into a separate structure (delayed_work), which incorporates a work_struct and the timer_list removed from work_struct. The work_struct struct is huge, and this limits it's usefulness. On a 64-bit architecture it's nearly 100 bytes in size. This reduces that by half for the non-delayable type of event. Signed-Off-By: David Howells --- kernel/workqueue.c | 51 ++++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 17c2f03d2c27..44fc54b7decf 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -122,29 +122,33 @@ EXPORT_SYMBOL_GPL(queue_work); static void delayed_work_timer_fn(unsigned long __data) { - struct work_struct *work = (struct work_struct *)__data; - struct workqueue_struct *wq = work->wq_data; + struct delayed_work *dwork = (struct delayed_work *)__data; + struct workqueue_struct *wq = dwork->work.wq_data; int cpu = smp_processor_id(); if (unlikely(is_single_threaded(wq))) cpu = singlethread_cpu; - __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work); } /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use - * @work: work to queue + * @work: delayable work to queue * @delay: number of jiffies to wait before queueing * * Returns 0 if @work was already on a queue, non-zero otherwise. */ int fastcall queue_delayed_work(struct workqueue_struct *wq, - struct work_struct *work, unsigned long delay) + struct delayed_work *dwork, unsigned long delay) { int ret = 0; - struct timer_list *timer = &work->timer; + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; + + if (delay == 0) + return queue_work(wq, work); if (!test_and_set_bit(0, &work->pending)) { BUG_ON(timer_pending(timer)); @@ -153,7 +157,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, /* This stores wq for the moment, for the timer_fn */ work->wq_data = wq; timer->expires = jiffies + delay; - timer->data = (unsigned long)work; + timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; add_timer(timer); ret = 1; @@ -172,10 +176,11 @@ EXPORT_SYMBOL_GPL(queue_delayed_work); * Returns 0 if @work was already on a queue, non-zero otherwise. */ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct work_struct *work, unsigned long delay) + struct delayed_work *dwork, unsigned long delay) { int ret = 0; - struct timer_list *timer = &work->timer; + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; if (!test_and_set_bit(0, &work->pending)) { BUG_ON(timer_pending(timer)); @@ -184,7 +189,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, /* This stores wq for the moment, for the timer_fn */ work->wq_data = wq; timer->expires = jiffies + delay; - timer->data = (unsigned long)work; + timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; add_timer_on(timer, cpu); ret = 1; @@ -468,31 +473,31 @@ EXPORT_SYMBOL(schedule_work); /** * schedule_delayed_work - put work task in global workqueue after delay - * @work: job to be done - * @delay: number of jiffies to wait + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ -int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) +int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(keventd_wq, work, delay); + return queue_delayed_work(keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use - * @work: job to be done + * @dwork: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ int schedule_delayed_work_on(int cpu, - struct work_struct *work, unsigned long delay) + struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, keventd_wq, work, delay); + return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); @@ -539,12 +544,12 @@ EXPORT_SYMBOL(flush_scheduled_work); * cancel_rearming_delayed_workqueue - reliably kill off a delayed * work whose handler rearms the delayed work. * @wq: the controlling workqueue structure - * @work: the delayed work struct + * @dwork: the delayed work struct */ void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, - struct work_struct *work) + struct delayed_work *dwork) { - while (!cancel_delayed_work(work)) + while (!cancel_delayed_work(dwork)) flush_workqueue(wq); } EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); @@ -552,11 +557,11 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); /** * cancel_rearming_delayed_work - reliably kill off a delayed keventd * work whose handler rearms the delayed work. - * @work: the delayed work struct + * @dwork: the delayed work struct */ -void cancel_rearming_delayed_work(struct work_struct *work) +void cancel_rearming_delayed_work(struct delayed_work *dwork) { - cancel_rearming_delayed_workqueue(keventd_wq, work); + cancel_rearming_delayed_workqueue(keventd_wq, dwork); } EXPORT_SYMBOL(cancel_rearming_delayed_work); -- cgit v1.2.3 From 6bb49e5965c1fc399b4d3cd2b5cf2da535b330c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Nov 2006 14:54:45 +0000 Subject: WorkStruct: Typedef the work function prototype Define a type for the work function prototype. It's not only kept in the work_struct struct, it's also passed as an argument to several functions. This makes it easier to change it. Signed-Off-By: David Howells --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 44fc54b7decf..1e9d61ecf762 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -217,7 +217,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) while (!list_empty(&cwq->worklist)) { struct work_struct *work = list_entry(cwq->worklist.next, struct work_struct, entry); - void (*f) (void *) = work->func; + work_func_t f = work->func; void *data = work->data; list_del_init(cwq->worklist.next); @@ -513,7 +513,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); * * schedule_on_each_cpu() is very slow. */ -int schedule_on_each_cpu(void (*func)(void *info), void *info) +int schedule_on_each_cpu(work_func_t func, void *info) { int cpu; struct work_struct *works; @@ -578,7 +578,7 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work); * Returns: 0 - function was executed * 1 - function was scheduled for execution */ -int execute_in_process_context(void (*fn)(void *data), void *data, +int execute_in_process_context(work_func_t fn, void *data, struct execute_work *ew) { if (!in_interrupt()) { -- cgit v1.2.3 From 365970a1ea76d81cb1ad2f652acb605f06dae256 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Nov 2006 14:54:49 +0000 Subject: WorkStruct: Merge the pending bit into the wq_data pointer Reclaim a word from the size of the work_struct by folding the pending bit and the wq_data pointer together. This shouldn't cause misalignment problems as all pointers should be at least 4-byte aligned. Signed-Off-By: David Howells --- kernel/workqueue.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1e9d61ecf762..967479756511 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -80,6 +80,29 @@ static inline int is_single_threaded(struct workqueue_struct *wq) return list_empty(&wq->list); } +static inline void set_wq_data(struct work_struct *work, void *wq) +{ + unsigned long new, old, res; + + /* assume the pending flag is already set and that the task has already + * been queued on this workqueue */ + new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); + res = work->management; + if (res != new) { + do { + old = res; + new = (unsigned long) wq; + new |= (old & WORK_STRUCT_FLAG_MASK); + res = cmpxchg(&work->management, old, new); + } while (res != old); + } +} + +static inline void *get_wq_data(struct work_struct *work) +{ + return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK); +} + /* Preempt must be disabled. */ static void __queue_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) @@ -87,7 +110,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, unsigned long flags; spin_lock_irqsave(&cwq->lock, flags); - work->wq_data = cwq; + set_wq_data(work, cwq); list_add_tail(&work->entry, &cwq->worklist); cwq->insert_sequence++; wake_up(&cwq->more_work); @@ -108,7 +131,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) { int ret = 0, cpu = get_cpu(); - if (!test_and_set_bit(0, &work->pending)) { + if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) { if (unlikely(is_single_threaded(wq))) cpu = singlethread_cpu; BUG_ON(!list_empty(&work->entry)); @@ -123,7 +146,7 @@ EXPORT_SYMBOL_GPL(queue_work); static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct workqueue_struct *wq = dwork->work.wq_data; + struct workqueue_struct *wq = get_wq_data(&dwork->work); int cpu = smp_processor_id(); if (unlikely(is_single_threaded(wq))) @@ -150,12 +173,12 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, if (delay == 0) return queue_work(wq, work); - if (!test_and_set_bit(0, &work->pending)) { + if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) { BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); /* This stores wq for the moment, for the timer_fn */ - work->wq_data = wq; + set_wq_data(work, wq); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -182,12 +205,12 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - if (!test_and_set_bit(0, &work->pending)) { + if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) { BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); /* This stores wq for the moment, for the timer_fn */ - work->wq_data = wq; + set_wq_data(work, wq); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -223,8 +246,8 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) list_del_init(cwq->worklist.next); spin_unlock_irqrestore(&cwq->lock, flags); - BUG_ON(work->wq_data != cwq); - clear_bit(0, &work->pending); + BUG_ON(get_wq_data(work) != cwq); + clear_bit(WORK_STRUCT_PENDING, &work->management); f(data); spin_lock_irqsave(&cwq->lock, flags); -- cgit v1.2.3 From 65f27f38446e1976cc98fd3004b110fedcddd189 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Nov 2006 14:55:48 +0000 Subject: WorkStruct: Pass the work_struct pointer instead of context data Pass the work_struct pointer to the work function rather than context data. The work function can use container_of() to work out the data. For the cases where the container of the work_struct may go away the moment the pending bit is cleared, it is made possible to defer the release of the structure by deferring the clearing of the pending bit. To make this work, an extra flag is introduced into the management side of the work_struct. This governs auto-release of the structure upon execution. Ordinarily, the work queue executor would release the work_struct for further scheduling or deallocation by clearing the pending bit prior to jumping to the work function. This means that, unless the driver makes some guarantee itself that the work_struct won't go away, the work function may not access anything else in the work_struct or its container lest they be deallocated.. This is a problem if the auxiliary data is taken away (as done by the last patch). However, if the pending bit is *not* cleared before jumping to the work function, then the work function *may* access the work_struct and its container with no problems. But then the work function must itself release the work_struct by calling work_release(). In most cases, automatic release is fine, so this is the default. Special initiators exist for the non-auto-release case (ending in _NAR). Signed-Off-By: David Howells --- kernel/kmod.c | 16 ++++++++++------ kernel/kthread.c | 13 ++++++++----- kernel/power/poweroff.c | 4 ++-- kernel/sys.c | 4 ++-- kernel/workqueue.c | 19 ++++++++----------- 5 files changed, 30 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index bb4e29d924e4..7dc7a9dad6ac 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module); #endif /* CONFIG_KMOD */ struct subprocess_info { + struct work_struct work; struct completion *complete; char *path; char **argv; @@ -221,9 +222,10 @@ static int wait_for_helper(void *data) } /* This is run by khelper thread */ -static void __call_usermodehelper(void *data) +static void __call_usermodehelper(struct work_struct *work) { - struct subprocess_info *sub_info = data; + struct subprocess_info *sub_info = + container_of(work, struct subprocess_info, work); pid_t pid; int wait = sub_info->wait; @@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, { DECLARE_COMPLETION_ONSTACK(done); struct subprocess_info sub_info = { + .work = __WORK_INITIALIZER(sub_info.work, + __call_usermodehelper), .complete = &done, .path = path, .argv = argv, @@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, .wait = wait, .retval = 0, }; - DECLARE_WORK(work, __call_usermodehelper, &sub_info); if (!khelper_wq) return -EBUSY; @@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, if (path[0] == '\0') return 0; - queue_work(khelper_wq, &work); + queue_work(khelper_wq, &sub_info.work); wait_for_completion(&done); return sub_info.retval; } @@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, { DECLARE_COMPLETION(done); struct subprocess_info sub_info = { + .work = __WORK_INITIALIZER(sub_info.work, + __call_usermodehelper), .complete = &done, .path = path, .argv = argv, @@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, .retval = 0, }; struct file *f; - DECLARE_WORK(work, __call_usermodehelper, &sub_info); if (!khelper_wq) return -EBUSY; @@ -318,7 +322,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, } sub_info.stdin = f; - queue_work(khelper_wq, &work); + queue_work(khelper_wq, &sub_info.work); wait_for_completion(&done); return sub_info.retval; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 4f9c60ef95e8..1db8c72d0d38 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -31,6 +31,8 @@ struct kthread_create_info /* Result passed back to kthread_create() from keventd. */ struct task_struct *result; struct completion done; + + struct work_struct work; }; struct kthread_stop_info @@ -111,9 +113,10 @@ static int kthread(void *_create) } /* We are keventd: create a thread. */ -static void keventd_create_kthread(void *_create) +static void keventd_create_kthread(struct work_struct *work) { - struct kthread_create_info *create = _create; + struct kthread_create_info *create = + container_of(work, struct kthread_create_info, work); int pid; /* We want our own signal handler (we take no signals by default). */ @@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), ...) { struct kthread_create_info create; - DECLARE_WORK(work, keventd_create_kthread, &create); create.threadfn = threadfn; create.data = data; init_completion(&create.started); init_completion(&create.done); + INIT_WORK(&create.work, keventd_create_kthread); /* * The workqueue needs to start up first: */ if (!helper_wq) - work.func(work.data); + create.work.func(&create.work); else { - queue_work(helper_wq, &work); + queue_work(helper_wq, &create.work); wait_for_completion(&create.done); } if (!IS_ERR(create.result)) { diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index f1f900ac3164..678ec736076b 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -16,12 +16,12 @@ * callback we use. */ -static void do_poweroff(void *dummy) +static void do_poweroff(struct work_struct *dummy) { kernel_power_off(); } -static DECLARE_WORK(poweroff_work, do_poweroff, NULL); +static DECLARE_WORK(poweroff_work, do_poweroff); static void handle_poweroff(int key, struct tty_struct *tty) { diff --git a/kernel/sys.c b/kernel/sys.c index 98489d82801b..c87b461de38d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user return 0; } -static void deferred_cad(void *dummy) +static void deferred_cad(struct work_struct *dummy) { kernel_restart(NULL); } @@ -892,7 +892,7 @@ static void deferred_cad(void *dummy) */ void ctrl_alt_del(void) { - static DECLARE_WORK(cad_work, deferred_cad, NULL); + static DECLARE_WORK(cad_work, deferred_cad); if (C_A_D) schedule_work(&cad_work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 967479756511..8d1e7cb8a51a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -241,14 +241,14 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) struct work_struct *work = list_entry(cwq->worklist.next, struct work_struct, entry); work_func_t f = work->func; - void *data = work->data; list_del_init(cwq->worklist.next); spin_unlock_irqrestore(&cwq->lock, flags); BUG_ON(get_wq_data(work) != cwq); - clear_bit(WORK_STRUCT_PENDING, &work->management); - f(data); + if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management)) + work_release(work); + f(work); spin_lock_irqsave(&cwq->lock, flags); cwq->remove_sequence++; @@ -527,7 +527,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); /** * schedule_on_each_cpu - call a function on each online CPU from keventd * @func: the function to call - * @info: a pointer to pass to func() * * Returns zero on success. * Returns -ve errno on failure. @@ -536,7 +535,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); * * schedule_on_each_cpu() is very slow. */ -int schedule_on_each_cpu(work_func_t func, void *info) +int schedule_on_each_cpu(work_func_t func) { int cpu; struct work_struct *works; @@ -547,7 +546,7 @@ int schedule_on_each_cpu(work_func_t func, void *info) mutex_lock(&workqueue_mutex); for_each_online_cpu(cpu) { - INIT_WORK(per_cpu_ptr(works, cpu), func, info); + INIT_WORK(per_cpu_ptr(works, cpu), func); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), per_cpu_ptr(works, cpu)); } @@ -591,7 +590,6 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work); /** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute - * @data: data to pass to the function * @ew: guaranteed storage for the execute work structure (must * be available when the work executes) * @@ -601,15 +599,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work); * Returns: 0 - function was executed * 1 - function was scheduled for execution */ -int execute_in_process_context(work_func_t fn, void *data, - struct execute_work *ew) +int execute_in_process_context(work_func_t fn, struct execute_work *ew) { if (!in_interrupt()) { - fn(data); + fn(&ew->work); return 0; } - INIT_WORK(&ew->work, fn, data); + INIT_WORK(&ew->work, fn); schedule_work(&ew->work); return 1; -- cgit v1.2.3 From c4028958b6ecad064b1a6303a6a5906d4fe48d73 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Nov 2006 14:57:56 +0000 Subject: WorkStruct: make allyesconfig Fix up for make allyesconfig. Signed-Off-By: David Howells --- kernel/relay.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index f04bbdb56ac2..2b92e8ece85b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -308,9 +308,10 @@ static struct rchan_callbacks default_channel_callbacks = { * reason waking is deferred is that calling directly from write * causes problems if you're writing from say the scheduler. */ -static void wakeup_readers(void *private) +static void wakeup_readers(struct work_struct *work) { - struct rchan_buf *buf = private; + struct rchan_buf *buf = + container_of(work, struct rchan_buf, wake_readers.work); wake_up_interruptible(&buf->read_wait); } @@ -328,7 +329,7 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) if (init) { init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); - INIT_WORK(&buf->wake_readers, NULL, NULL); + INIT_DELAYED_WORK(&buf->wake_readers, NULL); } else { cancel_delayed_work(&buf->wake_readers); flush_scheduled_work(); @@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) buf->padding[old_subbuf]; smp_mb(); if (waitqueue_active(&buf->read_wait)) { - PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); + PREPARE_DELAYED_WORK(&buf->wake_readers, + wakeup_readers); schedule_delayed_work(&buf->wake_readers, 1); } } -- cgit v1.2.3 From b42172fc7b569a0ef2b0fa38d71382969074c0e2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 22 Nov 2006 09:32:06 -0800 Subject: Don't call "note_interrupt()" with irq descriptor lock held This reverts commit f72fa707604c015a6625e80f269506032d5430dc, and solves the problem that it tried to fix by simply making "__do_IRQ()" call the note_interrupt() function without the lock held, the way everybody else does. It should be noted that all interrupt handling code must never allow the descriptor actors to be entered "recursively" (that's why we do all the magic IRQ_PENDING stuff in the first place), so there actually is exclusion at that much higher level, even in the absense of locking. Acked-by: Vivek Goyal Acked-by:Pavel Emelianov Cc: Andrew Morton Cc: Ingo Molnar Cc: Adrian Bunk Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 4 ++-- kernel/irq/spurious.c | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 42aa6f1a3f0f..a681912bc89a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -231,10 +231,10 @@ fastcall unsigned int __do_IRQ(unsigned int irq) spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); - - spin_lock(&desc->lock); if (!noirqdebug) note_interrupt(irq, desc, action_ret); + + spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) break; desc->status &= ~IRQ_PENDING; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 9c7e2e4c1fe7..543ea2e5ad93 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -147,11 +147,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (unlikely(irqfixup)) { /* Don't punish working computers */ if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { - int ok; - - spin_unlock(&desc->lock); - ok = misrouted_irq(irq); - spin_lock(&desc->lock); + int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) desc->irqs_unhandled -= ok; } -- cgit v1.2.3 From 753ca4f312a4b26940e4731b4fa5dbbbbcc77e97 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 25 Nov 2006 11:09:34 -0800 Subject: [PATCH] fix copy_process() error check The return value of copy_process() should be checked by IS_ERR(). Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3da978eec791..8cdd3e72ba55 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1315,9 +1315,8 @@ struct task_struct * __devinit fork_idle(int cpu) struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); - if (!task) - return ERR_PTR(-ENOMEM); - init_idle(task, cpu); + if (!IS_ERR(task)) + init_idle(task, cpu); return task; } -- cgit v1.2.3 From cfd3ef2346f924d6c0e82236c20fdb3a8840136a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 25 Nov 2006 11:09:37 -0800 Subject: [PATCH] lockdep: spin_lock_irqsave_nested() Introduce spin_lock_irqsave_nested(); implementation from: http://lkml.org/lkml/2006/6/1/122 Patch from: http://lkml.org/lkml/2006/9/13/258 [akpm@osdl.org: two compile fixes] Signed-off-by: Arjan van de Ven Signed-off-by: Jiri Kosina Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/spinlock.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 476c3741511b..2c6c2bf85514 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -293,6 +293,27 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_spin_lock_nested); +unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + /* + * On lockdep we dont want the hand-coded irq-enable of + * _raw_spin_lock_flags() code, because lockdep assumes + * that interrupts are not re-enabled during lock-acquire: + */ +#ifdef CONFIG_PROVE_SPIN_LOCKING + _raw_spin_lock(lock); +#else + _raw_spin_lock_flags(lock, &flags); +#endif + return flags; +} + +EXPORT_SYMBOL(_spin_lock_irqsave_nested); #endif -- cgit v1.2.3 From ff0a538d8b08700df2b46f9aafc9fb2765071f0a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 28 Nov 2006 20:12:59 +0100 Subject: [PATCH] x86-64: work around gcc4 issue with -Os in Dwarf2 stack unwind This fixes a problem with gcc4 mis-compiling the stack unwind code under -Os, which resulted in 'stuck' messages whenever an assembly routine was encountered. (The second hunk is trivial cleanup.) Signed-off-by: Jan Beulich --- kernel/unwind.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index f7e50d16dbf6..ed0a21d4a902 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -938,8 +938,11 @@ int unwind(struct unwind_frame_info *frame) else { retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); /* skip augmentation */ - if (((const char *)(cie + 2))[1] == 'z') - ptr += get_uleb128(&ptr, end); + if (((const char *)(cie + 2))[1] == 'z') { + uleb128_t augSize = get_uleb128(&ptr, end); + + ptr += augSize; + } if (ptr > end || retAddrReg >= ARRAY_SIZE(reg_info) || REG_INVALID(retAddrReg) @@ -963,9 +966,7 @@ int unwind(struct unwind_frame_info *frame) if (cie == NULL || fde == NULL) { #ifdef CONFIG_FRAME_POINTER unsigned long top, bottom; -#endif -#ifdef CONFIG_FRAME_POINTER top = STACK_TOP(frame->task); bottom = STACK_BOTTOM(frame->task); # if FRAME_RETADDR_OFFSET < 0 -- cgit v1.2.3 From 3cce4856ff3dfa663b1a168dab48120d70820da6 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 28 Nov 2006 12:29:43 -0800 Subject: [PATCH] fix create_write_pipe() error check The return value of create_write_pipe()/create_read_pipe() should be checked by IS_ERR(). Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index bb4e29d924e4..2b76dee28496 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -307,14 +307,14 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, return 0; f = create_write_pipe(); - if (!f) - return -ENOMEM; + if (IS_ERR(f)) + return PTR_ERR(f); *filp = f; f = create_read_pipe(f); - if (!f) { + if (IS_ERR(f)) { free_write_pipe(*filp); - return -ENOMEM; + return PTR_ERR(f); } sub_info.stdin = f; -- cgit v1.2.3 From e17e0f51aeea4e59c7e450a1c0f26605b91c1260 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 24 Nov 2006 12:15:25 +0100 Subject: Driver core: show drivers in /sys/module/ Show the drivers, which belong to the module: $ ls -l /sys/module/usbcore/drivers/ hub -> ../../../bus/usb/drivers/hub usb -> ../../../bus/usb/drivers/usb usbfs -> ../../../bus/usb/drivers/usbfs Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f0166563c602..45e01cb60101 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1086,22 +1086,35 @@ static int mod_sysfs_setup(struct module *mod, goto out; kobj_set_kset_s(&mod->mkobj, module_subsys); mod->mkobj.mod = mod; - err = kobject_register(&mod->mkobj.kobj); + + /* delay uevent until full sysfs population */ + kobject_init(&mod->mkobj.kobj); + err = kobject_add(&mod->mkobj.kobj); if (err) goto out; + mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers"); + if (!mod->drivers_dir) + goto out_unreg; + err = module_param_sysfs_setup(mod, kparam, num_params); if (err) - goto out_unreg; + goto out_unreg_drivers; err = module_add_modinfo_attrs(mod); if (err) - goto out_unreg; + goto out_unreg_param; + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; +out_unreg_drivers: + kobject_unregister(mod->drivers_dir); +out_unreg_param: + module_param_sysfs_remove(mod); out_unreg: - kobject_unregister(&mod->mkobj.kobj); + kobject_del(&mod->mkobj.kobj); + kobject_put(&mod->mkobj.kobj); out: return err; } @@ -1110,6 +1123,7 @@ static void mod_kobject_remove(struct module *mod) { module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); + kobject_unregister(mod->drivers_dir); kobject_unregister(&mod->mkobj.kobj); } @@ -2275,11 +2289,14 @@ void print_modules(void) void module_add_driver(struct module *mod, struct device_driver *drv) { + int no_warn; + if (!mod || !drv) return; - /* Don't check return code; this call is idempotent */ - sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); + /* Don't check return codes; these calls are idempotent */ + no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); + no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name); } EXPORT_SYMBOL(module_add_driver); @@ -2288,6 +2305,8 @@ void module_remove_driver(struct device_driver *drv) if (!drv) return; sysfs_remove_link(&drv->kobj, "module"); + if (drv->owner && drv->owner->drivers_dir) + sysfs_remove_link(drv->owner->drivers_dir, drv->name); } EXPORT_SYMBOL(module_remove_driver); -- cgit v1.2.3 From 339bf98ffc6a8d8eb16fc532ac57ffbced2f8a68 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 10 Nov 2006 14:10:15 -0800 Subject: [NETLINK]: Do precise netlink message allocations where possible Account for the netlink message header size directly in nlmsg_new() instead of relying on the caller calculate it correctly. Replaces error handling of message construction functions when constructing notifications with bug traps since a failure implies a bug in calculating the size of the skb. Signed-off-by: Thomas Graf Acked-by: Paul Moore Signed-off-by: David S. Miller --- kernel/taskstats.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f45c5e70773c..4f3f0e48c845 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -77,8 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - size = nlmsg_total_size(genlmsg_total_size(size)); - skb = nlmsg_new(size, GFP_KERNEL); + skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); if (!skb) return -ENOMEM; -- cgit v1.2.3 From 3dabc7157859e706770c825aa229f8943db4e0e1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 14 Nov 2006 19:44:52 -0800 Subject: [GENL]: Add genlmsg_new() to allocate generic netlink messages Signed-off-by: Thomas Graf Acked-by: Paul Moore Signed-off-by: David S. Miller --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4f3f0e48c845..faa5239813ce 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -77,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); + skb = genlmsg_new(size, GFP_KERNEL); if (!skb) return -ENOMEM; -- cgit v1.2.3 From 17c157c889f4b07258af6bfec9e4e9dcf3c00178 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 14 Nov 2006 19:46:02 -0800 Subject: [GENL]: Add genlmsg_put_reply() to simplify building reply headers By modyfing genlmsg_put() to take a genl_family and by adding genlmsg_put_reply() the process of constructing the netlink and generic netlink headers is simplified. Signed-off-by: Thomas Graf Acked-by: Paul Moore Signed-off-by: David S. Miller --- kernel/taskstats.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index faa5239813ce..d3d28919d4b4 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -85,13 +85,9 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, int seq = get_cpu_var(taskstats_seqnum)++; put_cpu_var(taskstats_seqnum); - reply = genlmsg_put(skb, 0, seq, - family.id, 0, 0, - cmd, family.version); + reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); } else - reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, - family.id, 0, 0, - cmd, family.version); + reply = genlmsg_put_reply(skb, info, &family, 0, cmd); if (reply == NULL) { nlmsg_free(skb); return -EINVAL; -- cgit v1.2.3 From f6a570333e554b48ad589e7137c77c57809eee81 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 18 Oct 2006 01:47:25 -0400 Subject: [PATCH] severing module.h->sched.h Signed-off-by: Al Viro --- kernel/latency.c | 1 + kernel/module.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/latency.c b/kernel/latency.c index 258f2555abbc..e63fcacb61a7 100644 --- a/kernel/latency.c +++ b/kernel/latency.c @@ -36,6 +36,7 @@ #include #include #include +#include #include struct latency_info { diff --git a/kernel/module.c b/kernel/module.c index 45e01cb60101..e2d09d604ca0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -34,10 +34,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include @@ -790,6 +790,19 @@ static struct module_attribute refcnt = { .show = show_refcnt, }; +void module_put(struct module *module) +{ + if (module) { + unsigned int cpu = get_cpu(); + local_dec(&module->ref[cpu].count); + /* Maybe they're waiting for us to drop reference? */ + if (unlikely(!module_is_live(module))) + wake_up_process(module->waiter); + put_cpu(); + } +} +EXPORT_SYMBOL(module_put); + #else /* !CONFIG_MODULE_UNLOAD */ static void print_unload_info(struct seq_file *m, struct module *mod) { -- cgit v1.2.3 From a1f8e7f7fb9d7e2cbcb53170edca7c0ac4680697 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 19 Oct 2006 16:08:53 -0400 Subject: [PATCH] severing skbuff.h -> highmem.h Signed-off-by: Al Viro --- kernel/auditsc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 42f2f1179711..ab97e5101232 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include "audit.h" -- cgit v1.2.3 From f95d47caae5302a63d92be9a0292abc90e2a14e1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Use %gs as the PDA base-segment in the kernel This patch is the meat of the PDA change. This patch makes several related changes: 1: Most significantly, %gs is now used in the kernel. This means that on entry, the old value of %gs is saved away, and it is reloaded with __KERNEL_PDA. 2: entry.S constructs the stack in the shape of struct pt_regs, and this is passed around the kernel so that the process's saved register state can be accessed. Unfortunately struct pt_regs doesn't currently have space for %gs (or %fs). This patch extends pt_regs to add space for gs (no space is allocated for %fs, since it won't be used, and it would just complicate the code in entry.S to work around the space). 3: Because %gs is now saved on the stack like %ds, %es and the integer registers, there are a number of places where it no longer needs to be handled specially; namely context switch, and saving/restoring the register state in a signal context. 4: And since kernel threads run in kernel space and call normal kernel code, they need to be created with their %gs == __KERNEL_PDA. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8cdd3e72ba55..fd22245e3881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1303,7 +1303,7 @@ fork_out: return ERR_PTR(retval); } -struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) +noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); return regs; -- cgit v1.2.3 From 0741f4d207a644482d7a040f05cd264c98cf7ee8 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] x86: add sysctl for kstack_depth_to_print Add sysctl for kstack_depth_to_print. This lets users change the amount of raw stack data printed in dump_stack() without having to reboot. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09e569f4792b..6fc5e17086f4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, #ifdef CONFIG_X86 #include +#include #endif #if defined(CONFIG_SYSCTL) @@ -707,6 +708,14 @@ static ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "kstack_depth_to_print", + .data = &kstack_depth_to_print, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif #if defined(CONFIG_MMU) { -- cgit v1.2.3 From e2124bb8d369a4bc1afde1959040e33d71c41d5e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:12 +0100 Subject: [PATCH] unwinder: Use probe_kernel_address instead of __get_user in kernel/unwind.c This avoids trouble with the page fault handler if the fault happens inside an interrupt context. Suggested by Linus Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- kernel/unwind.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index ed0a21d4a902..af48168a3afb 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -550,7 +551,7 @@ static unsigned long read_pointer(const u8 **pLoc, return 0; } if ((ptrType & DW_EH_PE_indirect) - && __get_user(value, (unsigned long *)value)) + && probe_kernel_address((unsigned long *)value, value)) return 0; *pLoc = ptr.p8; @@ -982,18 +983,19 @@ int unwind(struct unwind_frame_info *frame) & (sizeof(unsigned long) - 1))) { unsigned long link; - if (!__get_user(link, + if (!probe_kernel_address( (unsigned long *)(UNW_FP(frame) - + FRAME_LINK_OFFSET)) + + FRAME_LINK_OFFSET), + link) # if FRAME_RETADDR_OFFSET < 0 && link > bottom && link < UNW_FP(frame) # else && link > UNW_FP(frame) && link < bottom # endif && !(link & (sizeof(link) - 1)) - && !__get_user(UNW_PC(frame), + && !probe_kernel_address( (unsigned long *)(UNW_FP(frame) - + FRAME_RETADDR_OFFSET))) { + + FRAME_RETADDR_OFFSET), UNW_PC(frame))) { UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET # if FRAME_RETADDR_OFFSET < 0 - @@ -1104,7 +1106,7 @@ int unwind(struct unwind_frame_info *frame) return -EIO; switch(reg_info[i].width) { #define CASE(n) case sizeof(u##n): \ - __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ + probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \ break CASES; #undef CASE -- cgit v1.2.3 From eef5e0d185fc049bda11fa14ba286fbd357da896 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:12 +0100 Subject: [PATCH] unwinder: Remove lockdep disabling of nested locks for unwinder Shouldn't be needed anymore since __kernel_text_address is used unconditionally on x86-64 Signed-off-by: Andi Kleen --- kernel/lockdep.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c9fefdb1a7db..9bb8d784eb02 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -228,11 +228,7 @@ static int save_trace(struct stack_trace *trace) trace->skip = 3; trace->all_contexts = 0; - /* Make sure to not recurse in case the the unwinder needs to tak -e locks. */ - lockdep_off(); save_stack_trace(trace, NULL); - lockdep_on(); trace->max_entries = trace->nr_entries; -- cgit v1.2.3 From 359ad0d4015a9ab39243f2ebc4eb07915bd618b2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] unwinder: more sanity checks in Dwarf2 unwinder Tighten the requirements on both input to and output from the Dwarf2 unwinder. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- kernel/unwind.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index af48168a3afb..7e721f104105 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -95,6 +95,7 @@ static const struct { typedef unsigned long uleb128_t; typedef signed long sleb128_t; +#define sleb128abs __builtin_labs static struct unwind_table { struct { @@ -787,7 +788,7 @@ int unwind(struct unwind_frame_info *frame) #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) const u32 *fde = NULL, *cie = NULL; const u8 *ptr = NULL, *end = NULL; - unsigned long pc = UNW_PC(frame) - frame->call_frame; + unsigned long pc = UNW_PC(frame) - frame->call_frame, sp; unsigned long startLoc = 0, endLoc = 0, cfa; unsigned i; signed ptrType = -1; @@ -936,6 +937,9 @@ int unwind(struct unwind_frame_info *frame) state.dataAlign = get_sleb128(&ptr, end); if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) cie = NULL; + else if (UNW_PC(frame) % state.codeAlign + || UNW_SP(frame) % sleb128abs(state.dataAlign)) + return -EPERM; else { retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); /* skip augmentation */ @@ -968,6 +972,8 @@ int unwind(struct unwind_frame_info *frame) #ifdef CONFIG_FRAME_POINTER unsigned long top, bottom; + if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long)) + return -EPERM; top = STACK_TOP(frame->task); bottom = STACK_BOTTOM(frame->task); # if FRAME_RETADDR_OFFSET < 0 @@ -1018,6 +1024,7 @@ int unwind(struct unwind_frame_info *frame) || state.regs[retAddrReg].where == Nowhere || state.cfa.reg >= ARRAY_SIZE(reg_info) || reg_info[state.cfa.reg].width != sizeof(unsigned long) + || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long) || state.cfa.offs % sizeof(unsigned long)) return -EIO; /* update frame */ @@ -1038,6 +1045,8 @@ int unwind(struct unwind_frame_info *frame) #else # define CASES CASE(8); CASE(16); CASE(32); CASE(64) #endif + pc = UNW_PC(frame); + sp = UNW_SP(frame); for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { if (REG_INVALID(i)) { if (state.regs[i].where == Nowhere) @@ -1118,6 +1127,11 @@ int unwind(struct unwind_frame_info *frame) } } + if (UNW_PC(frame) % state.codeAlign + || UNW_SP(frame) % sleb128abs(state.dataAlign) + || (pc == UNW_PC(frame) && sp == UNW_SP(frame))) + return -EIO; + return 0; #undef CASES #undef FRAME_REG -- cgit v1.2.3 From 6d0185ea611276fdf81991d7774d396bdc1ae392 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] unwinder: Add debugging output to the Dwarf2 unwinder Add debugging printks to the unwinder to allow easier debugging when something goes wrong with it. This can be controlled with the new unwinder_debug=N option Most output is given by N=1 AK: Added documentation of unwinder_debug= Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- kernel/unwind.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index 7e721f104105..209e248517db 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -137,6 +137,17 @@ struct unwind_state { static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; +static unsigned unwind_debug; +static int __init unwind_debug_setup(char *s) +{ + unwind_debug = simple_strtoul(s, NULL, 0); + return 1; +} +__setup("unwind_debug=", unwind_debug_setup); +#define dprintk(lvl, fmt, args...) \ + ((void)(lvl > unwind_debug \ + || printk(KERN_DEBUG "unwind: " fmt "\n", ##args))) + static struct unwind_table *find_table(unsigned long pc) { struct unwind_table *table; @@ -281,6 +292,7 @@ static void __init setup_unwind_table(struct unwind_table *table, hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) + 2 * n * sizeof(unsigned long); + dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize); header = alloc(hdrSize); if (!header) return; @@ -500,13 +512,17 @@ static unsigned long read_pointer(const u8 **pLoc, const unsigned long *pul; } ptr; - if (ptrType < 0 || ptrType == DW_EH_PE_omit) + if (ptrType < 0 || ptrType == DW_EH_PE_omit) { + dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end); return 0; + } ptr.p8 = *pLoc; switch(ptrType & DW_EH_PE_FORM) { case DW_EH_PE_data2: - if (end < (const void *)(ptr.p16u + 1)) + if (end < (const void *)(ptr.p16u + 1)) { + dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end); return 0; + } if(ptrType & DW_EH_PE_signed) value = get_unaligned(ptr.p16s++); else @@ -514,8 +530,10 @@ static unsigned long read_pointer(const u8 **pLoc, break; case DW_EH_PE_data4: #ifdef CONFIG_64BIT - if (end < (const void *)(ptr.p32u + 1)) + if (end < (const void *)(ptr.p32u + 1)) { + dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end); return 0; + } if(ptrType & DW_EH_PE_signed) value = get_unaligned(ptr.p32s++); else @@ -527,8 +545,10 @@ static unsigned long read_pointer(const u8 **pLoc, BUILD_BUG_ON(sizeof(u32) != sizeof(value)); #endif case DW_EH_PE_native: - if (end < (const void *)(ptr.pul + 1)) + if (end < (const void *)(ptr.pul + 1)) { + dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end); return 0; + } value = get_unaligned(ptr.pul++); break; case DW_EH_PE_leb128: @@ -536,10 +556,14 @@ static unsigned long read_pointer(const u8 **pLoc, value = ptrType & DW_EH_PE_signed ? get_sleb128(&ptr.p8, end) : get_uleb128(&ptr.p8, end); - if ((const void *)ptr.p8 > end) + if ((const void *)ptr.p8 > end) { + dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end); return 0; + } break; default: + dprintk(2, "Cannot decode pointer type %02X (%p,%p).", + ptrType, ptr.p8, end); return 0; } switch(ptrType & DW_EH_PE_ADJUST) { @@ -549,11 +573,16 @@ static unsigned long read_pointer(const u8 **pLoc, value += (unsigned long)*pLoc; break; default: + dprintk(2, "Cannot adjust pointer type %02X (%p,%p).", + ptrType, *pLoc, end); return 0; } if ((ptrType & DW_EH_PE_indirect) - && probe_kernel_address((unsigned long *)value, value)) + && probe_kernel_address((unsigned long *)value, value)) { + dprintk(1, "Cannot read indirect value %lx (%p,%p).", + value, *pLoc, end); return 0; + } *pLoc = ptr.p8; return value; @@ -702,8 +731,10 @@ static int processCFI(const u8 *start, state->label = NULL; return 1; } - if (state->stackDepth >= MAX_STACK_DEPTH) + if (state->stackDepth >= MAX_STACK_DEPTH) { + dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end); return 0; + } state->stack[state->stackDepth++] = ptr.p8; break; case DW_CFA_restore_state: @@ -718,8 +749,10 @@ static int processCFI(const u8 *start, result = processCFI(start, end, 0, ptrType, state); state->loc = loc; state->label = label; - } else + } else { + dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end); return 0; + } break; case DW_CFA_def_cfa: state->cfa.reg = get_uleb128(&ptr.p8, end); @@ -751,6 +784,7 @@ static int processCFI(const u8 *start, break; case DW_CFA_GNU_window_save: default: + dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end); result = 0; break; } @@ -766,12 +800,17 @@ static int processCFI(const u8 *start, set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); break; } - if (ptr.p8 > end) + if (ptr.p8 > end) { + dprintk(1, "Data overrun (%p,%p).", ptr.p8, end); result = 0; + } if (result && targetLoc != 0 && targetLoc < state->loc) return 1; } + if (result && ptr.p8 < end) + dprintk(1, "Data underrun (%p,%p).", ptr.p8, end); + return result && ptr.p8 == end && (targetLoc == 0 @@ -843,6 +882,8 @@ int unwind(struct unwind_frame_info *frame) hdr[3]); } } + if(hdr && !fde) + dprintk(3, "Binary lookup for %lx failed.", pc); if (fde != NULL) { cie = cie_for_fde(fde, table); @@ -864,6 +905,8 @@ int unwind(struct unwind_frame_info *frame) fde = NULL; } else fde = NULL; + if(!fde) + dprintk(1, "Binary lookup result for %lx discarded.", pc); } if (fde == NULL) { for (fde = table->address, tableSize = table->size; @@ -895,6 +938,8 @@ int unwind(struct unwind_frame_info *frame) if (pc >= startLoc && pc < endLoc) break; } + if(!fde) + dprintk(3, "Linear lookup for %lx failed.", pc); } } if (cie != NULL) { @@ -928,6 +973,8 @@ int unwind(struct unwind_frame_info *frame) if (ptr >= end || *ptr) cie = NULL; } + if(!cie) + dprintk(1, "CIE unusable (%p,%p).", ptr, end); ++ptr; } if (cie != NULL) { @@ -938,9 +985,11 @@ int unwind(struct unwind_frame_info *frame) if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) cie = NULL; else if (UNW_PC(frame) % state.codeAlign - || UNW_SP(frame) % sleb128abs(state.dataAlign)) + || UNW_SP(frame) % sleb128abs(state.dataAlign)) { + dprintk(1, "Input pointer(s) misaligned (%lx,%lx).", + UNW_PC(frame), UNW_SP(frame)); return -EPERM; - else { + } else { retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); /* skip augmentation */ if (((const char *)(cie + 2))[1] == 'z') { @@ -954,6 +1003,8 @@ int unwind(struct unwind_frame_info *frame) || reg_info[retAddrReg].width != sizeof(unsigned long)) cie = NULL; } + if(!cie) + dprintk(1, "CIE validation failed (%p,%p).", ptr, end); } if (cie != NULL) { state.cieStart = ptr; @@ -967,6 +1018,8 @@ int unwind(struct unwind_frame_info *frame) if ((ptr += augSize) > end) fde = NULL; } + if(!fde) + dprintk(1, "FDE validation failed (%p,%p).", ptr, end); } if (cie == NULL || fde == NULL) { #ifdef CONFIG_FRAME_POINTER @@ -1025,8 +1078,10 @@ int unwind(struct unwind_frame_info *frame) || state.cfa.reg >= ARRAY_SIZE(reg_info) || reg_info[state.cfa.reg].width != sizeof(unsigned long) || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long) - || state.cfa.offs % sizeof(unsigned long)) + || state.cfa.offs % sizeof(unsigned long)) { + dprintk(1, "Unusable unwind info (%p,%p).", ptr, end); return -EIO; + } /* update frame */ #ifndef CONFIG_AS_CFI_SIGNAL_FRAME if(frame->call_frame @@ -1051,6 +1106,8 @@ int unwind(struct unwind_frame_info *frame) if (REG_INVALID(i)) { if (state.regs[i].where == Nowhere) continue; + dprintk(1, "Cannot restore register %u (%d).", + i, state.regs[i].where); return -EIO; } switch(state.regs[i].where) { @@ -1059,8 +1116,11 @@ int unwind(struct unwind_frame_info *frame) case Register: if (state.regs[i].value >= ARRAY_SIZE(reg_info) || REG_INVALID(state.regs[i].value) - || reg_info[i].width > reg_info[state.regs[i].value].width) + || reg_info[i].width > reg_info[state.regs[i].value].width) { + dprintk(1, "Cannot restore register %u from register %lu.", + i, state.regs[i].value); return -EIO; + } switch(reg_info[state.regs[i].value].width) { #define CASE(n) \ case sizeof(u##n): \ @@ -1070,6 +1130,9 @@ int unwind(struct unwind_frame_info *frame) CASES; #undef CASE default: + dprintk(1, "Unsupported register size %u (%lu).", + reg_info[state.regs[i].value].width, + state.regs[i].value); return -EIO; } break; @@ -1094,12 +1157,17 @@ int unwind(struct unwind_frame_info *frame) CASES; #undef CASE default: + dprintk(1, "Unsupported register size %u (%u).", + reg_info[i].width, i); return -EIO; } break; case Value: - if (reg_info[i].width != sizeof(unsigned long)) + if (reg_info[i].width != sizeof(unsigned long)) { + dprintk(1, "Unsupported value size %u (%u).", + reg_info[i].width, i); return -EIO; + } FRAME_REG(i, unsigned long) = cfa + state.regs[i].value * state.dataAlign; break; @@ -1111,8 +1179,11 @@ int unwind(struct unwind_frame_info *frame) % sizeof(unsigned long) || addr < startLoc || addr + sizeof(unsigned long) < addr - || addr + sizeof(unsigned long) > endLoc) + || addr + sizeof(unsigned long) > endLoc) { + dprintk(1, "Bad memory location %lx (%lx).", + addr, state.regs[i].value); return -EIO; + } switch(reg_info[i].width) { #define CASE(n) case sizeof(u##n): \ probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \ @@ -1120,6 +1191,8 @@ int unwind(struct unwind_frame_info *frame) CASES; #undef CASE default: + dprintk(1, "Unsupported memory size %u (%u).", + reg_info[i].width, i); return -EIO; } } @@ -1128,9 +1201,15 @@ int unwind(struct unwind_frame_info *frame) } if (UNW_PC(frame) % state.codeAlign - || UNW_SP(frame) % sleb128abs(state.dataAlign) - || (pc == UNW_PC(frame) && sp == UNW_SP(frame))) + || UNW_SP(frame) % sleb128abs(state.dataAlign)) { + dprintk(1, "Output pointer(s) misaligned (%lx,%lx).", + UNW_PC(frame), UNW_SP(frame)); return -EIO; + } + if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) { + dprintk(1, "No progress (%lx,%lx).", pc, sp); + return -EIO; + } return 0; #undef CASES -- cgit v1.2.3 From c65f38d911aa301cea109d38d40925750dd6c2da Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] unwinder: fully support linker generated .eh_frame_hdr section Now that binutils' ld is able to properly populate .eh_frame_hdr in the Linux kernel case, here's a patch to add some functionality to the Dwarf2 unwinder to actually be able to make use of this (applies on firstfloor tree with the previously sent patch to add debug output, but not on plain 2.6.19). Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- kernel/unwind.c | 66 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index 209e248517db..08645aa7c2d6 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -164,7 +164,9 @@ static struct unwind_table *find_table(unsigned long pc) static unsigned long read_pointer(const u8 **pLoc, const void *end, - signed ptrType); + signed ptrType, + unsigned long text_base, + unsigned long data_base); static void init_unwind_table(struct unwind_table *table, const char *name, @@ -189,10 +191,13 @@ static void init_unwind_table(struct unwind_table *table, /* See if the linker provided table looks valid. */ if (header_size <= 4 || header_start[0] != 1 - || (void *)read_pointer(&ptr, end, header_start[1]) != table_start - || header_start[2] == DW_EH_PE_omit - || read_pointer(&ptr, end, header_start[2]) <= 0 - || header_start[3] == DW_EH_PE_omit) + || (void *)read_pointer(&ptr, end, header_start[1], 0, 0) + != table_start + || !read_pointer(&ptr, end, header_start[2], 0, 0) + || !read_pointer(&ptr, end, header_start[3], 0, + (unsigned long)header_start) + || !read_pointer(&ptr, end, header_start[3], 0, + (unsigned long)header_start)) header_start = NULL; table->hdrsz = header_size; smp_wmb(); @@ -282,7 +287,7 @@ static void __init setup_unwind_table(struct unwind_table *table, ptr = (const u8 *)(fde + 2); if (!read_pointer(&ptr, (const u8 *)(fde + 1) + *fde, - ptrType)) + ptrType, 0, 0)) return; ++n; } @@ -317,7 +322,7 @@ static void __init setup_unwind_table(struct unwind_table *table, ptr = (const u8 *)(fde + 2); header->table[n].start = read_pointer(&ptr, (const u8 *)(fde + 1) + *fde, - fde_pointer_type(cie)); + fde_pointer_type(cie), 0, 0); header->table[n].fde = (unsigned long)fde; ++n; } @@ -500,7 +505,9 @@ static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table) static unsigned long read_pointer(const u8 **pLoc, const void *end, - signed ptrType) + signed ptrType, + unsigned long text_base, + unsigned long data_base) { unsigned long value = 0; union { @@ -572,6 +579,22 @@ static unsigned long read_pointer(const u8 **pLoc, case DW_EH_PE_pcrel: value += (unsigned long)*pLoc; break; + case DW_EH_PE_textrel: + if (likely(text_base)) { + value += text_base; + break; + } + dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.", + ptrType, *pLoc, end); + return 0; + case DW_EH_PE_datarel: + if (likely(data_base)) { + value += data_base; + break; + } + dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.", + ptrType, *pLoc, end); + return 0; default: dprintk(2, "Cannot adjust pointer type %02X (%p,%p).", ptrType, *pLoc, end); @@ -625,7 +648,8 @@ static signed fde_pointer_type(const u32 *cie) case 'P': { signed ptrType = *ptr++; - if (!read_pointer(&ptr, end, ptrType) || ptr > end) + if (!read_pointer(&ptr, end, ptrType, 0, 0) + || ptr > end) return -1; } break; @@ -685,7 +709,8 @@ static int processCFI(const u8 *start, case DW_CFA_nop: break; case DW_CFA_set_loc: - if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) + state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0); + if (state->loc == 0) result = 0; break; case DW_CFA_advance_loc1: @@ -854,9 +879,9 @@ int unwind(struct unwind_frame_info *frame) ptr = hdr + 4; end = hdr + table->hdrsz; if (tableSize - && read_pointer(&ptr, end, hdr[1]) + && read_pointer(&ptr, end, hdr[1], 0, 0) == (unsigned long)table->address - && (i = read_pointer(&ptr, end, hdr[2])) > 0 + && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0 && i == (end - ptr) / (2 * tableSize) && !((end - ptr) % (2 * tableSize))) { do { @@ -864,7 +889,8 @@ int unwind(struct unwind_frame_info *frame) startLoc = read_pointer(&cur, cur + tableSize, - hdr[3]); + hdr[3], 0, + (unsigned long)hdr); if (pc < startLoc) i /= 2; else { @@ -875,11 +901,13 @@ int unwind(struct unwind_frame_info *frame) if (i == 1 && (startLoc = read_pointer(&ptr, ptr + tableSize, - hdr[3])) != 0 + hdr[3], 0, + (unsigned long)hdr)) != 0 && pc >= startLoc) fde = (void *)read_pointer(&ptr, ptr + tableSize, - hdr[3]); + hdr[3], 0, + (unsigned long)hdr); } } if(hdr && !fde) @@ -894,13 +922,13 @@ int unwind(struct unwind_frame_info *frame) && (ptrType = fde_pointer_type(cie)) >= 0 && read_pointer(&ptr, (const u8 *)(fde + 1) + *fde, - ptrType) == startLoc) { + ptrType, 0, 0) == startLoc) { if (!(ptrType & DW_EH_PE_indirect)) ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; endLoc = startLoc + read_pointer(&ptr, (const u8 *)(fde + 1) + *fde, - ptrType); + ptrType, 0, 0); if(pc >= endLoc) fde = NULL; } else @@ -926,7 +954,7 @@ int unwind(struct unwind_frame_info *frame) ptr = (const u8 *)(fde + 2); startLoc = read_pointer(&ptr, (const u8 *)(fde + 1) + *fde, - ptrType); + ptrType, 0, 0); if (!startLoc) continue; if (!(ptrType & DW_EH_PE_indirect)) @@ -934,7 +962,7 @@ int unwind(struct unwind_frame_info *frame) endLoc = startLoc + read_pointer(&ptr, (const u8 *)(fde + 1) + *fde, - ptrType); + ptrType, 0, 0); if (pc >= startLoc && pc < endLoc) break; } -- cgit v1.2.3 From b65780e123ba9b762276482bbfb52836e4d41fd9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] unwinder: move .eh_frame to RODATA The .eh_frame section contents is never written to, so it can as well benefit from CONFIG_DEBUG_RODATA. Diff-ed against firstfloor tree. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- kernel/unwind.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index 08645aa7c2d6..09c261329249 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -19,7 +19,7 @@ #include #include -extern char __start_unwind[], __end_unwind[]; +extern const char __start_unwind[], __end_unwind[]; extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; #define MAX_STACK_DEPTH 8 -- cgit v1.2.3 From 161a09e737f0761ca064ee6a907313402f7a54b6 Mon Sep 17 00:00:00 2001 From: Joy Latten Date: Mon, 27 Nov 2006 13:11:54 -0600 Subject: audit: Add auditing to ipsec An audit message occurs when an ipsec SA or ipsec policy is created/deleted. Signed-off-by: Joy Latten Signed-off-by: James Morris Signed-off-by: David S. Miller --- kernel/auditsc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ab97e5101232..40722e26de98 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -731,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context) printk(KERN_ERR "audit: freed %d contexts\n", count); } -static void audit_log_task_context(struct audit_buffer *ab) +void audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; ssize_t len = 0; @@ -760,6 +760,8 @@ error_path: return; } +EXPORT_SYMBOL(audit_log_task_context); + static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { char name[sizeof(tsk->comm)]; @@ -1488,6 +1490,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx) return ctx ? ctx->loginuid : -1; } +EXPORT_SYMBOL(audit_get_loginuid); + /** * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag -- cgit v1.2.3 From 7602bdf2fd14a40dd9b104e516fdc05e1bd17952 Mon Sep 17 00:00:00 2001 From: Ashwin Chaugule Date: Wed, 6 Dec 2006 20:31:57 -0800 Subject: [PATCH] new scheme to preempt swap token The new swap token patches replace the current token traversal algo. The old algo had a crude timeout parameter that was used to handover the token from one task to another. This algo, transfers the token to the tasks that are in need of the token. The urgency for the token is based on the number of times a task is required to swap-in pages. Accordingly, the priority of a task is incremented if it has been badly affected due to swap-outs. To ensure that the token doesnt bounce around rapidly, the token holders are given a priority boost. The priority of tasks is also decremented, if their rate of swap-in's keeps reducing. This way, the condition to check whether to pre-empt the swap token, is a matter of comparing two task's priority fields. [akpm@osdl.org: cleanups] Signed-off-by: Ashwin Chaugule Cc: Rik van Riel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 8 ++++++++ kernel/sysctl.c | 11 ----------- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8cdd3e72ba55..5678e6c61ef2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -479,6 +479,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); + /* Initializing for Swap token stuff */ + mm->token_priority = 0; + mm->last_interval = 0; + if (!mm_init(mm)) goto fail_nomem; @@ -542,6 +546,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) goto fail_nomem; good_mm: + /* Initializing for Swap token stuff */ + mm->token_priority = 0; + mm->last_interval = 0; + tsk->mm = mm; tsk->active_mm = mm; return 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09e569f4792b..7abe9704e75a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -977,17 +977,6 @@ static ctl_table vm_table[] = { .extra1 = &zero, }, #endif -#ifdef CONFIG_SWAP - { - .ctl_name = VM_SWAP_TOKEN_TIMEOUT, - .procname = "swap_token_timeout", - .data = &swap_token_default_timeout, - .maxlen = sizeof(swap_token_default_timeout), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, -#endif #ifdef CONFIG_NUMA { .ctl_name = VM_ZONE_RECLAIM_MODE, -- cgit v1.2.3 From a866374aecc90c7d90619727ccd851ac096b2fc7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Dec 2006 20:32:20 -0800 Subject: [PATCH] mm: pagefault_{disable,enable}() Introduce pagefault_{disable,enable}() and use these where previously we did manual preempt increments/decrements to make the pagefault handler do the atomic thing. Currently they still rely on the increased preempt count, but do not rely on the disabled preemption, this might go away in the future. (NOTE: the extra barrier() in pagefault_disable might fix some holes on machines which have too many registers for their own good) [heiko.carstens@de.ibm.com: s390 fix] Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 93ef30ba209f..af7b81cbde30 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from) { int ret; - inc_preempt_count(); + pagefault_disable(); ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); - dec_preempt_count(); + pagefault_enable(); return ret ? -EFAULT : 0; } @@ -585,9 +585,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!(uval & FUTEX_OWNER_DIED)) { newval = FUTEX_WAITERS | new_owner->pid; - inc_preempt_count(); + pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + pagefault_enable(); if (curval == -EFAULT) return -EFAULT; if (curval != uval) @@ -618,9 +618,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. We are the owner: */ - inc_preempt_count(); + pagefault_disable(); oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); - dec_preempt_count(); + pagefault_enable(); if (oldval == -EFAULT) return oldval; @@ -1158,9 +1158,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, */ newval = current->pid; - inc_preempt_count(); + pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); - dec_preempt_count(); + pagefault_enable(); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1183,9 +1183,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, uval = curval; newval = uval | FUTEX_WAITERS; - inc_preempt_count(); + pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + pagefault_enable(); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1215,10 +1215,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, newval = current->pid | FUTEX_OWNER_DIED | FUTEX_WAITERS; - inc_preempt_count(); + pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + pagefault_enable(); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1390,9 +1390,9 @@ retry_locked: * anyone else up: */ if (!(uval & FUTEX_OWNER_DIED)) { - inc_preempt_count(); + pagefault_disable(); uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - dec_preempt_count(); + pagefault_enable(); } if (unlikely(uval == -EFAULT)) -- cgit v1.2.3 From e94b1766097d53e6f3ccfb36c8baa562ffeda3fc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Dec 2006 20:33:17 -0800 Subject: [PATCH] slab: remove SLAB_KERNEL SLAB_KERNEL is an alias of GFP_KERNEL. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 2 +- kernel/fork.c | 6 +++--- kernel/taskstats.c | 2 +- kernel/user.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 66a0ea48751d..70e9ec603082 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -41,7 +41,7 @@ void delayacct_init(void) void __delayacct_tsk_init(struct task_struct *tsk) { - tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); + tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); if (tsk->delays) spin_lock_init(&tsk->delays->lock); } diff --git a/kernel/fork.c b/kernel/fork.c index 5678e6c61ef2..711aa5f10da7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -237,7 +237,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; @@ -319,7 +319,7 @@ static inline void mm_free_pgd(struct mm_struct * mm) __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) #include @@ -621,7 +621,7 @@ static struct files_struct *alloc_files(void) struct files_struct *newf; struct fdtable *fdt; - newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) goto out; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d3d28919d4b4..1b2b326cf703 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -425,7 +425,7 @@ void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) *mycpu = raw_smp_processor_id(); *ptidstats = NULL; - tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + tmp = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); if (!tmp) return; diff --git a/kernel/user.c b/kernel/user.c index 220e586127a0..c1f93c164c93 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid) if (!up) { struct user_struct *new; - new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); + new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; -- cgit v1.2.3 From e18b890bb0881bbab6f4f1a6cd20d9c60d66b003 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Dec 2006 20:33:20 -0800 Subject: [PATCH] slab: remove kmem_cache_t Replace all uses of kmem_cache_t with struct kmem_cache. The patch was generated using the following script: #!/bin/sh # # Replace one string by another in all the kernel sources. # set -e for file in `find * -name "*.c" -o -name "*.h"|xargs grep -l $1`; do quilt add $file sed -e "1,\$s/$1/$2/g" $file >/tmp/$$ mv /tmp/$$ $file quilt refresh done The script was run like this sh replace kmem_cache_t "struct kmem_cache" Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 2 +- kernel/fork.c | 16 ++++++++-------- kernel/pid.c | 2 +- kernel/posix-timers.c | 2 +- kernel/signal.c | 2 +- kernel/taskstats.c | 2 +- kernel/user.c | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 70e9ec603082..766d5912b26a 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -20,7 +20,7 @@ #include int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ -kmem_cache_t *delayacct_cache; +struct kmem_cache *delayacct_cache; static int __init delayacct_setup_disable(char *str) { diff --git a/kernel/fork.c b/kernel/fork.c index 711aa5f10da7..2cf74edd3295 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -82,26 +82,26 @@ int nr_processes(void) #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) -static kmem_cache_t *task_struct_cachep; +static struct kmem_cache *task_struct_cachep; #endif /* SLAB cache for signal_struct structures (tsk->signal) */ -static kmem_cache_t *signal_cachep; +static struct kmem_cache *signal_cachep; /* SLAB cache for sighand_struct structures (tsk->sighand) */ -kmem_cache_t *sighand_cachep; +struct kmem_cache *sighand_cachep; /* SLAB cache for files_struct structures (tsk->files) */ -kmem_cache_t *files_cachep; +struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ -kmem_cache_t *fs_cachep; +struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ -kmem_cache_t *vm_area_cachep; +struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ -static kmem_cache_t *mm_cachep; +static struct kmem_cache *mm_cachep; void free_task(struct task_struct *tsk) { @@ -1421,7 +1421,7 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags) { struct sighand_struct *sighand = data; diff --git a/kernel/pid.c b/kernel/pid.c index b914392085f9..a48879b0b921 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -31,7 +31,7 @@ #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; static int pidhash_shift; -static kmem_cache_t *pid_cachep; +static struct kmem_cache *pid_cachep; int pid_max = PID_MAX_DEFAULT; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9cbb5d1be06f..5fe87de10ff0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -70,7 +70,7 @@ /* * Lets keep our timers in a slab cache :-) */ -static kmem_cache_t *posix_timers_cache; +static struct kmem_cache *posix_timers_cache; static struct idr posix_timers_id; static DEFINE_SPINLOCK(idr_lock); diff --git a/kernel/signal.c b/kernel/signal.c index df18c167a2a7..8e19d2785486 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -33,7 +33,7 @@ * SLAB caches for signal bits. */ -static kmem_cache_t *sigqueue_cachep; +static struct kmem_cache *sigqueue_cachep; /* * In POSIX a signal is sent either to a specific thread (Linux task) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 1b2b326cf703..f5f92014ae98 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -34,7 +34,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; -kmem_cache_t *taskstats_cache; +struct kmem_cache *taskstats_cache; static struct genl_family family = { .id = GENL_ID_GENERATE, diff --git a/kernel/user.c b/kernel/user.c index c1f93c164c93..4869563080e9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -26,7 +26,7 @@ #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) -static kmem_cache_t *uid_cachep; +static struct kmem_cache *uid_cachep; static struct list_head uidhash_table[UIDHASH_SZ]; /* -- cgit v1.2.3 From 3592695c363c3f3119621bdcf5ed852d6b9d1a5c Mon Sep 17 00:00:00 2001 From: Stefan Seyfried Date: Wed, 6 Dec 2006 20:34:06 -0800 Subject: [PATCH] uswsusp: add pmops->{prepare,enter,finish} support (aka "platform mode") Add an ioctl to the userspace swsusp code that enables the usage of the pmops->prepare, pmops->enter and pmops->finish methods (the in-kernel suspend knows these as "platform method"). These are needed on many machines to (among others) speed up resuming by letting the BIOS skip some steps or let my hp nx5000 recognise the correct ac_adapter state after resume again. It also ensures on many machines, that changed hardware (unplugged AC adapters) gets correctly detected and that kacpid does not run wild after resume. Signed-off-by: Stefan Seyfried Cc: "Rafael J. Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 7 ++++++- kernel/power/user.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index bfe999f7b272..87ecb1856ee8 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -117,7 +117,12 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle); #define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) -#define SNAPSHOT_IOC_MAXNR 11 +#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) +#define SNAPSHOT_IOC_MAXNR 12 + +#define PMOPS_PREPARE 1 +#define PMOPS_ENTER 2 +#define PMOPS_FINISH 3 /** * The bitmap is used for tracing allocated swap pages diff --git a/kernel/power/user.c b/kernel/power/user.c index d991d3b0e5a4..4c24ca5d62e3 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -313,6 +314,33 @@ OutS3: up(&pm_sem); break; + case SNAPSHOT_PMOPS: + switch (arg) { + + case PMOPS_PREPARE: + if (pm_ops->prepare) { + error = pm_ops->prepare(PM_SUSPEND_DISK); + } + break; + + case PMOPS_ENTER: + kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); + error = pm_ops->enter(PM_SUSPEND_DISK); + break; + + case PMOPS_FINISH: + if (pm_ops && pm_ops->finish) { + pm_ops->finish(PM_SUSPEND_DISK); + } + break; + + default: + printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); + error = -EINVAL; + + } + break; + default: error = -ENOTTY; -- cgit v1.2.3 From 915bae9ebe41e52d71ad8b06d50e4ab26189f964 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:07 -0800 Subject: [PATCH] swsusp: use partition device and offset to identify swap areas The Linux kernel handles swap files almost in the same way as it handles swap partitions and there are only two differences between these two types of swap areas: (1) swap files need not be contiguous, (2) the header of a swap file is not in the first block of the partition that holds it. From the swsusp's point of view (1) is not a problem, because it is already taken care of by the swap-handling code, but (2) has to be taken into consideration. In principle the location of a swap file's header may be determined with the help of appropriate filesystem driver. Unfortunately, however, it requires the filesystem holding the swap file to be mounted, and if this filesystem is journaled, it cannot be mounted during a resume from disk. For this reason we need some other means by which swap areas can be identified. For example, to identify a swap area we can use the partition that holds the area and the offset from the beginning of this partition at which the swap header is located. The following patch allows swsusp to identify swap areas this way. It changes swap_type_of() so that it takes an additional argument representing an offset of the swap header within the partition represented by its first argument. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 2 +- kernel/power/user.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 1a3b0dd2c3fc..7b10da16389e 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -74,7 +74,7 @@ static int mark_swapfiles(swp_entry_t start) static int swsusp_swap_check(void) /* This is called before saving image */ { - int res = swap_type_of(swsusp_resume_device); + int res = swap_type_of(swsusp_resume_device, 0); if (res >= 0) { root_swap = res; diff --git a/kernel/power/user.c b/kernel/power/user.c index 4c24ca5d62e3..a327b18a5ffd 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -55,7 +55,8 @@ static int snapshot_open(struct inode *inode, struct file *filp) filp->private_data = data; memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { - data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; + data->swap = swsusp_resume_device ? + swap_type_of(swsusp_resume_device, 0) : -1; data->mode = O_RDONLY; } else { data->swap = -1; @@ -265,7 +266,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, * so we need to recode them */ if (old_decode_dev(arg)) { - data->swap = swap_type_of(old_decode_dev(arg)); + data->swap = swap_type_of(old_decode_dev(arg), 0); if (data->swap < 0) error = -ENODEV; } else { -- cgit v1.2.3 From 3fc6b34f4803b959c1e30c15247e2180cd529115 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:09 -0800 Subject: [PATCH] swsusp: rearrange swap-handling code Rearrange the code in kernel/power/swap.c so that the next patch is more readable. [This patch only moves the existing code.] Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 221 ++++++++++++++++++++++++++-------------------------- 1 file changed, 112 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7b10da16389e..7768c2ba7550 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -41,10 +41,121 @@ static struct swsusp_header { } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; /* - * Saving part... + * General things */ static unsigned short root_swap = 0xffff; +static struct block_device *resume_bdev; + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * @bio_chain: list of pending biod (for async reading) + * + * Straight from the textbook - allocate and initialize the bio. + * If we're reading, make sure the page is marked as dirty. + * Then submit it and, if @bio_chain == NULL, wait. + */ +static int submit(int rw, pgoff_t page_off, struct page *page, + struct bio **bio_chain) +{ + struct bio *bio; + + bio = bio_alloc(GFP_ATOMIC, 1); + if (!bio) + return -ENOMEM; + bio->bi_sector = page_off * (PAGE_SIZE >> 9); + bio->bi_bdev = resume_bdev; + bio->bi_end_io = end_swap_bio_read; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); + bio_put(bio); + return -EFAULT; + } + + lock_page(page); + bio_get(bio); + + if (bio_chain == NULL) { + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + wait_on_page_locked(page); + if (rw == READ) + bio_set_pages_dirty(bio); + bio_put(bio); + } else { + if (rw == READ) + get_page(page); /* These pages are freed later */ + bio->bi_private = *bio_chain; + *bio_chain = bio; + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + } + return 0; +} + +static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(READ, page_off, virt_to_page(addr), bio_chain); +} + +static int bio_write_page(pgoff_t page_off, void *addr) +{ + return submit(WRITE, page_off, virt_to_page(addr), NULL); +} + +static int wait_on_bio_chain(struct bio **bio_chain) +{ + struct bio *bio; + struct bio *next_bio; + int ret = 0; + + if (bio_chain == NULL) + return 0; + + bio = *bio_chain; + if (bio == NULL) + return 0; + while (bio) { + struct page *page; + + next_bio = bio->bi_private; + page = bio->bi_io_vec[0].bv_page; + wait_on_page_locked(page); + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; + put_page(page); + bio_put(bio); + bio = next_bio; + } + *bio_chain = NULL; + return ret; +} + +static void show_speed(struct timeval *start, struct timeval *stop, + unsigned nr_pages, char *msg) +{ + s64 elapsed_centisecs64; + int centisecs; + int k; + int kps; + + elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + centisecs = elapsed_centisecs64; + if (centisecs == 0) + centisecs = 1; /* avoid div-by-zero */ + k = nr_pages * (PAGE_SIZE / 1024); + kps = (k * 100) / centisecs; + printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, + centisecs / 100, centisecs % 100, + kps / 1000, (kps % 1000) / 10); +} + +/* + * Saving part + */ static int mark_swapfiles(swp_entry_t start) { @@ -166,26 +277,6 @@ static void release_swap_writer(struct swap_map_handle *handle) handle->bitmap = NULL; } -static void show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) -{ - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; - - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); - centisecs = elapsed_centisecs64; - if (centisecs == 0) - centisecs = 1; /* avoid div-by-zero */ - k = nr_pages * (PAGE_SIZE / 1024); - kps = (k * 100) / centisecs; - printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); -} - static int get_swap_writer(struct swap_map_handle *handle) { handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); @@ -205,34 +296,6 @@ static int get_swap_writer(struct swap_map_handle *handle) return 0; } -static int wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} - static int swap_write_page(struct swap_map_handle *handle, void *buf, struct bio **bio_chain) { @@ -384,66 +447,6 @@ int swsusp_write(void) return error; } -static struct block_device *resume_bdev; - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, pgoff_t page_off, struct page *page, - struct bio **bio_chain) -{ - struct bio *bio; - - bio = bio_alloc(GFP_ATOMIC, 1); - if (!bio) - return -ENOMEM; - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - } - return 0; -} - -static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, page_off, virt_to_page(addr), bio_chain); -} - -static int bio_write_page(pgoff_t page_off, void *addr) -{ - return submit(WRITE, page_off, virt_to_page(addr), NULL); -} - /** * The following functions allow us to read data using a swap map * in a file-alike way -- cgit v1.2.3 From 3aef83e0ef1ffb8ea3bea97be46821a45c952173 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:10 -0800 Subject: [PATCH] swsusp: use block device offsets to identify swap locations Make swsusp use block device offsets instead of swap offsets to identify swap locations and make it use the same code paths for writing as well as for reading data. This allows us to use the same code for handling swap files and swap partitions and to simplify the code, eg. by dropping rw_swap_page_sync(). Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 2 +- kernel/power/swap.c | 133 ++++++++++++++++++++++++++------------------------ kernel/power/swsusp.c | 10 ++-- kernel/power/user.c | 7 +-- 4 files changed, 80 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 87ecb1856ee8..210ebba26020 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -146,7 +146,7 @@ struct bitmap_page { extern void free_bitmap(struct bitmap_page *bitmap); extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); -extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); +extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap); extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); extern int swsusp_check(void); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7768c2ba7550..1b08f46bbb7e 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -34,8 +34,8 @@ extern char resume_file[]; #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; - swp_entry_t image; + char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; + sector_t image; char orig_sig[10]; char sig[10]; } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; @@ -100,9 +100,9 @@ static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) return submit(READ, page_off, virt_to_page(addr), bio_chain); } -static int bio_write_page(pgoff_t page_off, void *addr) +static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) { - return submit(WRITE, page_off, virt_to_page(addr), NULL); + return submit(WRITE, page_off, virt_to_page(addr), bio_chain); } static int wait_on_bio_chain(struct bio **bio_chain) @@ -157,22 +157,19 @@ static void show_speed(struct timeval *start, struct timeval *stop, * Saving part */ -static int mark_swapfiles(swp_entry_t start) +static int mark_swapfiles(sector_t start) { int error; - rw_swap_page_sync(READ, swp_entry(root_swap, 0), - virt_to_page((unsigned long)&swsusp_header), NULL); + bio_read_page(0, &swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); swsusp_header.image = start; - error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), - virt_to_page((unsigned long)&swsusp_header), - NULL); + error = bio_write_page(0, &swsusp_header, NULL); } else { - pr_debug("swsusp: Partition is not swap space.\n"); + printk(KERN_ERR "swsusp: Swap header not found!\n"); error = -ENODEV; } return error; @@ -185,12 +182,21 @@ static int mark_swapfiles(swp_entry_t start) static int swsusp_swap_check(void) /* This is called before saving image */ { - int res = swap_type_of(swsusp_resume_device, 0); + int res; + + res = swap_type_of(swsusp_resume_device, 0); + if (res < 0) + return res; + + root_swap = res; + resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE); + if (IS_ERR(resume_bdev)) + return PTR_ERR(resume_bdev); + + res = set_blocksize(resume_bdev, PAGE_SIZE); + if (res < 0) + blkdev_put(resume_bdev); - if (res >= 0) { - root_swap = res; - return 0; - } return res; } @@ -201,36 +207,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */ * @bio_chain: Link the next write BIO here */ -static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) +static int write_page(void *buf, sector_t offset, struct bio **bio_chain) { - swp_entry_t entry; - int error = -ENOSPC; - - if (offset) { - struct page *page = virt_to_page(buf); - - if (bio_chain) { - /* - * Whether or not we successfully allocated a copy page, - * we take a ref on the page here. It gets undone in - * wait_on_bio_chain(). - */ - struct page *page_copy; - page_copy = alloc_page(GFP_ATOMIC); - if (page_copy == NULL) { - WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ - get_page(page); - } else { - memcpy(page_address(page_copy), - page_address(page), PAGE_SIZE); - page = page_copy; - } + void *src; + + if (!offset) + return -ENOSPC; + + if (bio_chain) { + src = (void *)__get_free_page(GFP_ATOMIC); + if (src) { + memcpy(src, buf, PAGE_SIZE); + } else { + WARN_ON_ONCE(1); + bio_chain = NULL; /* Go synchronous */ + src = buf; } - entry = swp_entry(root_swap, offset); - error = rw_swap_page_sync(WRITE, entry, page, bio_chain); + } else { + src = buf; } - return error; + return bio_write_page(offset, src, bio_chain); } /* @@ -248,11 +244,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) * at a time. */ -#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) struct swap_map_page { - unsigned long entries[MAP_PAGE_ENTRIES]; - unsigned long next_swap; + sector_t entries[MAP_PAGE_ENTRIES]; + sector_t next_swap; }; /** @@ -262,7 +258,7 @@ struct swap_map_page { struct swap_map_handle { struct swap_map_page *cur; - unsigned long cur_swap; + sector_t cur_swap; struct bitmap_page *bitmap; unsigned int k; }; @@ -287,7 +283,7 @@ static int get_swap_writer(struct swap_map_handle *handle) release_swap_writer(handle); return -ENOMEM; } - handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); + handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap); if (!handle->cur_swap) { release_swap_writer(handle); return -ENOSPC; @@ -300,11 +296,11 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, struct bio **bio_chain) { int error = 0; - unsigned long offset; + sector_t offset; if (!handle->cur) return -EINVAL; - offset = alloc_swap_page(root_swap, handle->bitmap); + offset = alloc_swapdev_block(root_swap, handle->bitmap); error = write_page(buf, offset, bio_chain); if (error) return error; @@ -313,7 +309,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, error = wait_on_bio_chain(bio_chain); if (error) goto out; - offset = alloc_swap_page(root_swap, handle->bitmap); + offset = alloc_swapdev_block(root_swap, handle->bitmap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; @@ -413,37 +409,47 @@ int swsusp_write(void) struct swsusp_info *header; int error; - if ((error = swsusp_swap_check())) { + error = swsusp_swap_check(); + if (error) { printk(KERN_ERR "swsusp: Cannot find swap device, try " "swapon -a.\n"); return error; } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot, PAGE_SIZE); - if (error < PAGE_SIZE) - return error < 0 ? error : -EFAULT; + if (error < PAGE_SIZE) { + if (error >= 0) + error = -EFAULT; + + goto out; + } header = (struct swsusp_info *)data_of(snapshot); if (!enough_swap(header->pages)) { printk(KERN_ERR "swsusp: Not enough free swap\n"); - return -ENOSPC; + error = -ENOSPC; + goto out; } error = get_swap_writer(&handle); if (!error) { - unsigned long start = handle.cur_swap; + sector_t start = handle.cur_swap; + error = swap_write_page(&handle, header, NULL); if (!error) error = save_image(&handle, &snapshot, header->pages - 1); + if (!error) { flush_swap_writer(&handle); printk("S"); - error = mark_swapfiles(swp_entry(root_swap, start)); + error = mark_swapfiles(start); printk("|\n"); } } if (error) free_all_swap_pages(root_swap, handle.bitmap); release_swap_writer(&handle); +out: + swsusp_close(); return error; } @@ -459,17 +465,18 @@ static void release_swap_reader(struct swap_map_handle *handle) handle->cur = NULL; } -static int get_swap_reader(struct swap_map_handle *handle, - swp_entry_t start) +static int get_swap_reader(struct swap_map_handle *handle, sector_t start) { int error; - if (!swp_offset(start)) + if (!start) return -EINVAL; + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); if (!handle->cur) return -ENOMEM; - error = bio_read_page(swp_offset(start), handle->cur, NULL); + + error = bio_read_page(start, handle->cur, NULL); if (error) { release_swap_reader(handle); return error; @@ -481,7 +488,7 @@ static int get_swap_reader(struct swap_map_handle *handle, static int swap_read_page(struct swap_map_handle *handle, void *buf, struct bio **bio_chain) { - unsigned long offset; + sector_t offset; int error; if (!handle->cur) @@ -608,7 +615,7 @@ int swsusp_check(void) if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); /* Reset swap signature now */ - error = bio_write_page(0, &swsusp_header); + error = bio_write_page(0, &swsusp_header, NULL); } else { return -EINVAL; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 0b66659dc516..4147a756a8c7 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -134,18 +134,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) return 0; } -unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) +sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap) { unsigned long offset; offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { - if (bitmap_set(bitmap, offset)) { + if (bitmap_set(bitmap, offset)) swap_free(swp_entry(swap, offset)); - offset = 0; - } + else + return swapdev_block(swap, offset); } - return offset; + return 0; } void free_all_swap_pages(int swap, struct bitmap_page *bitmap) diff --git a/kernel/power/user.c b/kernel/power/user.c index a327b18a5ffd..f0b7ef8bdd74 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -126,7 +126,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, { int error = 0; struct snapshot_data *data; - loff_t offset, avail; + loff_t avail; + sector_t offset; if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) return -ENOTTY; @@ -240,10 +241,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; } } - offset = alloc_swap_page(data->swap, data->bitmap); + offset = alloc_swapdev_block(data->swap, data->bitmap); if (offset) { offset <<= PAGE_SHIFT; - error = put_user(offset, (loff_t __user *)arg); + error = put_user(offset, (sector_t __user *)arg); } else { error = -ENOSPC; } -- cgit v1.2.3 From 9a154d9d95b7b9845938242f5c62505b3cab5bcd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:12 -0800 Subject: [PATCH] swsusp: add resume_offset command line parameter Add the kernel command line parameter "resume_offset=" allowing us to specify the offset, in units, from the beginning of the partition pointed to by the "resume=" parameter at which the swap header is located. This offset can be determined, for example, by an application using the FIBMAP ioctl to obtain the swap header's block number for given file. [akpm@osdl.org: we don't know what type sector_t is] Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 15 +++++++++++++++ kernel/power/power.h | 1 + kernel/power/swap.c | 15 ++++++++++----- 3 files changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index b1fb7866b0b3..d79feeb45459 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -27,6 +27,7 @@ static int noresume = 0; char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; +sector_t swsusp_resume_block; /** * power_down - Shut machine down for hibernate. @@ -423,6 +424,19 @@ static int __init resume_setup(char *str) return 1; } +static int __init resume_offset_setup(char *str) +{ + unsigned long long offset; + + if (noresume) + return 1; + + if (sscanf(str, "%llu", &offset) == 1) + swsusp_resume_block = offset; + + return 1; +} + static int __init noresume_setup(char *str) { noresume = 1; @@ -430,4 +444,5 @@ static int __init noresume_setup(char *str) } __setup("noresume", noresume_setup); +__setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); diff --git a/kernel/power/power.h b/kernel/power/power.h index 210ebba26020..adaf7d4fbdaf 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -42,6 +42,7 @@ extern const void __nosave_begin, __nosave_end; extern unsigned long image_size; extern int in_suspend; extern dev_t swsusp_resume_device; +extern sector_t swsusp_resume_block; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 1b08f46bbb7e..aa5a9bff01f1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -161,13 +161,14 @@ static int mark_swapfiles(sector_t start) { int error; - bio_read_page(0, &swsusp_header, NULL); + bio_read_page(swsusp_resume_block, &swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); swsusp_header.image = start; - error = bio_write_page(0, &swsusp_header, NULL); + error = bio_write_page(swsusp_resume_block, + &swsusp_header, NULL); } else { printk(KERN_ERR "swsusp: Swap header not found!\n"); error = -ENODEV; @@ -184,7 +185,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */ { int res; - res = swap_type_of(swsusp_resume_device, 0); + res = swap_type_of(swsusp_resume_device, swsusp_resume_block); if (res < 0) return res; @@ -610,12 +611,16 @@ int swsusp_check(void) if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); memset(&swsusp_header, 0, sizeof(swsusp_header)); - if ((error = bio_read_page(0, &swsusp_header, NULL))) + error = bio_read_page(swsusp_resume_block, + &swsusp_header, NULL); + if (error) return error; + if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); /* Reset swap signature now */ - error = bio_write_page(0, &swsusp_header, NULL); + error = bio_write_page(swsusp_resume_block, + &swsusp_header, NULL); } else { return -EINVAL; } -- cgit v1.2.3 From 37b2ba12df88f0e29f2d52aaf1ab22789377d5b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:15 -0800 Subject: [PATCH] swsusp: add ioctl for swap files support To be able to use swap files as suspend storage from the userland suspend tools we need an additional ioctl() that will allow us to provide the kernel with both the swap header's offset and the identification of the resume partition. The new ioctl() should be regarded as a replacement for the SNAPSHOT_SET_SWAP_FILE ioctl() that from now on will be considered as obsolete, but has to stay for backwards compatibility of the interface. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 14 +++++++++++++- kernel/power/user.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index adaf7d4fbdaf..7dbfd9f67e1c 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -106,6 +106,16 @@ extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); extern int snapshot_image_loaded(struct snapshot_handle *handle); extern void snapshot_free_unused_memory(struct snapshot_handle *handle); +/* + * This structure is used to pass the values needed for the identification + * of the resume swap area from a user space to the kernel via the + * SNAPSHOT_SET_SWAP_AREA ioctl + */ +struct resume_swap_area { + loff_t offset; + u_int32_t dev; +} __attribute__((packed)); + #define SNAPSHOT_IOC_MAGIC '3' #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) #define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2) @@ -119,7 +129,9 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle); #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) #define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) -#define SNAPSHOT_IOC_MAXNR 12 +#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \ + struct resume_swap_area) +#define SNAPSHOT_IOC_MAXNR 13 #define PMOPS_PREPARE 1 #define PMOPS_ENTER 2 diff --git a/kernel/power/user.c b/kernel/power/user.c index f0b7ef8bdd74..05c58a2c0dd4 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -343,6 +343,37 @@ OutS3: } break; + case SNAPSHOT_SET_SWAP_AREA: + if (data->bitmap) { + error = -EPERM; + } else { + struct resume_swap_area swap_area; + dev_t swdev; + + error = copy_from_user(&swap_area, (void __user *)arg, + sizeof(struct resume_swap_area)); + if (error) { + error = -EFAULT; + break; + } + + /* + * User space encodes device types as two-byte values, + * so we need to recode them + */ + swdev = old_decode_dev(swap_area.dev); + if (swdev) { + offset = swap_area.offset; + data->swap = swap_type_of(swdev, offset); + if (data->swap < 0) + error = -ENODEV; + } else { + data->swap = -1; + error = -EINVAL; + } + } + break; + default: error = -ENOTTY; -- cgit v1.2.3 From 8357376d3df21b7d6f857931a57ac50da9c66e26 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:18 -0800 Subject: [PATCH] swsusp: Improve handling of highmem Currently swsusp saves the contents of highmem pages by copying them to the normal zone which is quite inefficient (eg. it requires two normal pages to be used for saving one highmem page). This may be improved by using highmem for saving the contents of saveable highmem pages. Namely, during the suspend phase of the suspend-resume cycle we try to allocate as many free highmem pages as there are saveable highmem pages. If there are not enough highmem image pages to store the contents of all of the saveable highmem pages, some of them will be stored in the "normal" memory. Next, we allocate as many free "normal" pages as needed to store the (remaining) image data. We use a memory bitmap to mark the allocated free pages (ie. highmem as well as "normal" image pages). Now, we use another memory bitmap to mark all of the saveable pages (highmem as well as "normal") and the contents of the saveable pages are copied into the image pages. Then, the second bitmap is used to save the pfns corresponding to the saveable pages and the first one is used to save their data. During the resume phase the pfns of the pages that were saveable during the suspend are loaded from the image and used to mark the "unsafe" page frames. Next, we try to allocate as many free highmem page frames as to load all of the image data that had been in the highmem before the suspend and we allocate so many free "normal" page frames that the total number of allocated free pages (highmem and "normal") is equal to the size of the image. While doing this we have to make sure that there will be some extra free "normal" and "safe" page frames for two lists of PBEs constructed later. Now, the image data are loaded, if possible, into their "original" page frames. The image data that cannot be written into their "original" page frames are loaded into "safe" page frames and their "original" kernel virtual addresses, as well as the addresses of the "safe" pages containing their copies, are stored in one of two lists of PBEs. One list of PBEs is for the copies of "normal" suspend pages (ie. "normal" pages that were saveable during the suspend) and it is used in the same way as previously (ie. by the architecture-dependent parts of swsusp). The other list of PBEs is for the copies of highmem suspend pages. The pages in this list are restored (in a reversible way) right before the arch-dependent code is called. Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 851 ++++++++++++++++++++++++++++++++++++------------ kernel/power/swap.c | 2 +- kernel/power/swsusp.c | 53 +-- kernel/power/user.c | 2 +- 5 files changed, 671 insertions(+), 239 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 7dbfd9f67e1c..3763343bde2f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -103,8 +103,8 @@ struct snapshot_handle { extern unsigned int snapshot_additional_pages(struct zone *zone); extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); +extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); -extern void snapshot_free_unused_memory(struct snapshot_handle *handle); /* * This structure is used to pass the values needed for the identification diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 99f9b7d177d6..fd8251d40eb8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1,15 +1,15 @@ /* * linux/kernel/power/snapshot.c * - * This file provide system snapshot/restore functionality. + * This file provides system snapshot/restore functionality for swsusp. * * Copyright (C) 1998-2005 Pavel Machek + * Copyright (C) 2006 Rafael J. Wysocki * - * This file is released under the GPLv2, and is based on swsusp.c. + * This file is released under the GPLv2. * */ - #include #include #include @@ -34,137 +34,24 @@ #include "power.h" -/* List of PBEs used for creating and restoring the suspend image */ +/* List of PBEs needed for restoring the pages that were allocated before + * the suspend and included in the suspend image, but have also been + * allocated by the "resume" kernel, so their contents cannot be written + * directly to their "original" page frames. + */ struct pbe *restore_pblist; -static unsigned int nr_copy_pages; -static unsigned int nr_meta_pages; +/* Pointer to an auxiliary buffer (1 page) */ static void *buffer; -#ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - unsigned int n = 0; - - for_each_zone (zone) - if (is_highmem(zone)) { - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { - struct page *page; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - if (PageReserved(page)) - continue; - if (PageNosaveFree(page)) - continue; - n++; - } - } - return n; -} - -struct highmem_page { - char *data; - struct page *page; - struct highmem_page *next; -}; - -static struct highmem_page *highmem_copy; - -static int save_highmem_zone(struct zone *zone) -{ - unsigned long zone_pfn; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - struct page *page; - struct highmem_page *save; - void *kaddr; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - - if (!(pfn%10000)) - printk("."); - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - /* - * This condition results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. This should be - * corrected eventually when the cases giving rise to this - * are better understood. - */ - if (PageReserved(page)) - continue; - BUG_ON(PageNosave(page)); - if (PageNosaveFree(page)) - continue; - save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); - if (!save) - return -ENOMEM; - save->next = highmem_copy; - save->page = page; - save->data = (void *) get_zeroed_page(GFP_ATOMIC); - if (!save->data) { - kfree(save); - return -ENOMEM; - } - kaddr = kmap_atomic(page, KM_USER0); - memcpy(save->data, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - highmem_copy = save; - } - return 0; -} - -int save_highmem(void) -{ - struct zone *zone; - int res = 0; - - pr_debug("swsusp: Saving Highmem"); - drain_local_pages(); - for_each_zone (zone) { - if (is_highmem(zone)) - res = save_highmem_zone(zone); - if (res) - return res; - } - printk("\n"); - return 0; -} - -int restore_highmem(void) -{ - printk("swsusp: Restoring Highmem\n"); - while (highmem_copy) { - struct highmem_page *save = highmem_copy; - void *kaddr; - highmem_copy = save->next; - - kaddr = kmap_atomic(save->page, KM_USER0); - memcpy(kaddr, save->data, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - free_page((long) save->data); - kfree(save); - } - return 0; -} -#else -static inline unsigned int count_highmem_pages(void) {return 0;} -static inline int save_highmem(void) {return 0;} -static inline int restore_highmem(void) {return 0;} -#endif - /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages - * used before suspend. + * used before suspend. The unsafe pages have PageNosaveFree set + * and we count them using unsafe_pages. * - * The unsafe pages are marked with the PG_nosave_free flag - * and we count them using unsafe_pages + * Each allocated image page is marked as PageNosave and PageNosaveFree + * so that swsusp_free() can release it. */ #define PG_ANY 0 @@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;} static unsigned int allocated_unsafe_pages; -static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) +static void *get_image_page(gfp_t gfp_mask, int safe_needed) { void *res; @@ -195,20 +82,38 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) unsigned long get_safe_page(gfp_t gfp_mask) { - return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); + return (unsigned long)get_image_page(gfp_mask, PG_SAFE); +} + +static struct page *alloc_image_page(gfp_t gfp_mask) { + struct page *page; + + page = alloc_page(gfp_mask); + if (page) { + SetPageNosave(page); + SetPageNosaveFree(page); + } + return page; } /** * free_image_page - free page represented by @addr, allocated with - * alloc_image_page (page flags set by it must be cleared) + * get_image_page (page flags set by it must be cleared) */ static inline void free_image_page(void *addr, int clear_nosave_free) { - ClearPageNosave(virt_to_page(addr)); + struct page *page; + + BUG_ON(!virt_addr_valid(addr)); + + page = virt_to_page(addr); + + ClearPageNosave(page); if (clear_nosave_free) - ClearPageNosaveFree(virt_to_page(addr)); - free_page((unsigned long)addr); + ClearPageNosaveFree(page); + + __free_page(page); } /* struct linked_page is used to build chains of pages */ @@ -269,7 +174,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { struct linked_page *lp; - lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); + lp = get_image_page(ca->gfp_mask, ca->safe_needed); if (!lp) return NULL; @@ -446,8 +351,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) /* Compute the number of zones */ nr = 0; - for_each_zone (zone) - if (populated_zone(zone) && !is_highmem(zone)) + for_each_zone(zone) + if (populated_zone(zone)) nr++; /* Allocate the list of zones bitmap objects */ @@ -459,10 +364,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) } /* Initialize the zone bitmap objects */ - for_each_zone (zone) { + for_each_zone(zone) { unsigned long pfn; - if (!populated_zone(zone) || is_highmem(zone)) + if (!populated_zone(zone)) continue; zone_bm->start_pfn = zone->zone_start_pfn; @@ -481,7 +386,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) while (bb) { unsigned long *ptr; - ptr = alloc_image_page(gfp_mask, safe_needed); + ptr = get_image_page(gfp_mask, safe_needed); bb->data = ptr; if (!ptr) goto Free; @@ -669,9 +574,81 @@ unsigned int snapshot_additional_pages(struct zone *zone) res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); - return res; + return 2 * res; } +#ifdef CONFIG_HIGHMEM +/** + * count_free_highmem_pages - compute the total number of free highmem + * pages, system-wide. + */ + +static unsigned int count_free_highmem_pages(void) +{ + struct zone *zone; + unsigned int cnt = 0; + + for_each_zone(zone) + if (populated_zone(zone) && is_highmem(zone)) + cnt += zone->free_pages; + + return cnt; +} + +/** + * saveable_highmem_page - Determine whether a highmem page should be + * included in the suspend image. + * + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't a part of a free chunk of pages. + */ + +static struct page *saveable_highmem_page(unsigned long pfn) +{ + struct page *page; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + + BUG_ON(!PageHighMem(page)); + + if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page)) + return NULL; + + return page; +} + +/** + * count_highmem_pages - compute the total number of saveable highmem + * pages. + */ + +unsigned int count_highmem_pages(void) +{ + struct zone *zone; + unsigned int n = 0; + + for_each_zone(zone) { + unsigned long pfn, max_zone_pfn; + + if (!is_highmem(zone)) + continue; + + mark_free_pages(zone); + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (saveable_highmem_page(pfn)) + n++; + } + return n; +} +#else +static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } +static inline unsigned int count_highmem_pages(void) { return 0; } +#endif /* CONFIG_HIGHMEM */ + /** * pfn_is_nosave - check if given pfn is in the 'nosave' section */ @@ -684,12 +661,12 @@ static inline int pfn_is_nosave(unsigned long pfn) } /** - * saveable - Determine whether a page should be cloned or not. - * @pfn: The page + * saveable - Determine whether a non-highmem page should be included in + * the suspend image. * - * We save a page if it isn't Nosave, and is not in the range of pages - * statically defined as 'unsaveable', and it - * isn't a part of a free chunk of pages. + * We should save the page if it isn't Nosave, and is not in the range + * of pages statically defined as 'unsaveable', and it isn't a part of + * a free chunk of pages. */ static struct page *saveable_page(unsigned long pfn) @@ -701,76 +678,130 @@ static struct page *saveable_page(unsigned long pfn) page = pfn_to_page(pfn); - if (PageNosave(page)) + BUG_ON(PageHighMem(page)); + + if (PageNosave(page) || PageNosaveFree(page)) return NULL; + if (PageReserved(page) && pfn_is_nosave(pfn)) return NULL; - if (PageNosaveFree(page)) - return NULL; return page; } +/** + * count_data_pages - compute the total number of saveable non-highmem + * pages. + */ + unsigned int count_data_pages(void) { struct zone *zone; unsigned long pfn, max_zone_pfn; unsigned int n = 0; - for_each_zone (zone) { + for_each_zone(zone) { if (is_highmem(zone)) continue; + mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - n += !!saveable_page(pfn); + if(saveable_page(pfn)) + n++; } return n; } -static inline void copy_data_page(long *dst, long *src) +/* This is needed, because copy_page and memcpy are not usable for copying + * task structs. + */ +static inline void do_copy_page(long *dst, long *src) { int n; - /* copy_page and memcpy are not usable for copying task structs. */ for (n = PAGE_SIZE / sizeof(long); n; n--) *dst++ = *src++; } +#ifdef CONFIG_HIGHMEM +static inline struct page * +page_is_saveable(struct zone *zone, unsigned long pfn) +{ + return is_highmem(zone) ? + saveable_highmem_page(pfn) : saveable_page(pfn); +} + +static inline void +copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +{ + struct page *s_page, *d_page; + void *src, *dst; + + s_page = pfn_to_page(src_pfn); + d_page = pfn_to_page(dst_pfn); + if (PageHighMem(s_page)) { + src = kmap_atomic(s_page, KM_USER0); + dst = kmap_atomic(d_page, KM_USER1); + do_copy_page(dst, src); + kunmap_atomic(src, KM_USER0); + kunmap_atomic(dst, KM_USER1); + } else { + src = page_address(s_page); + if (PageHighMem(d_page)) { + /* Page pointed to by src may contain some kernel + * data modified by kmap_atomic() + */ + do_copy_page(buffer, src); + dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); + memcpy(dst, buffer, PAGE_SIZE); + kunmap_atomic(dst, KM_USER0); + } else { + dst = page_address(d_page); + do_copy_page(dst, src); + } + } +} +#else +#define page_is_saveable(zone, pfn) saveable_page(pfn) + +static inline void +copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +{ + do_copy_page(page_address(pfn_to_page(dst_pfn)), + page_address(pfn_to_page(src_pfn))); +} +#endif /* CONFIG_HIGHMEM */ + static void copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) { struct zone *zone; unsigned long pfn; - for_each_zone (zone) { + for_each_zone(zone) { unsigned long max_zone_pfn; - if (is_highmem(zone)) - continue; - mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_page(pfn)) + if (page_is_saveable(zone, pfn)) memory_bm_set_bit(orig_bm, pfn); } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); do { pfn = memory_bm_next_pfn(orig_bm); - if (likely(pfn != BM_END_OF_MAP)) { - struct page *page; - void *src; - - page = pfn_to_page(pfn); - src = page_address(page); - page = pfn_to_page(memory_bm_next_pfn(copy_bm)); - copy_data_page(page_address(page), src); - } + if (likely(pfn != BM_END_OF_MAP)) + copy_data_page(memory_bm_next_pfn(copy_bm), pfn); } while (pfn != BM_END_OF_MAP); } +/* Total number of image pages */ +static unsigned int nr_copy_pages; +/* Number of pages needed for saving the original pfns of the image pages */ +static unsigned int nr_meta_pages; + /** * swsusp_free - free pages allocated for the suspend. * @@ -792,7 +823,7 @@ void swsusp_free(void) if (PageNosave(page) && PageNosaveFree(page)) { ClearPageNosave(page); ClearPageNosaveFree(page); - free_page((long) page_address(page)); + __free_page(page); } } } @@ -802,34 +833,108 @@ void swsusp_free(void) buffer = NULL; } +#ifdef CONFIG_HIGHMEM +/** + * count_pages_for_highmem - compute the number of non-highmem pages + * that will be necessary for creating copies of highmem pages. + */ + +static unsigned int count_pages_for_highmem(unsigned int nr_highmem) +{ + unsigned int free_highmem = count_free_highmem_pages(); + + if (free_highmem >= nr_highmem) + nr_highmem = 0; + else + nr_highmem -= free_highmem; + + return nr_highmem; +} +#else +static unsigned int +count_pages_for_highmem(unsigned int nr_highmem) { return 0; } +#endif /* CONFIG_HIGHMEM */ /** - * enough_free_mem - Make sure we enough free memory to snapshot. - * - * Returns TRUE or FALSE after checking the number of available - * free pages. + * enough_free_mem - Make sure we have enough free memory for the + * snapshot image. */ -static int enough_free_mem(unsigned int nr_pages) +static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) { struct zone *zone; unsigned int free = 0, meta = 0; - for_each_zone (zone) - if (!is_highmem(zone)) { + for_each_zone(zone) { + meta += snapshot_additional_pages(zone); + if (!is_highmem(zone)) free += zone->free_pages; - meta += snapshot_additional_pages(zone); - } + } - pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", + nr_pages += count_pages_for_highmem(nr_highmem); + pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n", nr_pages, PAGES_FOR_IO, meta, free); return free > nr_pages + PAGES_FOR_IO + meta; } +#ifdef CONFIG_HIGHMEM +/** + * get_highmem_buffer - if there are some highmem pages in the suspend + * image, we may need the buffer to copy them and/or load their data. + */ + +static inline int get_highmem_buffer(int safe_needed) +{ + buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); + return buffer ? 0 : -ENOMEM; +} + +/** + * alloc_highmem_image_pages - allocate some highmem pages for the image. + * Try to allocate as many pages as needed, but if the number of free + * highmem pages is lesser than that, allocate them all. + */ + +static inline unsigned int +alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +{ + unsigned int to_alloc = count_free_highmem_pages(); + + if (to_alloc > nr_highmem) + to_alloc = nr_highmem; + + nr_highmem -= to_alloc; + while (to_alloc-- > 0) { + struct page *page; + + page = alloc_image_page(__GFP_HIGHMEM); + memory_bm_set_bit(bm, page_to_pfn(page)); + } + return nr_highmem; +} +#else +static inline int get_highmem_buffer(int safe_needed) { return 0; } + +static inline unsigned int +alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +#endif /* CONFIG_HIGHMEM */ + +/** + * swsusp_alloc - allocate memory for the suspend image + * + * We first try to allocate as many highmem pages as there are + * saveable highmem pages in the system. If that fails, we allocate + * non-highmem pages for the copies of the remaining highmem ones. + * + * In this approach it is likely that the copies of highmem pages will + * also be located in the high memory, because of the way in which + * copy_data_pages() works. + */ + static int swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, - unsigned int nr_pages) + unsigned int nr_pages, unsigned int nr_highmem) { int error; @@ -841,13 +946,19 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, if (error) goto Free; + if (nr_highmem > 0) { + error = get_highmem_buffer(PG_ANY); + if (error) + goto Free; + + nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); + } while (nr_pages-- > 0) { - struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); + struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + if (!page) goto Free; - SetPageNosave(page); - SetPageNosaveFree(page); memory_bm_set_bit(copy_bm, page_to_pfn(page)); } return 0; @@ -857,30 +968,39 @@ Free: return -ENOMEM; } -/* Memory bitmap used for marking saveable pages */ +/* Memory bitmap used for marking saveable pages (during suspend) or the + * suspend image pages (during resume) + */ static struct memory_bitmap orig_bm; -/* Memory bitmap used for marking allocated pages that will contain the copies - * of saveable pages +/* Memory bitmap used on suspend for marking allocated pages that will contain + * the copies of saveable pages. During resume it is initially used for + * marking the suspend image pages, but then its set bits are duplicated in + * @orig_bm and it is released. Next, on systems with high memory, it may be + * used for marking "safe" highmem pages, but it has to be reinitialized for + * this purpose. */ static struct memory_bitmap copy_bm; asmlinkage int swsusp_save(void) { - unsigned int nr_pages; + unsigned int nr_pages, nr_highmem; - pr_debug("swsusp: critical section: \n"); + printk("swsusp: critical section: \n"); drain_local_pages(); nr_pages = count_data_pages(); - printk("swsusp: Need to copy %u pages\n", nr_pages); + nr_highmem = count_highmem_pages(); + printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem); - if (!enough_free_mem(nr_pages)) { + if (!enough_free_mem(nr_pages, nr_highmem)) { printk(KERN_ERR "swsusp: Not enough free memory\n"); return -ENOMEM; } - if (swsusp_alloc(&orig_bm, ©_bm, nr_pages)) + if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { + printk(KERN_ERR "swsusp: Memory allocation failed\n"); return -ENOMEM; + } /* During allocating of suspend pagedir, new cold pages may appear. * Kill them. @@ -894,10 +1014,12 @@ asmlinkage int swsusp_save(void) * touch swap space! Except we must write out our image of course. */ + nr_pages += nr_highmem; nr_copy_pages = nr_pages; - nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); + return 0; } @@ -960,7 +1082,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) if (!buffer) { /* This makes the buffer be freed by swsusp_free() */ - buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); + buffer = get_image_page(GFP_ATOMIC, PG_ANY); if (!buffer) return -ENOMEM; } @@ -975,9 +1097,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) memset(buffer, 0, PAGE_SIZE); pack_pfns(buffer, &orig_bm); } else { - unsigned long pfn = memory_bm_next_pfn(©_bm); + struct page *page; - handle->buffer = page_address(pfn_to_page(pfn)); + page = pfn_to_page(memory_bm_next_pfn(©_bm)); + if (PageHighMem(page)) { + /* Highmem pages are copied to the buffer, + * because we can't return with a kmapped + * highmem page (we may not be called again). + */ + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buffer, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + handle->buffer = buffer; + } else { + handle->buffer = page_address(page); + } } handle->prev = handle->cur; } @@ -1005,7 +1141,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) unsigned long pfn, max_zone_pfn; /* Clear page flags */ - for_each_zone (zone) { + for_each_zone(zone) { max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) @@ -1101,6 +1237,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) } } +/* List of "safe" pages that may be used to store data loaded from the suspend + * image + */ +static struct linked_page *safe_pages_list; + +#ifdef CONFIG_HIGHMEM +/* struct highmem_pbe is used for creating the list of highmem pages that + * should be restored atomically during the resume from disk, because the page + * frames they have occupied before the suspend are in use. + */ +struct highmem_pbe { + struct page *copy_page; /* data is here now */ + struct page *orig_page; /* data was here before the suspend */ + struct highmem_pbe *next; +}; + +/* List of highmem PBEs needed for restoring the highmem pages that were + * allocated before the suspend and included in the suspend image, but have + * also been allocated by the "resume" kernel, so their contents cannot be + * written directly to their "original" page frames. + */ +static struct highmem_pbe *highmem_pblist; + +/** + * count_highmem_image_pages - compute the number of highmem pages in the + * suspend image. The bits in the memory bitmap @bm that correspond to the + * image pages are assumed to be set. + */ + +static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) +{ + unsigned long pfn; + unsigned int cnt = 0; + + memory_bm_position_reset(bm); + pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (PageHighMem(pfn_to_page(pfn))) + cnt++; + + pfn = memory_bm_next_pfn(bm); + } + return cnt; +} + +/** + * prepare_highmem_image - try to allocate as many highmem pages as + * there are highmem image pages (@nr_highmem_p points to the variable + * containing the number of highmem image pages). The pages that are + * "safe" (ie. will not be overwritten when the suspend image is + * restored) have the corresponding bits set in @bm (it must be + * unitialized). + * + * NOTE: This function should not be called if there are no highmem + * image pages. + */ + +static unsigned int safe_highmem_pages; + +static struct memory_bitmap *safe_highmem_bm; + +static int +prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) +{ + unsigned int to_alloc; + + if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE)) + return -ENOMEM; + + if (get_highmem_buffer(PG_SAFE)) + return -ENOMEM; + + to_alloc = count_free_highmem_pages(); + if (to_alloc > *nr_highmem_p) + to_alloc = *nr_highmem_p; + else + *nr_highmem_p = to_alloc; + + safe_highmem_pages = 0; + while (to_alloc-- > 0) { + struct page *page; + + page = alloc_page(__GFP_HIGHMEM); + if (!PageNosaveFree(page)) { + /* The page is "safe", set its bit the bitmap */ + memory_bm_set_bit(bm, page_to_pfn(page)); + safe_highmem_pages++; + } + /* Mark the page as allocated */ + SetPageNosave(page); + SetPageNosaveFree(page); + } + memory_bm_position_reset(bm); + safe_highmem_bm = bm; + return 0; +} + +/** + * get_highmem_page_buffer - for given highmem image page find the buffer + * that suspend_write_next() should set for its caller to write to. + * + * If the page is to be saved to its "original" page frame or a copy of + * the page is to be made in the highmem, @buffer is returned. Otherwise, + * the copy of the page is to be made in normal memory, so the address of + * the copy is returned. + * + * If @buffer is returned, the caller of suspend_write_next() will write + * the page's contents to @buffer, so they will have to be copied to the + * right location on the next call to suspend_write_next() and it is done + * with the help of copy_last_highmem_page(). For this purpose, if + * @buffer is returned, @last_highmem page is set to the page to which + * the data will have to be copied from @buffer. + */ + +static struct page *last_highmem_page; + +static void * +get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +{ + struct highmem_pbe *pbe; + void *kaddr; + + if (PageNosave(page) && PageNosaveFree(page)) { + /* We have allocated the "original" page frame and we can + * use it directly to store the loaded page. + */ + last_highmem_page = page; + return buffer; + } + /* The "original" page frame has not been allocated and we have to + * use a "safe" page frame to store the loaded page. + */ + pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); + if (!pbe) { + swsusp_free(); + return NULL; + } + pbe->orig_page = page; + if (safe_highmem_pages > 0) { + struct page *tmp; + + /* Copy of the page will be stored in high memory */ + kaddr = buffer; + tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); + safe_highmem_pages--; + last_highmem_page = tmp; + pbe->copy_page = tmp; + } else { + /* Copy of the page will be stored in normal memory */ + kaddr = safe_pages_list; + safe_pages_list = safe_pages_list->next; + pbe->copy_page = virt_to_page(kaddr); + } + pbe->next = highmem_pblist; + highmem_pblist = pbe; + return kaddr; +} + +/** + * copy_last_highmem_page - copy the contents of a highmem image from + * @buffer, where the caller of snapshot_write_next() has place them, + * to the right location represented by @last_highmem_page . + */ + +static void copy_last_highmem_page(void) +{ + if (last_highmem_page) { + void *dst; + + dst = kmap_atomic(last_highmem_page, KM_USER0); + memcpy(dst, buffer, PAGE_SIZE); + kunmap_atomic(dst, KM_USER0); + last_highmem_page = NULL; + } +} + +static inline int last_highmem_page_copied(void) +{ + return !last_highmem_page; +} + +static inline void free_highmem_data(void) +{ + if (safe_highmem_bm) + memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR); + + if (buffer) + free_image_page(buffer, PG_UNSAFE_CLEAR); +} +#else +static inline int get_safe_write_buffer(void) { return 0; } + +static unsigned int +count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } + +static inline int +prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) +{ + return 0; +} + +static inline void * +get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +{ + return NULL; +} + +static inline void copy_last_highmem_page(void) {} +static inline int last_highmem_page_copied(void) { return 1; } +static inline void free_highmem_data(void) {} +#endif /* CONFIG_HIGHMEM */ + /** * prepare_image - use the memory bitmap @bm to mark the pages that will * be overwritten in the process of restoring the system memory state @@ -1110,20 +1458,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) * The idea is to allocate a new memory bitmap first and then allocate * as many pages as needed for the image data, but not to assign these * pages to specific tasks initially. Instead, we just mark them as - * allocated and create a list of "safe" pages that will be used later. + * allocated and create a lists of "safe" pages that will be used + * later. On systems with high memory a list of "safe" highmem pages is + * also created. */ #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) -static struct linked_page *safe_pages_list; - static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) { - unsigned int nr_pages; + unsigned int nr_pages, nr_highmem; struct linked_page *sp_list, *lp; int error; + /* If there is no highmem, the buffer will not be necessary */ + free_image_page(buffer, PG_UNSAFE_CLEAR); + buffer = NULL; + + nr_highmem = count_highmem_image_pages(bm); error = mark_unsafe_pages(bm); if (error) goto Free; @@ -1134,6 +1487,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) duplicate_memory_bitmap(new_bm, bm); memory_bm_free(bm, PG_UNSAFE_KEEP); + if (nr_highmem > 0) { + error = prepare_highmem_image(bm, &nr_highmem); + if (error) + goto Free; + } /* Reserve some safe pages for potential later use. * * NOTE: This way we make sure there will be enough safe pages for the @@ -1142,10 +1500,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) */ sp_list = NULL; /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ - nr_pages = nr_copy_pages - allocated_unsafe_pages; + nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); while (nr_pages > 0) { - lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); + lp = get_image_page(GFP_ATOMIC, PG_SAFE); if (!lp) { error = -ENOMEM; goto Free; @@ -1156,7 +1514,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) } /* Preallocate memory for the image */ safe_pages_list = NULL; - nr_pages = nr_copy_pages - allocated_unsafe_pages; + nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; while (nr_pages > 0) { lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); if (!lp) { @@ -1196,6 +1554,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) struct pbe *pbe; struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); + if (PageHighMem(page)) + return get_highmem_page_buffer(page, ca); + if (PageNosave(page) && PageNosaveFree(page)) /* We have allocated the "original" page frame and we can * use it directly to store the loaded page. @@ -1210,12 +1571,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) swsusp_free(); return NULL; } - pbe->orig_address = (unsigned long)page_address(page); - pbe->address = (unsigned long)safe_pages_list; + pbe->orig_address = page_address(page); + pbe->address = safe_pages_list; safe_pages_list = safe_pages_list->next; pbe->next = restore_pblist; restore_pblist = pbe; - return (void *)pbe->address; + return pbe->address; } /** @@ -1249,14 +1610,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) return 0; - if (!buffer) { - /* This makes the buffer be freed by swsusp_free() */ - buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); + if (handle->offset == 0) { + if (!buffer) + /* This makes the buffer be freed by swsusp_free() */ + buffer = get_image_page(GFP_ATOMIC, PG_ANY); + if (!buffer) return -ENOMEM; - } - if (!handle->offset) + handle->buffer = buffer; + } handle->sync_read = 1; if (handle->prev < handle->cur) { if (handle->prev == 0) { @@ -1284,8 +1647,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return -ENOMEM; } } else { + copy_last_highmem_page(); handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; + if (handle->buffer != buffer) + handle->sync_read = 0; } handle->prev = handle->cur; } @@ -1301,15 +1666,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return count; } +/** + * snapshot_write_finalize - must be called after the last call to + * snapshot_write_next() in case the last page in the image happens + * to be a highmem page and its contents should be stored in the + * highmem. Additionally, it releases the memory that will not be + * used any more. + */ + +void snapshot_write_finalize(struct snapshot_handle *handle) +{ + copy_last_highmem_page(); + /* Free only if we have loaded the image entirely */ + if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { + memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); + free_highmem_data(); + } +} + int snapshot_image_loaded(struct snapshot_handle *handle) { - return !(!nr_copy_pages || + return !(!nr_copy_pages || !last_highmem_page_copied() || handle->cur <= nr_meta_pages + nr_copy_pages); } -void snapshot_free_unused_memory(struct snapshot_handle *handle) +#ifdef CONFIG_HIGHMEM +/* Assumes that @buf is ready and points to a "safe" page */ +static inline void +swap_two_pages_data(struct page *p1, struct page *p2, void *buf) { - /* Free only if we have loaded the image entirely */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) - memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); + void *kaddr1, *kaddr2; + + kaddr1 = kmap_atomic(p1, KM_USER0); + kaddr2 = kmap_atomic(p2, KM_USER1); + memcpy(buf, kaddr1, PAGE_SIZE); + memcpy(kaddr1, kaddr2, PAGE_SIZE); + memcpy(kaddr2, buf, PAGE_SIZE); + kunmap_atomic(kaddr1, KM_USER0); + kunmap_atomic(kaddr2, KM_USER1); +} + +/** + * restore_highmem - for each highmem page that was allocated before + * the suspend and included in the suspend image, and also has been + * allocated by the "resume" kernel swap its current (ie. "before + * resume") contents with the previous (ie. "before suspend") one. + * + * If the resume eventually fails, we can call this function once + * again and restore the "before resume" highmem state. + */ + +int restore_highmem(void) +{ + struct highmem_pbe *pbe = highmem_pblist; + void *buf; + + if (!pbe) + return 0; + + buf = get_image_page(GFP_ATOMIC, PG_SAFE); + if (!buf) + return -ENOMEM; + + while (pbe) { + swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf); + pbe = pbe->next; + } + free_image_page(buf, PG_UNSAFE_CLEAR); + return 0; } +#endif /* CONFIG_HIGHMEM */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index aa5a9bff01f1..cbd187e90410 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -558,7 +558,7 @@ static int load_image(struct swap_map_handle *handle, error = err2; if (!error) { printk("\b\b\b\bdone\n"); - snapshot_free_unused_memory(snapshot); + snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 4147a756a8c7..68de5c1dbd78 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -64,10 +64,8 @@ int in_suspend __nosavedata = 0; #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void); -int save_highmem(void); int restore_highmem(void); #else -static inline int save_highmem(void) { return 0; } static inline int restore_highmem(void) { return 0; } static inline unsigned int count_highmem_pages(void) { return 0; } #endif @@ -184,7 +182,7 @@ static inline unsigned long __shrink_memory(long tmp) int swsusp_shrink_memory(void) { - long size, tmp; + long tmp; struct zone *zone; unsigned long pages = 0; unsigned int i = 0; @@ -192,15 +190,27 @@ int swsusp_shrink_memory(void) printk("Shrinking memory... "); do { - size = 2 * count_highmem_pages(); - size += size / 50 + count_data_pages() + PAGES_FOR_IO; + long size, highmem_size; + + highmem_size = count_highmem_pages(); + size = count_data_pages() + PAGES_FOR_IO; tmp = size; + size += highmem_size; for_each_zone (zone) - if (!is_highmem(zone) && populated_zone(zone)) { - tmp -= zone->free_pages; - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - tmp += snapshot_additional_pages(zone); + if (populated_zone(zone)) { + if (is_highmem(zone)) { + highmem_size -= zone->free_pages; + } else { + tmp -= zone->free_pages; + tmp += zone->lowmem_reserve[ZONE_NORMAL]; + tmp += snapshot_additional_pages(zone); + } } + + if (highmem_size < 0) + highmem_size = 0; + + tmp += highmem_size; if (tmp > 0) { tmp = __shrink_memory(tmp); if (!tmp) @@ -223,6 +233,7 @@ int swsusp_suspend(void) if ((error = arch_prepare_suspend())) return error; + local_irq_disable(); /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* device_power_down() now. @@ -235,18 +246,11 @@ int swsusp_suspend(void) goto Enable_irqs; } - if ((error = save_highmem())) { - printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); - goto Restore_highmem; - } - save_processor_state(); if ((error = swsusp_arch_suspend())) printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); -Restore_highmem: - restore_highmem(); /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ @@ -268,18 +272,23 @@ int swsusp_resume(void) printk(KERN_ERR "Some devices failed to power down, very bad\n"); /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); - error = swsusp_arch_resume(); - /* Code below is only ever reached in case of failure. Otherwise - * execution continues at place where swsusp_arch_suspend was called - */ - BUG_ON(!error); + error = restore_highmem(); + if (!error) { + error = swsusp_arch_resume(); + /* The code below is only ever reached in case of a failure. + * Otherwise execution continues at place where + * swsusp_arch_suspend() was called + */ + BUG_ON(!error); + /* This call to restore_highmem() undos the previous one */ + restore_highmem(); + } /* The only reason why swsusp_arch_resume() can fail is memory being * very tight, so we have to free it as soon as we can to avoid * subsequent failures */ swsusp_free(); restore_processor_state(); - restore_highmem(); touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 05c58a2c0dd4..a63b25c63b49 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -194,12 +194,12 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_ATOMIC_RESTORE: + snapshot_write_finalize(&data->handle); if (data->mode != O_WRONLY || !data->frozen || !snapshot_image_loaded(&data->handle)) { error = -EPERM; break; } - snapshot_free_unused_memory(&data->handle); down(&pm_sem); pm_prepare_console(); suspend_console(); -- cgit v1.2.3 From 859491218770315ba95ee3fa09961fc71c506cae Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:19 -0800 Subject: [PATCH] swsusp: use __GFP_WAIT swsusp uses GFP_ATOMIC, but it can afford to use __GFP_WAIT, which will permit it to reclaim clean pagecache instead of emitting scary page-allocation-failure messages. Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index cbd187e90410..52e70ca832a8 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -63,7 +63,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page, { struct bio *bio; - bio = bio_alloc(GFP_ATOMIC, 1); + bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); if (!bio) return -ENOMEM; bio->bi_sector = page_off * (PAGE_SIZE >> 9); @@ -216,7 +216,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) return -ENOSPC; if (bio_chain) { - src = (void *)__get_free_page(GFP_ATOMIC); + src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); if (src) { memcpy(src, buf, PAGE_SIZE); } else { @@ -473,7 +473,7 @@ static int get_swap_reader(struct swap_map_handle *handle, sector_t start) if (!start) return -EINVAL; - handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); if (!handle->cur) return -ENOMEM; -- cgit v1.2.3 From 8a05aac2631aa0e6494d9dc990f8c68ed8b8fde7 Mon Sep 17 00:00:00 2001 From: Stefan Seyfried Date: Wed, 6 Dec 2006 20:34:21 -0800 Subject: [PATCH] swsusp: fix platform mode At some point after 2.6.13, in-kernel software suspend got "incomplete" for the so-called "platform" mode. pm_ops->prepare() is never called. A visible sign of this is the "moon" light on thinkpads not flashing during suspend. Fix by readding the pm_ops->prepare call during suspend. Signed-off-by: Stefan Seyfried Acked-by: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d79feeb45459..f5079231383e 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -29,6 +29,22 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; +/** + * platform_prepare - prepare the machine for hibernation using the + * platform driver if so configured and return an error code if it fails + */ + +static inline int platform_prepare(void) +{ + int error = 0; + + if (pm_disk_mode == PM_DISK_PLATFORM) { + if (pm_ops && pm_ops->prepare) + error = pm_ops->prepare(PM_SUSPEND_DISK); + } + return error; +} + /** * power_down - Shut machine down for hibernate. * @mode: Suspend-to-disk mode @@ -91,9 +107,15 @@ static int prepare_processes(void) goto thaw; } + error = platform_prepare(); + if (error) + goto thaw; + /* Free memory before shutting down devices. */ if (!(error = swsusp_shrink_memory())) return 0; + + platform_finish(); thaw: thaw_processes(); enable_cpus: -- cgit v1.2.3 From 7dfb71030f7636a0d65200158113c37764552f93 Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Wed, 6 Dec 2006 20:34:23 -0800 Subject: [PATCH] Add include/linux/freezer.h and move definitions from sched.h Move process freezing functions from include/linux/sched.h to freezer.h, so that modifications to the freezer or the kernel configuration don't require recompiling just about everything. [akpm@osdl.org: fix ueagle driver] Signed-off-by: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 1 + kernel/power/disk.c | 1 + kernel/power/main.c | 1 + kernel/power/process.c | 1 + kernel/power/user.c | 1 + kernel/rtmutex-tester.c | 1 + kernel/sched.c | 2 +- kernel/signal.c | 1 + 8 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 98106f6078b0..d9b690ac684b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "audit.h" diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f5079231383e..53b3b57c0223 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "power.h" diff --git a/kernel/power/main.c b/kernel/power/main.c index 873228c71dab..6096c71b182b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "power.h" diff --git a/kernel/power/process.c b/kernel/power/process.c index 72e72d2c61e6..29be608e8349 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -13,6 +13,7 @@ #include #include #include +#include /* * Timeout for stopping processes diff --git a/kernel/power/user.c b/kernel/power/user.c index a63b25c63b49..26c66941c001 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -22,6 +22,7 @@ #include #include #include +#include #include diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 6dcea9dd8c94..015fc633c96c 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "rtmutex.h" diff --git a/kernel/sched.c b/kernel/sched.c index 3399701c680e..12fdbef1d9bf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 8e19d2785486..bc972d7e6319 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 32d50f57dab94d8c46566a903bbb633ee72fdcc2 Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Wed, 6 Dec 2006 20:34:25 -0800 Subject: [PATCH] swsusp: quieten Freezer if !CONFIG_PM_DEBUG The freezer currently prints an '=' for every process that is frozen. This is pretty pointless, as the equals sign says nothing about which process is frozen, and makes logs look messier (especially if there were a large number of processes running). All we really need to know is that we started trying to freeze processes and what processes (if any) failed to freeze, or that we succeeded. Signed-off-by: Nigel Cunningham Acked-by: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 29be608e8349..b0edfc6f2798 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -40,7 +40,6 @@ void refrigerator(void) long save; save = current->state; pr_debug("%s entered refrigerator\n", current->comm); - printk("="); frozen_process(current); spin_lock_irq(¤t->sighand->siglock); -- cgit v1.2.3 From 14b5b7cfaa110b1d25b8f80b01a8c97cf2db30bc Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Wed, 6 Dec 2006 20:34:26 -0800 Subject: [PATCH] swsusp: clean up whitespace in freezer output Minor whitespace and formatting modifications for the freezer. Signed-off-by: Nigel Cunningham Acked-by: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index b0edfc6f2798..fedabad5a180 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -86,7 +86,7 @@ int freeze_processes(void) unsigned long start_time; struct task_struct *g, *p; - printk( "Stopping tasks: " ); + printk("Stopping tasks... "); start_time = jiffies; user_frozen = 0; do { @@ -134,21 +134,21 @@ int freeze_processes(void) * but it cleans up leftover PF_FREEZE requests. */ if (todo) { - printk( "\n" ); - printk(KERN_ERR " stopping tasks timed out " + printk("\n"); + printk(KERN_ERR "Stopping tasks timed out " "after %d seconds (%d tasks remaining):\n", TIMEOUT / HZ, todo); read_lock(&tasklist_lock); do_each_thread(g, p) { if (freezeable(p) && !frozen(p)) - printk(KERN_ERR " %s\n", p->comm); + printk(KERN_ERR " %s\n", p->comm); cancel_freezing(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); return todo; } - printk( "|\n" ); + printk("done.\n"); BUG_ON(in_atomic()); return 0; } @@ -157,18 +157,18 @@ void thaw_processes(void) { struct task_struct *g, *p; - printk( "Restarting tasks..." ); + printk("Restarting tasks... "); read_lock(&tasklist_lock); do_each_thread(g, p) { if (!freezeable(p)) continue; if (!thaw_process(p)) - printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); + printk(KERN_INFO "Strange, %s not stopped\n", p->comm); } while_each_thread(g, p); read_unlock(&tasklist_lock); schedule(); - printk( " done\n" ); + printk("done.\n"); } EXPORT_SYMBOL(refrigerator); -- cgit v1.2.3 From ff39593ad0ff7a79a3717edac6634407aa8200c2 Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Wed, 6 Dec 2006 20:34:28 -0800 Subject: [PATCH] swsusp: thaw userspace and kernel space separately Modify process thawing so that we can thaw kernel space without thawing userspace, and thaw kernelspace first. This will be useful in later patches, where I intend to get swsusp thawing kernel threads only before seeking to free memory. Signed-off-by: Nigel Cunningham Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index fedabad5a180..cba8a5890eda 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -153,18 +153,29 @@ int freeze_processes(void) return 0; } -void thaw_processes(void) +void thaw_some_processes(int all) { struct task_struct *g, *p; + int pass = 0; /* Pass 0 = Kernel space, 1 = Userspace */ printk("Restarting tasks... "); read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!freezeable(p)) - continue; - if (!thaw_process(p)) - printk(KERN_INFO "Strange, %s not stopped\n", p->comm); - } while_each_thread(g, p); + do { + do_each_thread(g, p) { + /* + * is_user = 0 if kernel thread or borrowed mm, + * 1 otherwise. + */ + int is_user = !!(p->mm && !(p->flags & PF_BORROWED_MM)); + if (!freezeable(p) || (is_user != pass)) + continue; + if (!thaw_process(p)) + printk(KERN_INFO + "Strange, %s not stopped\n", p->comm); + } while_each_thread(g, p); + + pass++; + } while (pass < 2 && all); read_unlock(&tasklist_lock); schedule(); -- cgit v1.2.3 From 2d4a34c9365c6e3f94a5b26ce296e1fce9b66c8b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:29 -0800 Subject: [PATCH] swsusp: Support i386 systems with PAE or without PSE Make swsusp support i386 systems with PAE or without PSE. This is done by creating temporary page tables located in resume-safe page frames before the suspend image is restored in the same way as x86_64 does it. Signed-off-by: Rafael J. Wysocki Cc: Andi Kleen Cc: Dave Jones Cc: Nigel Cunningham Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 825068ca3479..710ed084e7c5 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -78,7 +78,7 @@ config PM_SYSFS_DEPRECATED config SOFTWARE_SUSPEND bool "Software Suspend" - depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) + depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) ---help--- Enable the possibility of suspending the machine. It doesn't need ACPI or APM. -- cgit v1.2.3 From 112cecb2cc0e7341db92281ba04b26c41bb8146d Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Wed, 6 Dec 2006 20:34:31 -0800 Subject: [PATCH] suspend: don't change cpus_allowed for task initiating the suspend Don't modify the cpus_allowed of the task initiating the suspend. _cpu_down() already makes sure that the task doing the suspend doesn't run on dying cpu. Signed-off-by: Suresh Siddha Cc: Venkatesh Pallipadi Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 272254f20d97..9124669f4586 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -270,11 +270,7 @@ int disable_nonboot_cpus(void) goto out; } } - error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); - if (error) { - printk(KERN_ERR "Could not run on CPU%d\n", first_cpu); - goto out; - } + /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ -- cgit v1.2.3 From 0d3a9abe8ae055e1052295698bcd0722c92eff47 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:32 -0800 Subject: [PATCH] swsusp: Measure memory shrinking time Make swsusp measure and print the time needed to shrink memory during the suspend. Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 4 ++++ kernel/power/swap.c | 24 ++---------------------- kernel/power/swsusp.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 3763343bde2f..3afa5dbe6bc5 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -171,3 +171,7 @@ extern int swsusp_read(void); extern int swsusp_write(void); extern void swsusp_close(void); extern int suspend_enter(suspend_state_t state); + +struct timeval; +extern void swsusp_show_speed(struct timeval *, struct timeval *, + unsigned int, char *); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 52e70ca832a8..dedf8797b723 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -133,26 +133,6 @@ static int wait_on_bio_chain(struct bio **bio_chain) return ret; } -static void show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) -{ - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; - - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); - centisecs = elapsed_centisecs64; - if (centisecs == 0) - centisecs = 1; /* avoid div-by-zero */ - k = nr_pages * (PAGE_SIZE / 1024); - kps = (k * 100) / centisecs; - printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); -} - /* * Saving part */ @@ -375,7 +355,7 @@ static int save_image(struct swap_map_handle *handle, error = err2; if (!error) printk("\b\b\b\bdone\n"); - show_speed(&start, &stop, nr_to_write, "Wrote"); + swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); return error; } @@ -562,7 +542,7 @@ static int load_image(struct swap_map_handle *handle, if (!snapshot_image_loaded(snapshot)) error = -ENODATA; } - show_speed(&start, &stop, nr_to_read, "Read"); + swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return error; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 68de5c1dbd78..aa31432bbd97 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "power.h" @@ -163,6 +164,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) } } +/** + * swsusp_show_speed - print the time elapsed between two events represented by + * @start and @stop + * + * @nr_pages - number of pages processed between @start and @stop + * @msg - introductory message to print + */ + +void swsusp_show_speed(struct timeval *start, struct timeval *stop, + unsigned nr_pages, char *msg) +{ + s64 elapsed_centisecs64; + int centisecs; + int k; + int kps; + + elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + centisecs = elapsed_centisecs64; + if (centisecs == 0) + centisecs = 1; /* avoid div-by-zero */ + k = nr_pages * (PAGE_SIZE / 1024); + kps = (k * 100) / centisecs; + printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, + centisecs / 100, centisecs % 100, + kps / 1000, (kps % 1000) / 10); +} + /** * swsusp_shrink_memory - Try to free as much memory as needed * @@ -187,8 +216,10 @@ int swsusp_shrink_memory(void) unsigned long pages = 0; unsigned int i = 0; char *p = "-\\|/"; + struct timeval start, stop; printk("Shrinking memory... "); + do_gettimeofday(&start); do { long size, highmem_size; @@ -222,7 +253,9 @@ int swsusp_shrink_memory(void) } printk("\b%c", p[i++%4]); } while (tmp > 0); + do_gettimeofday(&stop); printk("\bdone (%lu pages freed)\n", pages); + swsusp_show_speed(&start, &stop, pages, "Freed"); return 0; } -- cgit v1.2.3 From 3eb1b3a40722cbb46631db373af66d13d1e7ac81 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:34 -0800 Subject: [PATCH] suspend to disk fails if gdb is suspended with a traced child Fix http://bugzilla.kernel.org/show_bug.cgi?id=7534 Fix the freezing of processes so that it won't fail if there is a traced process the parent of which has been stopped. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: maurice barnum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index cba8a5890eda..1badb9a89ade 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -97,7 +97,9 @@ int freeze_processes(void) continue; if (frozen(p)) continue; - if (p->state == TASK_TRACED && frozen(p->parent)) { + if (p->state == TASK_TRACED && + (frozen(p->parent) || + p->parent->state == TASK_STOPPED)) { cancel_freezing(p); continue; } -- cgit v1.2.3 From a6d70980602e6f1869ebcdcbfaf55a0a5941583e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 6 Dec 2006 20:34:35 -0800 Subject: [PATCH] convert pm_sem to a mutex The power management semaphore is only used as mutex, so convert it. [akpm@osdl.org: fix rotten bug] Signed-off-by: Stephen Hemminger Acked-by: Ingo Molnar Acked-by: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 16 ++++++++-------- kernel/power/main.c | 10 +++++----- kernel/power/power.h | 4 +++- kernel/power/user.c | 24 ++++++++++++------------ 4 files changed, 28 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 53b3b57c0223..08d9e7aac0f8 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -213,10 +213,10 @@ static int software_resume(void) { int error; - down(&pm_sem); + mutex_lock(&pm_mutex); if (!swsusp_resume_device) { if (!strlen(resume_file)) { - up(&pm_sem); + mutex_unlock(&pm_mutex); return -ENOENT; } swsusp_resume_device = name_to_dev_t(resume_file); @@ -231,7 +231,7 @@ static int software_resume(void) * FIXME: If noresume is specified, we need to find the partition * and reset it back to normal swap space. */ - up(&pm_sem); + mutex_unlock(&pm_mutex); return 0; } @@ -275,7 +275,7 @@ static int software_resume(void) unprepare_processes(); Done: /* For success case, the suspend path will release the lock */ - up(&pm_sem); + mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return 0; } @@ -336,7 +336,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) p = memchr(buf, '\n', n); len = p ? p - buf : n; - down(&pm_sem); + mutex_lock(&pm_mutex); for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { if (!strncmp(buf, pm_disk_modes[i], len)) { mode = i; @@ -360,7 +360,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) pr_debug("PM: suspend-to-disk mode set to '%s'\n", pm_disk_modes[mode]); - up(&pm_sem); + mutex_unlock(&pm_mutex); return error ? error : n; } @@ -385,9 +385,9 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) if (maj != MAJOR(res) || min != MINOR(res)) goto out; - down(&pm_sem); + mutex_lock(&pm_mutex); swsusp_resume_device = res; - up(&pm_sem); + mutex_unlock(&pm_mutex); printk("Attempting manual resume\n"); noresume = 0; software_resume(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 6096c71b182b..751157b7897e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -25,7 +25,7 @@ /*This is just an arbitrary number */ #define FREE_PAGE_NUMBER (100) -DECLARE_MUTEX(pm_sem); +DEFINE_MUTEX(pm_mutex); struct pm_ops *pm_ops; suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; @@ -37,9 +37,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; void pm_set_ops(struct pm_ops * ops) { - down(&pm_sem); + mutex_lock(&pm_mutex); pm_ops = ops; - up(&pm_sem); + mutex_unlock(&pm_mutex); } @@ -183,7 +183,7 @@ static int enter_state(suspend_state_t state) if (!valid_state(state)) return -ENODEV; - if (down_trylock(&pm_sem)) + if (!mutex_trylock(&pm_mutex)) return -EBUSY; if (state == PM_SUSPEND_DISK) { @@ -201,7 +201,7 @@ static int enter_state(suspend_state_t state) pr_debug("PM: Finishing wakeup.\n"); suspend_finish(state); Unlock: - up(&pm_sem); + mutex_unlock(&pm_mutex); return error; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 3afa5dbe6bc5..eb461b816bf4 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void) return -EPERM; } #endif -extern struct semaphore pm_sem; + +extern struct mutex pm_mutex; + #define power_attr(_name) \ static struct subsys_attribute _name##_attr = { \ .attr = { \ diff --git a/kernel/power/user.c b/kernel/power/user.c index 26c66941c001..905dc269c3fc 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -79,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp) free_all_swap_pages(data->swap, data->bitmap); free_bitmap(data->bitmap); if (data->frozen) { - down(&pm_sem); + mutex_lock(&pm_mutex); thaw_processes(); enable_nonboot_cpus(); - up(&pm_sem); + mutex_unlock(&pm_mutex); } atomic_inc(&device_available); return 0; @@ -144,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case SNAPSHOT_FREEZE: if (data->frozen) break; - down(&pm_sem); + mutex_lock(&pm_mutex); error = disable_nonboot_cpus(); if (!error) { error = freeze_processes(); @@ -154,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EBUSY; } } - up(&pm_sem); + mutex_unlock(&pm_mutex); if (!error) data->frozen = 1; break; @@ -162,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case SNAPSHOT_UNFREEZE: if (!data->frozen) break; - down(&pm_sem); + mutex_lock(&pm_mutex); thaw_processes(); enable_nonboot_cpus(); - up(&pm_sem); + mutex_unlock(&pm_mutex); data->frozen = 0; break; @@ -174,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - down(&pm_sem); + mutex_lock(&pm_mutex); /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); if (!error) { @@ -187,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, } resume_console(); } - up(&pm_sem); + mutex_unlock(&pm_mutex); if (!error) error = put_user(in_suspend, (unsigned int __user *)arg); if (!error) @@ -201,7 +201,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - down(&pm_sem); + mutex_lock(&pm_mutex); pm_prepare_console(); suspend_console(); error = device_suspend(PMSG_PRETHAW); @@ -211,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, } resume_console(); pm_restore_console(); - up(&pm_sem); + mutex_unlock(&pm_mutex); break; case SNAPSHOT_FREE: @@ -286,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; } - if (down_trylock(&pm_sem)) { + if (!mutex_trylock(&pm_mutex)) { error = -EBUSY; break; } @@ -314,7 +314,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, pm_ops->finish(PM_SUSPEND_MEM); OutS3: - up(&pm_sem); + mutex_unlock(&pm_mutex); break; case SNAPSHOT_PMOPS: -- cgit v1.2.3 From a9b6f562f14dc28fb4b2415f0f275cede0abe9b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:37 -0800 Subject: [PATCH] swsusp: Untangle thaw_processes Move the loop from thaw_processes() to a separate function and call it independently for kernel threads and user space processes so that the order of thawing tasks is clearly visible. Drop thaw_kernel_threads() which is never used. Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 1badb9a89ade..fd0ebb942f50 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -20,6 +20,8 @@ */ #define TIMEOUT (20 * HZ) +#define FREEZER_KERNEL_THREADS 0 +#define FREEZER_USER_SPACE 1 static inline int freezeable(struct task_struct * p) { @@ -79,6 +81,11 @@ static void cancel_freezing(struct task_struct *p) } } +static inline int is_user_space(struct task_struct *p) +{ + return p->mm && !(p->flags & PF_BORROWED_MM); +} + /* 0 = success, else # of processes that we failed to stop */ int freeze_processes(void) { @@ -103,10 +110,9 @@ int freeze_processes(void) cancel_freezing(p); continue; } - if (p->mm && !(p->flags & PF_BORROWED_MM)) { - /* The task is a user-space one. - * Freeze it unless there's a vfork completion - * pending + if (is_user_space(p)) { + /* Freeze the task unless there is a vfork + * completion pending */ if (!p->vfork_done) freeze_process(p); @@ -155,31 +161,30 @@ int freeze_processes(void) return 0; } -void thaw_some_processes(int all) +static void thaw_tasks(int thaw_user_space) { struct task_struct *g, *p; - int pass = 0; /* Pass 0 = Kernel space, 1 = Userspace */ - printk("Restarting tasks... "); read_lock(&tasklist_lock); - do { - do_each_thread(g, p) { - /* - * is_user = 0 if kernel thread or borrowed mm, - * 1 otherwise. - */ - int is_user = !!(p->mm && !(p->flags & PF_BORROWED_MM)); - if (!freezeable(p) || (is_user != pass)) - continue; - if (!thaw_process(p)) - printk(KERN_INFO - "Strange, %s not stopped\n", p->comm); - } while_each_thread(g, p); + do_each_thread(g, p) { + if (!freezeable(p)) + continue; - pass++; - } while (pass < 2 && all); + if (is_user_space(p) == !thaw_user_space) + continue; + if (!thaw_process(p)) + printk(KERN_WARNING " Strange, %s not stopped\n", + p->comm ); + } while_each_thread(g, p); read_unlock(&tasklist_lock); +} + +void thaw_processes(void) +{ + printk("Restarting tasks ... "); + thaw_tasks(FREEZER_KERNEL_THREADS); + thaw_tasks(FREEZER_USER_SPACE); schedule(); printk("done.\n"); } -- cgit v1.2.3 From 11b2ce2ba90f801e2a5ebba4e6b7da72d87f2b13 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:40 -0800 Subject: [PATCH] swsusp: Untangle freeze_processes Move the loop from freeze_processes() to a separate function and call it independently for user space processes and kernel threads so that the order of freezing tasks is clearly visible. Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 84 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index fd0ebb942f50..99eeb119b06d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -86,24 +86,23 @@ static inline int is_user_space(struct task_struct *p) return p->mm && !(p->flags & PF_BORROWED_MM); } -/* 0 = success, else # of processes that we failed to stop */ -int freeze_processes(void) +static unsigned int try_to_freeze_tasks(int freeze_user_space) { - int todo, nr_user, user_frozen; - unsigned long start_time; struct task_struct *g, *p; + unsigned long end_time; + unsigned int todo; - printk("Stopping tasks... "); - start_time = jiffies; - user_frozen = 0; + end_time = jiffies + TIMEOUT; do { - nr_user = todo = 0; + todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { if (!freezeable(p)) continue; + if (frozen(p)) continue; + if (p->state == TASK_TRACED && (frozen(p->parent) || p->parent->state == TASK_STOPPED)) { @@ -111,51 +110,76 @@ int freeze_processes(void) continue; } if (is_user_space(p)) { + if (!freeze_user_space) + continue; + /* Freeze the task unless there is a vfork * completion pending */ if (!p->vfork_done) freeze_process(p); - nr_user++; } else { - /* Freeze only if the user space is frozen */ - if (user_frozen) - freeze_process(p); - todo++; + if (freeze_user_space) + continue; + + freeze_process(p); } + todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); - todo += nr_user; - if (!user_frozen && !nr_user) { - sys_sync(); - start_time = jiffies; - } - user_frozen = !nr_user; yield(); /* Yield is okay here */ - if (todo && time_after(jiffies, start_time + TIMEOUT)) + if (todo && time_after(jiffies, end_time)) break; - } while(todo); + } while (todo); - /* This does not unfreeze processes that are already frozen - * (we have slightly ugly calling convention in that respect, - * and caller must call thaw_processes() if something fails), - * but it cleans up leftover PF_FREEZE requests. - */ if (todo) { + /* This does not unfreeze processes that are already frozen + * (we have slightly ugly calling convention in that respect, + * and caller must call thaw_processes() if something fails), + * but it cleans up leftover PF_FREEZE requests. + */ printk("\n"); - printk(KERN_ERR "Stopping tasks timed out " - "after %d seconds (%d tasks remaining):\n", - TIMEOUT / HZ, todo); + printk(KERN_ERR "Stopping %s timed out after %d seconds " + "(%d tasks refusing to freeze):\n", + freeze_user_space ? "user space processes" : + "kernel threads", + TIMEOUT / HZ, todo); read_lock(&tasklist_lock); do_each_thread(g, p) { + if (is_user_space(p) == !freeze_user_space) + continue; + if (freezeable(p) && !frozen(p)) printk(KERN_ERR " %s\n", p->comm); + cancel_freezing(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); - return todo; } + return todo; +} + +/** + * freeze_processes - tell processes to enter the refrigerator + * + * Returns 0 on success, or the number of processes that didn't freeze, + * although they were told to. + */ +int freeze_processes(void) +{ + unsigned int nr_unfrozen; + + printk("Stopping tasks ... "); + nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); + if (nr_unfrozen) + return nr_unfrozen; + + sys_sync(); + nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); + if (nr_unfrozen) + return nr_unfrozen; + printk("done.\n"); BUG_ON(in_atomic()); return 0; -- cgit v1.2.3 From 5b6d15de2d4c8149902a680a6cd1d3b26cd2e828 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:43 -0800 Subject: [PATCH] swsusp: Fix coding style in suspend.c Fix coding style in suspend.c. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index fd8251d40eb8..712f1524d846 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -85,7 +85,8 @@ unsigned long get_safe_page(gfp_t gfp_mask) return (unsigned long)get_image_page(gfp_mask, PG_SAFE); } -static struct page *alloc_image_page(gfp_t gfp_mask) { +static struct page *alloc_image_page(gfp_t gfp_mask) +{ struct page *page; page = alloc_page(gfp_mask); -- cgit v1.2.3 From 59a493350e7aefff7e262efa39e017517b31b8e8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:44 -0800 Subject: [PATCH] swsusp: Fix labels Move all labels in the swsusp code to the second column, so that they won't fool diff -p. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 6 +++--- kernel/power/snapshot.c | 8 ++++---- kernel/power/swap.c | 4 ++-- kernel/power/swsusp.c | 2 +- kernel/power/user.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 08d9e7aac0f8..126dda61f45d 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -117,9 +117,9 @@ static int prepare_processes(void) return 0; platform_finish(); -thaw: + thaw: thaw_processes(); -enable_cpus: + enable_cpus: enable_nonboot_cpus(); pm_restore_console(); return error; @@ -392,7 +392,7 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) noresume = 0; software_resume(); ret = n; -out: + out: return ret; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 712f1524d846..c024606221c4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -411,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) memory_bm_position_reset(bm); return 0; -Free: + Free: bm->p_list = ca.chain; memory_bm_free(bm, PG_UNSAFE_CLEAR); return -ENOMEM; @@ -557,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) memory_bm_position_reset(bm); return BM_END_OF_MAP; -Return_pfn: + Return_pfn: bm->cur.chunk = chunk; bm->cur.bit = bit; return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; @@ -964,7 +964,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, } return 0; -Free: + Free: swsusp_free(); return -ENOMEM; } @@ -1540,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) } return 0; -Free: + Free: swsusp_free(); return error; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index dedf8797b723..f133d4a6d817 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -301,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, handle->cur_swap = offset; handle->k = 0; } -out: + out: return error; } @@ -429,7 +429,7 @@ int swsusp_write(void) if (error) free_all_swap_pages(root_swap, handle.bitmap); release_swap_writer(&handle); -out: + out: swsusp_close(); return error; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index aa31432bbd97..31aa0390c777 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -288,7 +288,7 @@ int swsusp_suspend(void) * that suspended with irqs off ... no overall powerup. */ device_power_up(); -Enable_irqs: + Enable_irqs: local_irq_enable(); return error; } diff --git a/kernel/power/user.c b/kernel/power/user.c index 905dc269c3fc..069732e5695c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -313,7 +313,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, if (pm_ops->finish) pm_ops->finish(PM_SUSPEND_MEM); -OutS3: + OutS3: mutex_unlock(&pm_mutex); break; -- cgit v1.2.3 From 2d87595ea628ea58415ba4638c553a8c2fbd90e2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:47 -0800 Subject: [PATCH] PM: Fix swsusp debug mode testproc The 'testproc' swsusp debug mode thaws tasks twice in a row, which is _very_ confusing. Fix that. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 126dda61f45d..6180379d5b84 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -151,7 +151,7 @@ int pm_suspend_disk(void) return error; if (pm_disk_mode == PM_DISK_TESTPROC) - goto Thaw; + return 0; suspend_console(); error = device_suspend(PMSG_FREEZE); -- cgit v1.2.3 From 5045cfc103566878228ca36d05a0ae0076673e5a Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 6 Dec 2006 20:34:48 -0800 Subject: [PATCH] swsusp: kill write-only variable Cleanup write-only variable, suggested by D Binderman. Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 6180379d5b84..0b00f56c2ad0 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -58,12 +58,10 @@ static inline int platform_prepare(void) static void power_down(suspend_disk_method_t mode) { - int error = 0; - switch(mode) { case PM_DISK_PLATFORM: kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = pm_ops->enter(PM_SUSPEND_DISK); + pm_ops->enter(PM_SUSPEND_DISK); break; case PM_DISK_SHUTDOWN: kernel_power_off(); -- cgit v1.2.3 From 341a595850dac1b0503df34260257d71b4fdf72c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:49 -0800 Subject: [PATCH] Support for freezeable workqueues Make it possible to create a workqueue the worker thread of which will be frozen during suspend, along with other kernel threads. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Cc: David Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8d1e7cb8a51a..2945b094d871 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * The per-CPU workqueue (if single thread, we always use the first @@ -55,6 +56,8 @@ struct cpu_workqueue_struct { struct task_struct *thread; int run_depth; /* Detect run_workqueue() recursion depth */ + + int freezeable; /* Freeze the thread during suspend */ } ____cacheline_aligned; /* @@ -265,7 +268,8 @@ static int worker_thread(void *__cwq) struct k_sigaction sa; sigset_t blocked; - current->flags |= PF_NOFREEZE; + if (!cwq->freezeable) + current->flags |= PF_NOFREEZE; set_user_nice(current, -5); @@ -288,6 +292,9 @@ static int worker_thread(void *__cwq) set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { + if (cwq->freezeable) + try_to_freeze(); + add_wait_queue(&cwq->more_work, &wait); if (list_empty(&cwq->worklist)) schedule(); @@ -364,7 +371,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) EXPORT_SYMBOL_GPL(flush_workqueue); static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, - int cpu) + int cpu, int freezeable) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); struct task_struct *p; @@ -374,6 +381,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, cwq->thread = NULL; cwq->insert_sequence = 0; cwq->remove_sequence = 0; + cwq->freezeable = freezeable; INIT_LIST_HEAD(&cwq->worklist); init_waitqueue_head(&cwq->more_work); init_waitqueue_head(&cwq->work_done); @@ -389,7 +397,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, } struct workqueue_struct *__create_workqueue(const char *name, - int singlethread) + int singlethread, int freezeable) { int cpu, destroy = 0; struct workqueue_struct *wq; @@ -409,7 +417,7 @@ struct workqueue_struct *__create_workqueue(const char *name, mutex_lock(&workqueue_mutex); if (singlethread) { INIT_LIST_HEAD(&wq->list); - p = create_workqueue_thread(wq, singlethread_cpu); + p = create_workqueue_thread(wq, singlethread_cpu, freezeable); if (!p) destroy = 1; else @@ -417,7 +425,7 @@ struct workqueue_struct *__create_workqueue(const char *name, } else { list_add(&wq->list, &workqueues); for_each_online_cpu(cpu) { - p = create_workqueue_thread(wq, cpu); + p = create_workqueue_thread(wq, cpu, freezeable); if (p) { kthread_bind(p, cpu); wake_up_process(p); @@ -667,7 +675,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, mutex_lock(&workqueue_mutex); /* Create a new workqueue thread for it. */ list_for_each_entry(wq, &workqueues, list) { - if (!create_workqueue_thread(wq, hotcpu)) { + if (!create_workqueue_thread(wq, hotcpu, 0)) { printk("workqueue for %i failed\n", hotcpu); return NOTIFY_BAD; } -- cgit v1.2.3 From ed07536ed6731775219c1df7fa26a7588753e693 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Dec 2006 20:35:24 -0800 Subject: [PATCH] lockdep: annotate nfs/nfsd in-kernel sockets Stick NFS sockets in their own class to avoid some lockdep warnings. NFS sockets are never exposed to user-space, and will hence not trigger certain code paths that would otherwise pose deadlock scenarios. [akpm@osdl.org: cleanups] Signed-off-by: Peter Zijlstra Signed-off-by: Steven Dickson Acked-by: Ingo Molnar Cc: Trond Myklebust Acked-by: Neil Brown Cc: "David S. Miller" Signed-off-by: Andrew Morton [ Fixed patch corruption by quilt, pointed out by Peter Zijlstra ] Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c9fefdb1a7db..e33f6207f5b3 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2645,6 +2645,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); static void print_held_locks_bug(struct task_struct *curr) { -- cgit v1.2.3 From 07354a00901d103085e4376b7df0aad264c1836a Mon Sep 17 00:00:00 2001 From: "Adam B. Jerome" Date: Wed, 6 Dec 2006 20:35:30 -0800 Subject: [PATCH] /proc/kallsyms reports lower-case types for some non-exported symbols This patch addresses incorrect symbol type information reported through /proc/kallsyms. A lowercase character should designate the symbol as local (or non-exported). An uppercase character should designate the symbol as global (or external). Without this patch, some non-exported symbols are incorrectly assigned an upper-case designation in /proc/kallsyms. This patch corrects this condition by converting non-exported symbols types to lower case when appropriate and eliminates the superfluous upcase_if_global function Signed-off-by: Adam B. Jerome Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index eeac3e313b2b..54befe36ee0b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -20,6 +20,7 @@ #include #include /* for cond_resched */ #include +#include #include @@ -301,13 +302,6 @@ struct kallsym_iter char name[KSYM_NAME_LEN+1]; }; -/* Only label it "global" if it is exported. */ -static void upcase_if_global(struct kallsym_iter *iter) -{ - if (is_exported(iter->name, iter->owner)) - iter->type += 'A' - 'a'; -} - static int get_ksymbol_mod(struct kallsym_iter *iter) { iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, @@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) if (iter->owner == NULL) return 0; - upcase_if_global(iter); + /* Label it "global" if it is exported, "local" if not exported. */ + iter->type = is_exported(iter->name, iter->owner) + ? toupper(iter->type) : tolower(iter->type); + return 1; } -- cgit v1.2.3 From e59e2ae2c29700117a54e85c106017c24837119f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Dec 2006 20:35:59 -0800 Subject: [PATCH] SysRq-X: show blocked tasks Add SysRq-X support: show blocked (TASK_UNINTERRUPTIBLE) tasks only. Useful for debugging IO stalls. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 12fdbef1d9bf..1848e280504d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4804,7 +4804,7 @@ static void show_task(struct task_struct *p) show_stack(p, NULL); } -void show_state(void) +void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; @@ -4824,11 +4824,16 @@ void show_state(void) * console might take alot of time: */ touch_nmi_watchdog(); - show_task(p); + if (p->state & state_filter) + show_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); - debug_show_all_locks(); + /* + * Only show locks if all tasks are dumped: + */ + if (state_filter == -1) + debug_show_all_locks(); } /** -- cgit v1.2.3 From 4cf303487d5dddaace2daca8437c555f3f0bc1aa Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 6 Dec 2006 20:36:06 -0800 Subject: [PATCH] Export pm_suspend for the shared APM emulation The new shared APM emulation just like its ARM and MIPS predecessors uses pm_suspend() which was only exported on SH. Move export to close to it's definition where it really should be anyway. Signed-off-by: Ralf Baechle Cc: Russell King Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 751157b7897e..500eb87f643d 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -8,6 +8,7 @@ * */ +#include #include #include #include @@ -230,7 +231,7 @@ int pm_suspend(suspend_state_t state) return -EINVAL; } - +EXPORT_SYMBOL(pm_suspend); decl_subsys(power,NULL,NULL); -- cgit v1.2.3 From 696040670a12f66b17a839011f96d9ca376f688b Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 6 Dec 2006 20:36:15 -0800 Subject: [PATCH] cpuset: minor code refinements A couple of minor code simplifications to the kernel/cpuset.c code. No functional change. Just a little less code and a little more readable. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6313c38c930e..bd1e89c4c96a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) } /* Remaining checks don't apply to root cpuset */ - if ((par = cur->parent) == NULL) + if (cur == &top_cpuset) return 0; + par = cur->parent; + /* We must be a subset of our parent cpuset */ if (!is_cpuset_subset(trial, par)) return -EACCES; @@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); mutex_lock(&callback_mutex); - if (turning_on) - set_bit(bit, &cs->flags); - else - clear_bit(bit, &cs->flags); + cs->flags = trialcs.flags; mutex_unlock(&callback_mutex); if (cpu_exclusive_changed) -- cgit v1.2.3 From 910b1b2e6d7d10e1c3bffdd12a90ec82f535f9a5 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Thu, 7 Dec 2006 10:45:25 +0100 Subject: [PATCH] lockdep: internal locking fixes Here are mainly some lockdep returns with 0 with unlocking fixes. Signed-off-by: Jarek Poplawski Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e33f6207f5b3..f76a24f2ac20 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -237,8 +237,10 @@ e locks. */ trace->max_entries = trace->nr_entries; nr_stack_trace_entries += trace->nr_entries; - if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) + if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) { + __raw_spin_unlock(&hash_lock); return 0; + } if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { __raw_spin_unlock(&hash_lock); @@ -474,7 +476,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 0; entry->class = this; - save_trace(&entry->trace); + if (!save_trace(&entry->trace)) + return 0; /* * Since we never remove from the dependency list, the list can @@ -562,8 +565,12 @@ static noinline int print_circular_bug_tail(void) if (debug_locks_silent) return 0; + /* hash_lock unlocked by the header */ + __raw_spin_lock(&hash_lock); this.class = check_source->class; - save_trace(&this.trace); + if (!save_trace(&this.trace)) + return 0; + __raw_spin_unlock(&hash_lock); print_circular_bug_entry(&this, 0); printk("\nother info that might help us debug this:\n\n"); @@ -966,14 +973,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, &prev->class->locks_after, next->acquire_ip); if (!ret) return 0; - /* - * Return value of 2 signals 'dependency already added', - * in that case we dont have to add the backlink either. - */ - if (ret == 2) - return 2; + ret = add_lock_to_list(next->class, prev->class, &next->class->locks_before, next->acquire_ip); + if (!ret) + return 0; /* * Debugging printouts: @@ -1025,7 +1029,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2) { - check_prev_add(curr, hlock, next); + if (!check_prev_add(curr, hlock, next)) + return 0; /* * Stop after the first non-trylock entry, * as non-trylock entries have added their -- cgit v1.2.3 From b23984d0a12a4821b2e9712c71550f321eb88bb5 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Wed, 6 Dec 2006 20:36:23 -0800 Subject: [PATCH] lockdep: misc fixes in lockdep.c - numeric string size replaced with constant in print_lock_name and print_lockdep_cache, - return on null pointer in print_lock_dependencies, - one more lockdep return with 0 with unlocking fix in mark_lock. Signed-off-by: Jarek Poplawski Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f76a24f2ac20..300d61bb314b 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -359,7 +359,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4 static void print_lock_name(struct lock_class *class) { - char str[128], c1, c2, c3, c4; + char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4; const char *name; get_usage_chars(class, &c1, &c2, &c3, &c4); @@ -381,7 +381,7 @@ static void print_lock_name(struct lock_class *class) static void print_lockdep_cache(struct lockdep_map *lock) { const char *name; - char str[128]; + char str[KSYM_NAME_LEN + 1]; name = lock->name; if (!name) @@ -451,7 +451,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth) print_lock_class_header(class, depth); list_for_each_entry(entry, &class->locks_after, entry) { - DEBUG_LOCKS_WARN_ON(!entry->class); + if (DEBUG_LOCKS_WARN_ON(!entry->class)) + return; + print_lock_dependencies(entry->class, depth + 1); printk("%*s ... acquired at:\n",depth,""); @@ -1733,6 +1735,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, debug_atomic_dec(&nr_unused_locks); break; default: + __raw_spin_unlock(&hash_lock); debug_locks_off(); WARN_ON(1); return 0; -- cgit v1.2.3 From fec1d0115240593b39898289e6e1413ea6e44a84 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 6 Dec 2006 20:36:34 -0800 Subject: [PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit The CLONE_CHILD_CLEARTID flag is used by NPTL to have its threads communicate via memory/futex when they exit, so pthread_join can synchronize using a simple futex wait. The word of user memory where NPTL stores a thread's own TID is what it passes; this gets reset to zero at thread exit. It is not desireable to touch this user memory when threads are dying due to a fatal signal. A core dump is more usefully representative of the dying program state if the threads live at the time of the crash have their NPTL data structures unperturbed. The userland expectation of CLONE_CHILD_CLEARTID has only ever been that it works for a thread making an _exit system call. This problem was identified by Ernie Petrides . Signed-off-by: Roland McGrath Cc: Ernie Petrides Cc: Jakub Jelinek Acked-by: Ingo Molnar Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2cf74edd3295..f37980df1d58 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -448,7 +448,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) tsk->vfork_done = NULL; complete(vfork_done); } - if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { + + /* + * If we're exiting normally, clear a user-space tid field if + * requested. We leave this alone when dying by signal, to leave + * the value intact in a core dump, and to save the unnecessary + * trouble otherwise. Userland only wants this done for a sys_exit. + */ + if (tsk->clear_child_tid + && !(tsk->flags & PF_SIGNALED) + && atomic_read(&mm->mm_users) > 1) { u32 __user * tidptr = tsk->clear_child_tid; tsk->clear_child_tid = NULL; -- cgit v1.2.3 From 064b022c7adb2d853378078a9dc141f8288d1c73 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 6 Dec 2006 20:36:37 -0800 Subject: [PATCH] profile: fix uaccess handling Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index f940b462eec9..15b012df4ff1 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -442,7 +442,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) read = 0; while (p < sizeof(unsigned int) && count > 0) { - put_user(*((char *)(&sample_step)+p),buf); + if (put_user(*((char *)(&sample_step)+p),buf)) + return -EFAULT; buf++; p++; count--; read++; } pnt = (char *)prof_buffer + p - sizeof(atomic_t); -- cgit v1.2.3 From 128fb95650b3273a8dc9ba5514b6fe7db8ea30bf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 6 Dec 2006 20:36:50 -0800 Subject: [PATCH] taskstats_exit_alloc: optimize/simplify If there are no listeners, every task does unneeded kmem_cache alloc/free on exit. We don't need listeners->sem for 'if (!list_empty())' check. Yes, we may have a false positive, but this doesn't differ from the case when the listener is unregistered after we drop the semaphore. So we don't need to do allocation beforehand. Signed-off-by: Oleg Nesterov Cc: Balbir Singh Acked-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f5f92014ae98..d9d7c3576238 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -416,7 +416,6 @@ err: void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) { struct listener_list *listeners; - struct taskstats *tmp; /* * This is the cpu on which the task is exiting currently and will * be the one for which the exit event is sent, even if the cpu @@ -424,19 +423,11 @@ void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) */ *mycpu = raw_smp_processor_id(); - *ptidstats = NULL; - tmp = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); - if (!tmp) - return; - listeners = &per_cpu(listener_array, *mycpu); - down_read(&listeners->sem); - if (!list_empty(&listeners->list)) { - *ptidstats = tmp; - tmp = NULL; - } - up_read(&listeners->sem); - kfree(tmp); + + *ptidstats = NULL; + if (!list_empty(&listeners->list)) + *ptidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); } /* Send pid data out on exit */ -- cgit v1.2.3 From 115085ea0794c0f339be8f9d25505c7f9861d824 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 6 Dec 2006 20:36:51 -0800 Subject: [PATCH] taskstats: cleanup do_exit() path do_exit: taskstats_exit_alloc() ... taskstats_exit_send() taskstats_exit_free() I think this is not good, let it be a single function exported to the core kernel, taskstats_exit(), which does alloc + send + free itself. Signed-off-by: Oleg Nesterov Cc: Balbir Singh Cc: Shailabh Nagar Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++------ kernel/taskstats.c | 41 +++++++++++++++-------------------------- 2 files changed, 17 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 06de6c4e8ca3..4e3f919edc48 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -850,9 +850,7 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; - struct taskstats *tidstats; int group_dead; - unsigned int mycpu; profile_task_exit(tsk); @@ -890,8 +888,6 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); - taskstats_exit_alloc(&tidstats, &mycpu); - acct_update_integrals(tsk); if (tsk->mm) { update_hiwater_rss(tsk->mm); @@ -911,8 +907,8 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); - taskstats_exit_send(tsk, tidstats, group_dead, mycpu); - taskstats_exit_free(tidstats); + + taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d9d7c3576238..2654886fe058 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -119,10 +119,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid) /* * Send taskstats data in @skb to listeners registered for @cpu's exit data */ -static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +static void send_cpu_listeners(struct sk_buff *skb, + struct listener_list *listeners) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); - struct listener_list *listeners; struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); @@ -135,7 +135,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) } rc = 0; - listeners = &per_cpu(listener_array, cpu); down_read(&listeners->sem); list_for_each_entry(s, &listeners->list, list) { skb_next = NULL; @@ -413,28 +412,12 @@ err: return rc; } -void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) -{ - struct listener_list *listeners; - /* - * This is the cpu on which the task is exiting currently and will - * be the one for which the exit event is sent, even if the cpu - * on which this function is running changes later. - */ - *mycpu = raw_smp_processor_id(); - - listeners = &per_cpu(listener_array, *mycpu); - - *ptidstats = NULL; - if (!list_empty(&listeners->list)) - *ptidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); -} - /* Send pid data out on exit */ -void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - int group_dead, unsigned int mycpu) +void taskstats_exit(struct task_struct *tsk, int group_dead) { int rc; + struct listener_list *listeners; + struct taskstats *tidstats; struct sk_buff *rep_skb; void *reply; size_t size; @@ -458,12 +441,17 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, fill_tgid_exit(tsk); } + listeners = &__raw_get_cpu_var(listener_array); + if (list_empty(&listeners->list)) + return; + + tidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); if (!tidstats) return; rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); if (rc < 0) - goto ret; + goto free_stats; rc = fill_pid(tsk->pid, tsk, tidstats); if (rc < 0) @@ -492,15 +480,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, nla_nest_end(rep_skb, na); send: - send_cpu_listeners(rep_skb, mycpu); + send_cpu_listeners(rep_skb, listeners); +free_stats: + kmem_cache_free(taskstats_cache, tidstats); return; nla_put_failure: genlmsg_cancel(rep_skb, reply); err_skb: nlmsg_free(rep_skb); -ret: - return; + goto free_stats; } static struct genl_ops taskstats_ops = { -- cgit v1.2.3 From 34ec12349c8a9505adc59d72f92b4595bc2483ff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 6 Dec 2006 20:36:52 -0800 Subject: [PATCH] taskstats: cleanup ->signal->stats allocation Allocate ->signal->stats on demand in taskstats_exit(), this allows us to remove taskstats_tgid_alloc() (the last non-trivial inline) from taskstat's public interface. Signed-off-by: Oleg Nesterov Cc: Balbir Singh Cc: Shailabh Nagar Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 - kernel/taskstats.c | 26 +++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f37980df1d58..658838148647 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -847,7 +847,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); - taskstats_tgid_alloc(current); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 2654886fe058..7d793d6b1e90 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -412,6 +412,30 @@ err: return rc; } +static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct taskstats *stats; + + if (sig->stats || thread_group_empty(tsk)) + goto ret; + + /* No problem if kmem_cache_zalloc() fails */ + stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); + + spin_lock_irq(&tsk->sighand->siglock); + if (!sig->stats) { + sig->stats = stats; + stats = NULL; + } + spin_unlock_irq(&tsk->sighand->siglock); + + if (stats) + kmem_cache_free(taskstats_cache, stats); +ret: + return sig->stats; +} + /* Send pid data out on exit */ void taskstats_exit(struct task_struct *tsk, int group_dead) { @@ -433,7 +457,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); - is_thread_group = (tsk->signal->stats != NULL); + is_thread_group = !!taskstats_tgid_alloc(tsk); if (is_thread_group) { /* PID + STATS + TGID + STATS */ size = 2 * size; -- cgit v1.2.3 From 68062b86fc0f480b806d270a8278709a5a41bb67 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 6 Dec 2006 20:36:53 -0800 Subject: [PATCH] taskstats: factor out reply assembling Introduce mk_reply() helper which does all nla_put()s on reply. Saves 453 bytes and a preparation for the next patch. Signed-off-by: Oleg Nesterov Acked-by: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Cc: Thomas Graf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 55 +++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 7d793d6b1e90..d2a41339f37b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -343,6 +343,25 @@ static int parse(struct nlattr *na, cpumask_t *mask) return ret; } +static int mk_reply(struct sk_buff *skb, int type, u32 pid, struct taskstats *stats) +{ + struct nlattr *na; + int aggr; + + aggr = TASKSTATS_TYPE_AGGR_TGID; + if (type == TASKSTATS_TYPE_PID) + aggr = TASKSTATS_TYPE_AGGR_PID; + + na = nla_nest_start(skb, aggr); + NLA_PUT_U32(skb, type, pid); + NLA_PUT_TYPE(skb, struct taskstats, TASKSTATS_TYPE_STATS, *stats); + nla_nest_end(skb, na); + + return 0; +nla_put_failure: + return -1; +} + static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { int rc = 0; @@ -350,7 +369,6 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) struct taskstats stats; void *reply; size_t size; - struct nlattr *na; cpumask_t mask; rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); @@ -382,27 +400,21 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) if (rc < 0) goto err; - na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); - NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); - NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - stats); + if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid, &stats)) + goto nla_put_failure; } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); rc = fill_tgid(tgid, NULL, &stats); if (rc < 0) goto err; - na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); - NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); - NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - stats); + if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid, &stats)) + goto nla_put_failure; } else { rc = -EINVAL; goto err; } - nla_nest_end(rep_skb, na); - return send_reply(rep_skb, info->snd_pid); nla_put_failure: @@ -446,7 +458,6 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) void *reply; size_t size; int is_thread_group; - struct nlattr *na; if (!family_registered) return; @@ -481,27 +492,17 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (rc < 0) goto err_skb; - na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); - NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); - NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - *tidstats); - nla_nest_end(rep_skb, na); - - if (!is_thread_group) - goto send; + if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid, tidstats)) + goto nla_put_failure; /* * Doesn't matter if tsk is the leader or the last group member leaving */ - if (!group_dead) + if (!is_thread_group || !group_dead) goto send; - na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); - NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); - /* No locking needed for tsk->signal->stats since group is dead */ - NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - *tsk->signal->stats); - nla_nest_end(rep_skb, na); + if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid, tsk->signal->stats)) + goto nla_put_failure; send: send_cpu_listeners(rep_skb, listeners); -- cgit v1.2.3 From 51de4d90852ba4cfa5743594ec4a7f158b52dc43 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 6 Dec 2006 20:36:54 -0800 Subject: [PATCH] taskstats: use nla_reserve() for reply assembling Currently taskstats_user_cmd()/taskstats_exit() do: 1) allocate stats 2) fill stats 3) make a temporary copy on stack (236 bytes) 4) copy that copy to skb 5) free stats With the help of nla_reserve() we can operate on skb->data directly, thus avoiding all these steps except 2). So, before this patch: // copy *stats to skb->data int mk_reply(skb, ..., struct taskstats *stats); fill_pid(stats); mk_reply(skb, ..., stats); After: // return a pointer to skb->data struct taskstats *mk_reply(skb, ...); stat = mk_reply(skb, ...); fill_pid(stats); Shrinks taskatsks.o by 162 bytes. A stupid benchmark (send one million TASKSTATS_CMD_ATTR_PID) shows the real user sys before: 4.02 0.06 3.96 4.02 0.04 3.98 4.02 0.04 3.97 after: 3.86 0.08 3.78 3.88 0.10 3.77 3.89 0.09 3.80 but this looks suspiciously good. Signed-off-by: Oleg Nesterov Acked-by: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Cc: Thomas Graf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 86 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d2a41339f37b..b0aad99c4f8d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -185,6 +185,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, } else get_task_struct(tsk); + memset(stats, 0, sizeof(*stats)); /* * Each accounting subsystem adds calls to its functions to * fill in relevant parts of struct taskstsats as follows @@ -227,6 +228,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, if (first->signal->stats) memcpy(stats, first->signal->stats, sizeof(*stats)); + else + memset(stats, 0, sizeof(*stats)); tsk = first; do { @@ -343,9 +346,9 @@ static int parse(struct nlattr *na, cpumask_t *mask) return ret; } -static int mk_reply(struct sk_buff *skb, int type, u32 pid, struct taskstats *stats) +static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) { - struct nlattr *na; + struct nlattr *na, *ret; int aggr; aggr = TASKSTATS_TYPE_AGGR_TGID; @@ -353,20 +356,23 @@ static int mk_reply(struct sk_buff *skb, int type, u32 pid, struct taskstats *st aggr = TASKSTATS_TYPE_AGGR_PID; na = nla_nest_start(skb, aggr); - NLA_PUT_U32(skb, type, pid); - NLA_PUT_TYPE(skb, struct taskstats, TASKSTATS_TYPE_STATS, *stats); + if (nla_put(skb, type, sizeof(pid), &pid) < 0) + goto err; + ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); + if (!ret) + goto err; nla_nest_end(skb, na); - return 0; -nla_put_failure: - return -1; + return nla_data(ret); +err: + return NULL; } static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { int rc = 0; struct sk_buff *rep_skb; - struct taskstats stats; + struct taskstats *stats; void *reply; size_t size; cpumask_t mask; @@ -389,36 +395,36 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); - memset(&stats, 0, sizeof(stats)); rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); if (rc < 0) return rc; + rc = -EINVAL; if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); - rc = fill_pid(pid, NULL, &stats); - if (rc < 0) - goto err; + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); + if (!stats) + goto nla_err; - if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid, &stats)) - goto nla_put_failure; + rc = fill_pid(pid, NULL, stats); + if (rc < 0) + goto nla_err; } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); - rc = fill_tgid(tgid, NULL, &stats); - if (rc < 0) - goto err; + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); + if (!stats) + goto nla_err; - if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid, &stats)) - goto nla_put_failure; - } else { - rc = -EINVAL; + rc = fill_tgid(tgid, NULL, stats); + if (rc < 0) + goto nla_err; + } else goto err; - } return send_reply(rep_skb, info->snd_pid); -nla_put_failure: - rc = genlmsg_cancel(rep_skb, reply); +nla_err: + genlmsg_cancel(rep_skb, reply); err: nlmsg_free(rep_skb); return rc; @@ -453,7 +459,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) { int rc; struct listener_list *listeners; - struct taskstats *tidstats; + struct taskstats *stats; struct sk_buff *rep_skb; void *reply; size_t size; @@ -480,20 +486,17 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (list_empty(&listeners->list)) return; - tidstats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); - if (!tidstats) - return; - rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); if (rc < 0) - goto free_stats; + return; - rc = fill_pid(tsk->pid, tsk, tidstats); - if (rc < 0) - goto err_skb; + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); + if (!stats) + goto nla_err; - if (mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid, tidstats)) - goto nla_put_failure; + rc = fill_pid(tsk->pid, tsk, stats); + if (rc < 0) + goto nla_err; /* * Doesn't matter if tsk is the leader or the last group member leaving @@ -501,20 +504,19 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (!is_thread_group || !group_dead) goto send; - if (mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid, tsk->signal->stats)) - goto nla_put_failure; + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); + if (!stats) + goto nla_err; + + memcpy(stats, tsk->signal->stats, sizeof(*stats)); send: send_cpu_listeners(rep_skb, listeners); -free_stats: - kmem_cache_free(taskstats_cache, tidstats); return; -nla_put_failure: +nla_err: genlmsg_cancel(rep_skb, reply); -err_skb: nlmsg_free(rep_skb); - goto free_stats; } static struct genl_ops taskstats_ops = { -- cgit v1.2.3 From 37167485302c8876cb0303af113696e88c2945aa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 6 Dec 2006 20:36:55 -0800 Subject: [PATCH] taskstats: cleanup reply assembling Thomas Graf wrote: > > nla_nest_start() may return NULL, either rely on prepare_reply() to be > correct and BUG() on failure or do proper error handling for all > functions. nla_put() in taskstat.c can fail only if the 'size' argument of alloc_skb() was not right. This is a kernel bug, we should not hide it. So add 'BUG()' on error path and check for 'na == NULL'. > genlmsg_cancel() is only required in error paths for dumping > procedures. So we can remove 'genlmsg_cancel()' calls and 'void *reply' (saves 227 bytes). Signed-off-by: Oleg Nesterov Cc: Thomas Graf Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b0aad99c4f8d..4c3476fa058d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -69,7 +69,7 @@ enum actions { }; static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, - void **replyp, size_t size) + size_t size) { struct sk_buff *skb; void *reply; @@ -94,7 +94,6 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, } *skbp = skb; - *replyp = reply; return 0; } @@ -351,11 +350,13 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) struct nlattr *na, *ret; int aggr; - aggr = TASKSTATS_TYPE_AGGR_TGID; - if (type == TASKSTATS_TYPE_PID) - aggr = TASKSTATS_TYPE_AGGR_PID; + aggr = (type == TASKSTATS_TYPE_PID) + ? TASKSTATS_TYPE_AGGR_PID + : TASKSTATS_TYPE_AGGR_TGID; na = nla_nest_start(skb, aggr); + if (!na) + goto err; if (nla_put(skb, type, sizeof(pid), &pid) < 0) goto err; ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); @@ -373,7 +374,6 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) int rc = 0; struct sk_buff *rep_skb; struct taskstats *stats; - void *reply; size_t size; cpumask_t mask; @@ -395,7 +395,7 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); - rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) return rc; @@ -404,27 +404,24 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); if (!stats) - goto nla_err; + goto err; rc = fill_pid(pid, NULL, stats); if (rc < 0) - goto nla_err; + goto err; } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); if (!stats) - goto nla_err; + goto err; rc = fill_tgid(tgid, NULL, stats); if (rc < 0) - goto nla_err; + goto err; } else goto err; return send_reply(rep_skb, info->snd_pid); - -nla_err: - genlmsg_cancel(rep_skb, reply); err: nlmsg_free(rep_skb); return rc; @@ -461,7 +458,6 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) struct listener_list *listeners; struct taskstats *stats; struct sk_buff *rep_skb; - void *reply; size_t size; int is_thread_group; @@ -486,17 +482,17 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (list_empty(&listeners->list)) return; - rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) return; stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); if (!stats) - goto nla_err; + goto err; rc = fill_pid(tsk->pid, tsk, stats); if (rc < 0) - goto nla_err; + goto err; /* * Doesn't matter if tsk is the leader or the last group member leaving @@ -506,16 +502,14 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); if (!stats) - goto nla_err; + goto err; memcpy(stats, tsk->signal->stats, sizeof(*stats)); send: send_cpu_listeners(rep_skb, listeners); return; - -nla_err: - genlmsg_cancel(rep_skb, reply); +err: nlmsg_free(rep_skb); } -- cgit v1.2.3 From a4c410f00f7ca4bd448b0d63f6f882fd244dc991 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Dec 2006 20:37:21 -0800 Subject: [PATCH] lockdep: print current locks on in_atomic warnings Add debug_show_held_locks(current) to __might_sleep() and schedule(); this makes finding the offending lock leak easier. Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1848e280504d..343e1794233e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3333,6 +3333,7 @@ asmlinkage void __sched schedule(void) printk(KERN_ERR "BUG: scheduling while atomic: " "%s/0x%08x/%d\n", current->comm, preempt_count(), current->pid); + debug_show_held_locks(current); dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -6872,6 +6873,7 @@ void __might_sleep(char *file, int line) " context at %s:%d\n", file, line); printk("in_atomic():%d, irqs_disabled():%d\n", in_atomic(), irqs_disabled()); + debug_show_held_locks(current); dump_stack(); } #endif -- cgit v1.2.3 From 6cfd76a26d9fe2ba54b9d496a48c1d9285e5c5ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Dec 2006 20:37:22 -0800 Subject: [PATCH] lockdep: name some old style locks Name some of the remaning 'old_style_spin_init' locks Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 3 ++- kernel/irq/handle.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 0aad5ca36a81..dc12db8600e7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -89,7 +89,8 @@ struct acct_glbs { struct timer_list timer; }; -static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; +static struct acct_glbs acct_globals __cacheline_aligned = + {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; /* * Called whenever the timer says to check the free space. diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a681912bc89a..aff1f0fabb0d 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL #endif -- cgit v1.2.3 From ece8a684c75df215320b4155944979e3f78c5c93 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Dec 2006 20:37:24 -0800 Subject: [PATCH] sleep profiling Implement prof=sleep profiling. TASK_UNINTERRUPTIBLE sleeps will be taken as a profile hit, and every millisecond spent sleeping causes a profile-hit for the call site that initiated the sleep. Sample readprofile output on i386: 306 ps2_sendbyte 1.3973 432 call_usermodehelper_keys 1.9548 484 ps2_command 0.6453 790 __driver_attach 4.7879 1593 msleep 44.2500 3976 sync_buffer 64.1290 4076 do_lookup 12.4648 8587 sync_page 122.6714 20820 total 0.0067 (NOTE: architectures need to check whether get_wchan() can be called from deep within the wakeup path.) akpm: we need to mark more functions __sched. lock_sock(), msleep(), others.. akpm: the contention in do_lookup() is a surprise. Presumably doing disk reads for directory contents while holding i_mutex. [akpm@osdl.org: various fixes] Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 39 ++++++++++++++++++++++++++++++--------- kernel/sched.c | 11 +++++++++++ 2 files changed, 41 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 15b012df4ff1..04fd84e8cdbe 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly; static atomic_t *prof_buffer; static unsigned long prof_len, prof_shift; -static int prof_on __read_mostly; +int prof_on __read_mostly; static cpumask_t prof_cpu_mask = CPU_MASK_ALL; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); @@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex); static int __init profile_setup(char * str) { static char __initdata schedstr[] = "schedule"; + static char __initdata sleepstr[] = "sleep"; int par; - if (!strncmp(str, schedstr, strlen(schedstr))) { + if (!strncmp(str, sleepstr, strlen(sleepstr))) { + prof_on = SLEEP_PROFILING; + if (str[strlen(sleepstr)] == ',') + str += strlen(sleepstr) + 1; + if (get_option(&str, &par)) + prof_shift = par; + printk(KERN_INFO + "kernel sleep profiling enabled (shift: %ld)\n", + prof_shift); + } else if (!strncmp(str, sleepstr, strlen(sleepstr))) { prof_on = SCHED_PROFILING; if (str[strlen(schedstr)] == ',') str += strlen(schedstr) + 1; @@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister); * positions to which hits are accounted during short intervals (e.g. * several seconds) is usually very small. Exclusion from buffer * flipping is provided by interrupt disablement (note that for - * SCHED_PROFILING profile_hit() may be called from process context). + * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from + * process context). * The hash function is meant to be lightweight as opposed to strong, * and was vaguely inspired by ppc64 firmware-supported inverted * pagetable hash functions, but uses a full hashtable full of finite @@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void) mutex_unlock(&profile_flip_mutex); } -void profile_hit(int type, void *__pc) +void profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long primary, secondary, flags, pc = (unsigned long)__pc; int i, j, cpu; @@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc) put_cpu(); return; } + /* + * We buffer the global profiler buffer into a per-CPU + * queue and thus reduce the number of global (and possibly + * NUMA-alien) accesses. The write-queue is self-coalescing: + */ local_irq_save(flags); do { for (j = 0; j < PROFILE_GRPSZ; ++j) { if (hits[i + j].pc == pc) { - hits[i + j].hits++; + hits[i + j].hits += nr_hits; goto out; } else if (!hits[i + j].hits) { hits[i + j].pc = pc; - hits[i + j].hits = 1; + hits[i + j].hits = nr_hits; goto out; } } i = (i + secondary) & (NR_PROFILE_HIT - 1); } while (i != primary); - atomic_inc(&prof_buffer[pc]); + + /* + * Add the current hit(s) and flush the write-queue out + * to the global buffer: + */ + atomic_add(nr_hits, &prof_buffer[pc]); for (i = 0; i < NR_PROFILE_HIT; ++i) { atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); hits[i].pc = hits[i].hits = 0; @@ -356,14 +377,14 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) -void profile_hit(int type, void *__pc) +void profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; if (prof_on != type || !prof_buffer) return; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; - atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); + atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); } #endif /* !CONFIG_SMP */ diff --git a/kernel/sched.c b/kernel/sched.c index 343e1794233e..75a005ed4eda 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) } #endif + /* + * Sleep time is in units of nanosecs, so shift by 20 to get a + * milliseconds-range estimation of the amount of time that the task + * spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + if (p->state == TASK_UNINTERRUPTIBLE) + profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), + (now - p->timestamp) >> 20); + } + if (!rt_task(p)) p->prio = recalc_task_prio(p, now); -- cgit v1.2.3 From d5abe669172f20a4129a711de0f250a4e07db298 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Dec 2006 20:37:26 -0800 Subject: [PATCH] debug: workqueue locking sanity Workqueue functions should not leak locks, assert so, printing the last function ran. Use macros in lockdep.h to avoid include dependency pains. [akpm@osdl.org: build fix] Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2945b094d871..5484d6e045c2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include /* * The per-CPU workqueue (if single thread, we always use the first @@ -253,6 +255,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) work_release(work); f(work); + if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), + current->pid); + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)f); + debug_show_held_locks(current); + dump_stack(); + } + spin_lock_irqsave(&cwq->lock, flags); cwq->remove_sequence++; wake_up(&cwq->work_done); -- cgit v1.2.3 From 40fcfc87222e2e8af6379ec366f0cb2a411570cd Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 6 Dec 2006 20:37:27 -0800 Subject: [PATCH] HZ: 300Hz support Fix two things. Firstly the unit is "Hz" not "HZ". Secondly it is useful to have 300Hz support when doing multimedia work. 250 is fine for us in Europe but the US frame rate is 30fps (29.99 blah for pedants). 300 gives us a tick divisible by both 25 and 30, and for interlace work 50 and 60. It's also giving similar performance to 250Hz. I'd argue we should remove 250 and add 300, but that might be excess disruption for now. Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Kconfig.hz | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 248e1c396f8b..4af15802ccd4 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -7,7 +7,7 @@ choice default HZ_250 help Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 HZ but 100 HZ may be more + to have the timer interrupt run at 1000 Hz but 100 Hz may be more beneficial for servers and NUMA systems that do not need to have a fast response for user interaction and that may experience bus contention and cacheline bounces as a result of timer interrupts. @@ -19,21 +19,30 @@ choice config HZ_100 bool "100 HZ" help - 100 HZ is a typical choice for servers, SMP and NUMA systems + 100 Hz is a typical choice for servers, SMP and NUMA systems with lots of processors that may show reduced performance if too many timer interrupts are occurring. config HZ_250 bool "250 HZ" help - 250 HZ is a good compromise choice allowing server performance + 250 Hz is a good compromise choice allowing server performance while also showing good interactive responsiveness even - on SMP and NUMA systems. + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + + config HZ_300 + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance + while also showing good interactive responsiveness even + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. config HZ_1000 bool "1000 HZ" help - 1000 HZ is the preferred choice for desktop systems and other + 1000 Hz is the preferred choice for desktop systems and other systems requiring fast interactive responses to events. endchoice @@ -42,5 +51,6 @@ config HZ int default 100 if HZ_100 default 250 if HZ_250 + default 300 if HZ_300 default 1000 if HZ_1000 -- cgit v1.2.3 From c36264dfb2d6fa6383082de0a1bba8e12b477da1 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 6 Dec 2006 20:37:42 -0800 Subject: [PATCH] remove the syslog interface when printk is disabled Attempts to read() from the non-existent dmesg buffer will return zero and userspace tends to get stuck in a busyloop. So just remove /dev/kmsg altogether if CONFIG_PRINTK=n. Signed-off-by: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 66426552fbfe..ba59c2a30ed0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -631,12 +631,7 @@ EXPORT_SYMBOL(vprintk); asmlinkage long sys_syslog(int type, char __user *buf, int len) { - return 0; -} - -int do_syslog(int type, char __user *buf, int len) -{ - return 0; + return -ENOSYS; } static void call_console_drivers(unsigned long start, unsigned long end) -- cgit v1.2.3 From b4c6c34a530b4d1c626f4ac0a884e0a9b849378c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 6 Dec 2006 20:38:11 -0800 Subject: [PATCH] kprobes: enable booster on the preemptible kernel When we are unregistering a kprobe-booster, we can't release its instruction buffer immediately on the preemptive kernel, because some processes might be preempted on the buffer. The freeze_processes() and thaw_processes() functions can clean most of processes up from the buffer. There are still some non-frozen threads who have the PF_NOFREEZE flag. If those threads are sleeping (not preempted) at the known place outside the buffer, we can ensure safety of freeing. However, the processing of this check routine takes a long time. So, this patch introduces the garbage collection mechanism of insn_slot. It also introduces the "dirty" flag to free_insn_slot because of efficiency. The "clean" instruction slots (dirty flag is cleared) are released immediately. But the "dirty" slots which are used by boosted kprobes, are marked as garbages. collect_garbage_slots() will be invoked to release "dirty" slots if there are more than INSNS_PER_PAGE garbage slots or if there are no unused slots. Cc: "Keshavamurthy, Anil S" Cc: Ananth N Mavinakayanahalli Cc: "bibo,mao" Cc: Prasanna S Panchamukhi Cc: Yumiko Sugita Cc: Satoshi Oshima Cc: Hideo Aoki Signed-off-by: Masami Hiramatsu Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 117 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 96 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 610c837ad9e0..17ec4afb0994 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -83,9 +84,36 @@ struct kprobe_insn_page { kprobe_opcode_t *insns; /* Page of instruction slots */ char slot_used[INSNS_PER_PAGE]; int nused; + int ngarbage; }; static struct hlist_head kprobe_insn_pages; +static int kprobe_garbage_slots; +static int collect_garbage_slots(void); + +static int __kprobes check_safety(void) +{ + int ret = 0; +#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) + ret = freeze_processes(); + if (ret == 0) { + struct task_struct *p, *q; + do_each_thread(p, q) { + if (p != current && p->state == TASK_RUNNING && + p->pid != 0) { + printk("Check failed: %s is running\n",p->comm); + ret = -1; + goto loop_end; + } + } while_each_thread(p, q); + } +loop_end: + thaw_processes(); +#else + synchronize_sched(); +#endif + return ret; +} /** * get_insn_slot() - Find a slot on an executable page for an instruction. @@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) struct kprobe_insn_page *kip; struct hlist_node *pos; + retry: hlist_for_each(pos, &kprobe_insn_pages) { kip = hlist_entry(pos, struct kprobe_insn_page, hlist); if (kip->nused < INSNS_PER_PAGE) { @@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) } } - /* All out of space. Need to allocate a new page. Use slot 0.*/ + /* If there are any garbage slots, collect it and try again. */ + if (kprobe_garbage_slots && collect_garbage_slots() == 0) { + goto retry; + } + /* All out of space. Need to allocate a new page. Use slot 0. */ kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); if (!kip) { return NULL; @@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) memset(kip->slot_used, 0, INSNS_PER_PAGE); kip->slot_used[0] = 1; kip->nused = 1; + kip->ngarbage = 0; return kip->insns; } -void __kprobes free_insn_slot(kprobe_opcode_t *slot) +/* Return 1 if all garbages are collected, otherwise 0. */ +static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) +{ + kip->slot_used[idx] = 0; + kip->nused--; + if (kip->nused == 0) { + /* + * Page is no longer in use. Free it unless + * it's the last one. We keep the last one + * so as not to have to set it up again the + * next time somebody inserts a probe. + */ + hlist_del(&kip->hlist); + if (hlist_empty(&kprobe_insn_pages)) { + INIT_HLIST_NODE(&kip->hlist); + hlist_add_head(&kip->hlist, + &kprobe_insn_pages); + } else { + module_free(NULL, kip->insns); + kfree(kip); + } + return 1; + } + return 0; +} + +static int __kprobes collect_garbage_slots(void) +{ + struct kprobe_insn_page *kip; + struct hlist_node *pos, *next; + + /* Ensure no-one is preepmted on the garbages */ + if (check_safety() != 0) + return -EAGAIN; + + hlist_for_each_safe(pos, next, &kprobe_insn_pages) { + int i; + kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + if (kip->ngarbage == 0) + continue; + kip->ngarbage = 0; /* we will collect all garbages */ + for (i = 0; i < INSNS_PER_PAGE; i++) { + if (kip->slot_used[i] == -1 && + collect_one_slot(kip, i)) + break; + } + } + kprobe_garbage_slots = 0; + return 0; +} + +void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; - kip->slot_used[i] = 0; - kip->nused--; - if (kip->nused == 0) { - /* - * Page is no longer in use. Free it unless - * it's the last one. We keep the last one - * so as not to have to set it up again the - * next time somebody inserts a probe. - */ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { - module_free(NULL, kip->insns); - kfree(kip); - } + if (dirty) { + kip->slot_used[i] = -1; + kip->ngarbage++; + } else { + collect_one_slot(kip, i); } - return; + break; } } + if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { + collect_garbage_slots(); + } } #endif -- cgit v1.2.3 From 02316067852187b8bec781bec07410e91af79627 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Dec 2006 20:38:17 -0800 Subject: [PATCH] hotplug CPU: clean up hotcpu_notifier() use There was lots of #ifdef noise in the kernel due to hotcpu_notifier(fn, prio) not correctly marking 'fn' as used in the !HOTPLUG_CPU case, and thus generating compiler warnings of unused symbols, hence forcing people to add #ifdefs. the compiler can skip truly unused functions just fine: text data bss dec hex filename 1624412 728710 3674856 6027978 5bfaca vmlinux.before 1624412 728710 3674856 6027978 5bfaca vmlinux.after [akpm@osdl.org: topology.c fix] Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 ---- kernel/profile.c | 3 +-- kernel/sched.c | 3 --- kernel/workqueue.c | 2 -- 4 files changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index bd1e89c4c96a..9b62b4c03ad0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2044,7 +2044,6 @@ out: return err; } -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG) /* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, @@ -2108,9 +2107,7 @@ static void common_cpu_mem_hotplug_unplug(void) mutex_unlock(&callback_mutex); mutex_unlock(&manage_mutex); } -#endif -#ifdef CONFIG_HOTPLUG_CPU /* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent @@ -2127,7 +2124,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb, common_cpu_mem_hotplug_unplug(); return 0; } -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* diff --git a/kernel/profile.c b/kernel/profile.c index 04fd84e8cdbe..0961d93e1d91 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -319,7 +319,6 @@ out: put_cpu(); } -#ifdef CONFIG_HOTPLUG_CPU static int __devinit profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { @@ -372,10 +371,10 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) +#define profile_cpu_callback NULL void profile_hits(int type, void *__pc, unsigned int nr_hits) { diff --git a/kernel/sched.c b/kernel/sched.c index 75a005ed4eda..c83f531c2886 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6740,8 +6740,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, sched_smt_power_savings_store); #endif - -#ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains * and groups cannot be updated in place without racing with the balancing @@ -6774,7 +6772,6 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_OK; } -#endif void __init sched_init_smp(void) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5484d6e045c2..c5257316f4b9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -655,7 +655,6 @@ int current_is_keventd(void) } -#ifdef CONFIG_HOTPLUG_CPU /* Take the work from this (downed) CPU. */ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { @@ -738,7 +737,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -#endif void init_workqueues(void) { -- cgit v1.2.3 From ebe7e5fe4b41deeb2731c5b52d8c8e6ac08b1f74 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 6 Dec 2006 20:38:21 -0800 Subject: [PATCH] remove kernel/lockdep.c:lockdep_internal Remove the no longer used lockdep_internal(). Signed-off-by: Adrian Bunk Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 300d61bb314b..3926c3674354 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -140,13 +140,6 @@ void lockdep_on(void) EXPORT_SYMBOL(lockdep_on); -int lockdep_internal(void) -{ - return current->lockdep_recursion != 0; -} - -EXPORT_SYMBOL(lockdep_internal); - /* * Debugging switches: */ -- cgit v1.2.3 From d3228a887cae75ef2b8b1211c31c539bef5a5698 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 6 Dec 2006 20:38:22 -0800 Subject: [PATCH] make kernel/signal.c:kill_proc_info() static Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index bc972d7e6319..ec81defde339 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1134,8 +1134,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) return error; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; rcu_read_lock(); -- cgit v1.2.3 From 1c69d921ed9cc6593ad4f60c0f9951cb0d62b0b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Dec 2006 20:38:44 -0800 Subject: [PATCH] rcu: add a prefetch() in rcu_do_batch() On some workloads, (for example when lot of close() syscalls are done), RCU qlen can be quite large, and RCU heads are no longer in cpu cache when rcu_do_batch() is called. This patch adds a prefetch() in rcu_do_batch() to give CPU a hint to bring back cache lines containing 'struct rcu_head's. Most list manipulations macros include prefetch(), but not open coded ones (at least with current C compilers :) ) I got a nice speedup on a trivial benchmark (3.48 us per iteration instead of 3.95 us on a 1.6 GHz Pentium-M) while (1) { pipe(p); close(fd[0]); close(fd[1]);} Signed-off-by: Eric Dumazet Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 26bb5ffe1ef1..3554b76da84c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp) list = rdp->donelist; while (list) { - next = rdp->donelist = list->next; + next = list->next; + prefetch(next); list->func(list); list = next; if (++count >= rdp->blimit) break; } + rdp->donelist = list; local_irq_disable(); rdp->qlen -= count; -- cgit v1.2.3 From 4668edc334ee90cf50c382c3e423cfc510b5a126 Mon Sep 17 00:00:00 2001 From: Burman Yan Date: Wed, 6 Dec 2006 20:38:51 -0800 Subject: [PATCH] kernel core: replace kmalloc+memset with kzalloc Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 3 +-- kernel/futex.c | 3 +-- kernel/kexec.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4f40d923af8e..2e896f8ae29e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) struct audit_rule *rule; int i; - rule = kmalloc(sizeof(*rule), GFP_KERNEL); + rule = kzalloc(sizeof(*rule), GFP_KERNEL); if (unlikely(!rule)) return NULL; - memset(rule, 0, sizeof(*rule)); rule->flags = krule->flags | krule->listnr; rule->action = krule->action; diff --git a/kernel/futex.c b/kernel/futex.c index af7b81cbde30..7c0d0d4fa7f7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -324,12 +324,11 @@ static int refill_pi_state_cache(void) if (likely(current->pi_state_cache)) return 0; - pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); + pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); if (!pi_state) return -ENOMEM; - memset(pi_state, 0, sizeof(*pi_state)); INIT_LIST_HEAD(&pi_state->list); /* pi_mutex gets initialized later */ pi_state->owner = NULL; diff --git a/kernel/kexec.c b/kernel/kexec.c index fcdd5d2bc3f4..d43692cf2321 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -108,11 +108,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, /* Allocate a controlling structure */ result = -ENOMEM; - image = kmalloc(sizeof(*image), GFP_KERNEL); + image = kzalloc(sizeof(*image), GFP_KERNEL); if (!image) goto out; - memset(image, 0, sizeof(*image)); image->head = 0; image->entry = &image->head; image->last_entry = &image->head; -- cgit v1.2.3 From 95362fa90312ff2d52c0b4d42412cd7ceeb3b89b Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 6 Dec 2006 20:39:03 -0800 Subject: [PATCH] futex: init error check Check register_filesystem() and kern_mount() return values. Cc: Ingo Molnar Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 7c0d0d4fa7f7..d60b7f7a8cc3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1857,10 +1857,16 @@ static struct file_system_type futex_fs_type = { static int __init init(void) { - unsigned int i; + int i = register_filesystem(&futex_fs_type); + + if (i) + return i; - register_filesystem(&futex_fs_type); futex_mnt = kern_mount(&futex_fs_type); + if (IS_ERR(futex_mnt)) { + unregister_filesystem(&futex_fs_type); + return PTR_ERR(futex_mnt); + } for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { INIT_LIST_HEAD(&futex_queues[i].chain); -- cgit v1.2.3 From bd9b0bac6f601655044fc35978e26231dffee03e Mon Sep 17 00:00:00 2001 From: "BP, Praveen" Date: Wed, 6 Dec 2006 20:39:09 -0800 Subject: [PATCH] sysctl: string length calculated is wrong if it contains negative numbers In the functions do_proc_dointvec() and do_proc_doulongvec_minmax(), there seems to be a bug in string length calculation if string contains negative integer. The console log given below explains the bug. Setting negative values may not be a right thing to do for "console log level" but then the test (given below) can be used to demonstrate the bug in the code. # echo "-1 -1 -1 -123456" > /proc/sys/kernel/printk # cat /proc/sys/kernel/printk -1 -1 -1 -1234 # # echo "-1 -1 -1 123456" > /proc/sys/kernel/printk # cat /proc/sys/kernel/printk -1 -1 -1 1234 # (akpm: the bug is that 123456 gets truncated) It works as expected if string contains all +ve integers # echo "1 2 3 4" > /proc/sys/kernel/printk # cat /proc/sys/kernel/printk 1 2 3 4 # The patch given below fixes the issue. Signed-off-by: Praveen BP Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7abe9704e75a..6d7147cf66bb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1875,7 +1875,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table, p = buf; if (*p == '-' && left > 1) { neg = 1; - left--, p++; + p++; } if (*p < '0' || *p > '9') break; @@ -2126,7 +2126,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write, p = buf; if (*p == '-' && left > 1) { neg = 1; - left--, p++; + p++; } if (*p < '0' || *p > '9') break; -- cgit v1.2.3 From 301827acbe49d0ba7ec9770803970893ac9ded97 Mon Sep 17 00:00:00 2001 From: Chris Caputo Date: Wed, 6 Dec 2006 20:39:11 -0800 Subject: [PATCH] sched: correct output of show_state() At present show_state prints a header the does not match the output of show_task, as follows: - sibling task PC pid father child younger older init S 00000000 0 1 0 2 (NOTLB) - This patch corrects the output of show_state so that the header is aligned with the data, ala: - free sibling task PC stack pid father child younger older init S 00000000 0 1 0 2 (NOTLB) - Signed-off-by: Chris Caputo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c83f531c2886..b43cef02ce5b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4822,12 +4822,12 @@ void show_state_filter(unsigned long state_filter) #if (BITS_PER_LONG == 32) printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); + " free sibling\n"); + printk(" task PC stack pid father child younger older\n"); #else printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); + " free sibling\n"); + printk(" task PC stack pid father child younger older\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { -- cgit v1.2.3 From 50cc670aebf4fc64afaf533fb9fa1c8570f09d74 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Dec 2006 20:39:30 -0800 Subject: [PATCH] lockdep: more chains Some have reported a chain-table overflow - double its size. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep_internals.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index eab043c83bb2..8ce09bc4613d 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -20,7 +20,7 @@ #define MAX_LOCKDEP_KEYS_BITS 11 #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) -#define MAX_LOCKDEP_CHAINS_BITS 13 +#define MAX_LOCKDEP_CHAINS_BITS 14 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) /* -- cgit v1.2.3 From 2ee91f197c0bc654b24eed5831fd12aa0d566a7d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Dec 2006 20:39:32 -0800 Subject: [PATCH] lockdep: show more details about self-test failures Make the locking self-test failures (of 'FAILURE' type) easier to debug by printing more information. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/mutex-debug.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 18651641a7b5..841539d72c55 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, void debug_mutex_unlock(struct mutex *lock) { + if (unlikely(!debug_locks)) + return; + DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); -- cgit v1.2.3 From 3908fd2ed920af818aa596672da68ba26173ff27 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 6 Dec 2006 20:39:39 -0800 Subject: [PATCH] softirq: remove BUG_ONs which can incorrectly trigger It is possible to have tasklets get scheduled before softirqd has had a chance to spawn on all CPUs. This is totally harmless; after success during action CPU_UP_PREPARE, action CPU_ONLINE will be called, which immediately wakes softirqd on the appropriate CPU to process the already pending tasklets. So there is no danger of having a missed wakeup for any tasklets that were already pending. In particular, i386 is affected by this during startup, and is visible when using a very large initrd; during the time it takes for the initrd to be decompressed, a timer IRQ can come in and schedule RCU callbacks. It is also possible that resending of a hardware IRQ via a softirq triggers the same bug. Because of different timing conditions, this shows up in all emulators and virtual machines tested, including Xen, VMware, Virtual PC, and Qemu. It is also possible to trigger on native hardware with a large enough initrd, although I don't have a reliable case demonstrating that. Signed-off-by: Zachary Amsden Cc: Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index bf25015dce16..918e52df090e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - BUG_ON(per_cpu(tasklet_vec, hotcpu).list); - BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list); p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); -- cgit v1.2.3 From 045f147f3290395661b56b9231fc4d221e150963 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 6 Dec 2006 20:40:15 -0800 Subject: [PATCH] remove EXPORT_UNUSED_SYMBOL'ed symbols In time for 2.6.20, we can get rid of this junk. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index ba59c2a30ed0..c3d90a58e4c5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -53,8 +53,6 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; -EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */ - /* * Low lever drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -772,7 +770,6 @@ int is_console_locked(void) { return console_locked; } -EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */ /** * release_console_sem - unlock the console system -- cgit v1.2.3 From a09c17a6fdad9ae5b5ea1c3383080f84ec76ab20 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Dec 2006 20:40:18 -0800 Subject: [PATCH] sys: remove unused variable Remove unused 'new_ruid' variable. Reported by David Binderman . Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index c87b461de38d..a0c1a29a507f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) asmlinkage long sys_setuid(uid_t uid) { int old_euid = current->euid; - int old_ruid, old_suid, new_ruid, new_suid; + int old_ruid, old_suid, new_suid; int retval; retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); if (retval) return retval; - old_ruid = new_ruid = current->uid; + old_ruid = current->uid; old_suid = current->suid; new_suid = old_suid; -- cgit v1.2.3 From 012d3ca8d85bf3ae5278ba897dd1ca3999247107 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 6 Dec 2006 20:40:19 -0800 Subject: [PATCH] Add Sparse annotations to SRCU wrapper functions in rcutorture The SRCU wrapper functions srcu_torture_read_lock and srcu_torture_read_unlock in rcutorture intentionally change the SRCU context; annotate them accordingly, to avoid a warning. Signed-off-by: Josh Triplett Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e2bda18f6f42..c52f981ea008 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void) cleanup_srcu_struct(&srcu_ctl); } -static int srcu_torture_read_lock(void) +static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) { return srcu_read_lock(&srcu_ctl); } @@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) schedule_timeout_interruptible(longdelay); } -static void srcu_torture_read_unlock(int idx) +static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) { srcu_read_unlock(&srcu_ctl, idx); } -- cgit v1.2.3 From ccdea2f88b5689f0fd29c3804be43a3acf0311e3 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 6 Dec 2006 20:40:26 -0800 Subject: [PATCH] futex: remove unneeded barrier When disassembling a kernel I found around over 90 sync Instructions from mb, rmb and wmb calls in the kernel and only few of those make any sense to me. So here's the first one - I think the wmb() in kernel/futex.c is not needed on uniprocessors so should become an smb_wmb(). Signed-off-by: Ralf Baechle Acked-by: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index d60b7f7a8cc3..a8302a1620ea 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -552,7 +552,7 @@ static void wake_futex(struct futex_q *q) * at the end of wake_up_all() does not prevent this store from * moving. */ - wmb(); + smp_wmb(); q->lock_ptr = NULL; } -- cgit v1.2.3 From 15ad7cdcfd76450d4beebc789ec646664238184d Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 6 Dec 2006 20:40:36 -0800 Subject: [PATCH] struct seq_operations and struct file_operations constification - move some file_operations structs into the .rodata section - move static strings from policy_types[] array into the .rodata section - fix generic seq_operations usages, so that those structs may be defined as "const" as well [akpm@osdl.org: couple of fixes] Signed-off-by: Helge Deller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs.c | 2 +- kernel/cpuset.c | 4 ++-- kernel/dma.c | 2 +- kernel/futex.c | 2 +- kernel/kallsyms.c | 4 ++-- kernel/lockdep_proc.c | 6 +++--- kernel/module.c | 2 +- kernel/power/user.c | 2 +- kernel/profile.c | 2 +- kernel/relay.c | 2 +- kernel/resource.c | 6 +++--- kernel/sched.c | 2 +- kernel/sysctl.c | 2 +- 13 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index f9e31974f4ad..8fa1fb28f8a7 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf, return count; } -static struct file_operations ikconfig_file_ops = { +static const struct file_operations ikconfig_file_ops = { .owner = THIS_MODULE, .read = ikconfig_read_current, }; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9b62b4c03ad0..4ef1d29297ca 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1531,7 +1531,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } -static struct file_operations cpuset_file_operations = { +static const struct file_operations cpuset_file_operations = { .read = cpuset_file_read, .write = cpuset_file_write, .llseek = generic_file_llseek, @@ -2605,7 +2605,7 @@ static int cpuset_open(struct inode *inode, struct file *file) return single_open(file, proc_cpuset_show, pid); } -struct file_operations proc_cpuset_operations = { +const struct file_operations proc_cpuset_operations = { .open = cpuset_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/dma.c b/kernel/dma.c index 2020644c938a..937b13ca33ba 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file) return single_open(file, proc_dma_show, NULL); } -static struct file_operations proc_dma_operations = { +static const struct file_operations proc_dma_operations = { .open = proc_dma_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/futex.c b/kernel/futex.c index a8302a1620ea..95989a3b4168 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1492,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp, return ret; } -static struct file_operations futex_fops = { +static const struct file_operations futex_fops = { .release = futex_close, .poll = futex_poll, }; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 54befe36ee0b..ab63cfc42992 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -398,7 +398,7 @@ static int s_show(struct seq_file *m, void *p) return 0; } -static struct seq_operations kallsyms_op = { +static const struct seq_operations kallsyms_op = { .start = s_start, .next = s_next, .stop = s_stop, @@ -433,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static struct file_operations kallsyms_operations = { +static const struct file_operations kallsyms_operations = { .open = kallsyms_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index f6e72eaab3fa..b554b40a4aa6 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations lockdep_ops = { +static const struct seq_operations lockdep_ops = { .start = l_start, .next = l_next, .stop = l_stop, @@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file) return res; } -static struct file_operations proc_lockdep_operations = { +static const struct file_operations proc_lockdep_operations = { .open = lockdep_open, .read = seq_read, .llseek = seq_lseek, @@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file) return single_open(file, lockdep_stats_show, NULL); } -static struct file_operations proc_lockdep_stats_operations = { +static const struct file_operations proc_lockdep_stats_operations = { .open = lockdep_stats_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/module.c b/kernel/module.c index e2d09d604ca0..d9eae45d0145 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2209,7 +2209,7 @@ static int m_show(struct seq_file *m, void *p) Where refcount is a number or -, and deps is a comma-separated list of depends or -. */ -struct seq_operations modules_op = { +const struct seq_operations modules_op = { .start = m_start, .next = m_next, .stop = m_stop, diff --git a/kernel/power/user.c b/kernel/power/user.c index 069732e5695c..89443b85163b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -383,7 +383,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, return error; } -static struct file_operations snapshot_fops = { +static const struct file_operations snapshot_fops = { .open = snapshot_open, .release = snapshot_release, .read = snapshot_read, diff --git a/kernel/profile.c b/kernel/profile.c index 0961d93e1d91..fb5e03d57e9d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -501,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, return count; } -static struct file_operations proc_profile_operations = { +static const struct file_operations proc_profile_operations = { .read = read_profile, .write = write_profile, }; diff --git a/kernel/relay.c b/kernel/relay.c index 2b92e8ece85b..75a3a9a7efc2 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1013,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp, actor, &desc); } -struct file_operations relay_file_operations = { +const struct file_operations relay_file_operations = { .open = relay_file_open, .poll = relay_file_poll, .mmap = relay_file_mmap, diff --git a/kernel/resource.c b/kernel/resource.c index 6de60c12143e..7b9a497419d9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations resource_op = { +static const struct seq_operations resource_op = { .start = r_start, .next = r_next, .stop = r_stop, @@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file) return res; } -static struct file_operations proc_ioports_operations = { +static const struct file_operations proc_ioports_operations = { .open = ioports_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; -static struct file_operations proc_iomem_operations = { +static const struct file_operations proc_iomem_operations = { .open = iomem_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/sched.c b/kernel/sched.c index b43cef02ce5b..f385eff4682d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -505,7 +505,7 @@ static int schedstat_open(struct inode *inode, struct file *file) return res; } -struct file_operations proc_schedstat_operations = { +const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6d7147cf66bb..758dbbf972a5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -170,7 +170,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); static int proc_opensys(struct inode *, struct file *); -struct file_operations proc_sys_file_operations = { +const struct file_operations proc_sys_file_operations = { .open = proc_opensys, .read = proc_readsys, .write = proc_writesys, -- cgit v1.2.3 From 85916f8166b59eeac63d2b4f7f1df8de849334b4 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 6 Dec 2006 20:40:41 -0800 Subject: [PATCH] Kexec / Kdump: Unify elf note code The elf note saving code is currently duplicated over several architectures. This cleanup patch simply adds code to a common file and then replaces the arch-specific code with calls to the newly added code. The only drawback with this approach is that s390 doesn't fully support kexec-on-panic which for that arch leads to introduction of unused code. Signed-off-by: Magnus Damm Cc: Vivek Goyal Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index d43692cf2321..afbbbe981be2 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -1066,6 +1068,60 @@ void crash_kexec(struct pt_regs *regs) } } +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, ¬e, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, ¬e, sizeof(note)); +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= NR_CPUS)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32*)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + final_note(buf); +} + static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. */ -- cgit v1.2.3 From 70e4506765602cca047cfa31933836e354c61a63 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Dec 2006 20:40:50 -0800 Subject: [PATCH] lockdep: register_lock_class() fix The hash_lock must only ever be taken with irqs disabled. This happens in all the important places, except one codepath: register_lock_class(). The race should trigger rarely because register_lock_class() is quite rare and single-threaded (happens during init most of the time). The fix is to disable irqs. ( bug found live in -rt: there preemption is alot more agressive and preempting with the hash-lock held caused a lockup.) Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3926c3674354..62e73ce68197 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1182,6 +1182,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct list_head *hash_head; struct lock_class *class; + unsigned long flags; class = look_up_lock_class(lock, subclass); if (likely(class)) @@ -1203,6 +1204,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) key = lock->key->subkeys + subclass; hash_head = classhashentry(key); + raw_local_irq_save(flags); __raw_spin_lock(&hash_lock); /* * We have to do the hash-walk again, to avoid races @@ -1217,6 +1219,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) */ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { __raw_spin_unlock(&hash_lock); + raw_local_irq_restore(flags); debug_locks_off(); printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); printk("turning off the locking correctness validator.\n"); @@ -1239,15 +1242,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { __raw_spin_unlock(&hash_lock); + raw_local_irq_restore(flags); printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) printk("#%d", class->name_version); printk("\n"); dump_stack(); + raw_local_irq_save(flags); __raw_spin_lock(&hash_lock); } out_unlock_set: __raw_spin_unlock(&hash_lock); + raw_local_irq_restore(flags); if (!subclass || force) lock->class_cache = class; -- cgit v1.2.3 From 792908225064b1d841a8990b9d1d1cfc4e0e5bb2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Dec 2006 20:40:51 -0800 Subject: [PATCH] add ignore_loglevel boot option Sometimes the kernel prints something interesting while userspace bootup keeps messages turned off via loglevel. Enable the printing of /all/ kernel messages via the "ignore_loglevel" boot option. Off by default. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index c3d90a58e4c5..185bb45eacf7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -333,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end) } } +static int __read_mostly ignore_loglevel; + +int __init ignore_loglevel_setup(char *str) +{ + ignore_loglevel = 1; + printk(KERN_INFO "debug: ignoring loglevel setting.\n"); + + return 1; +} + +__setup("ignore_loglevel", ignore_loglevel_setup); + /* * Write out chars from start to end - 1 inclusive */ static void _call_console_drivers(unsigned long start, unsigned long end, int msg_log_level) { - if (msg_log_level < console_loglevel && + if ((msg_log_level < console_loglevel || ignore_loglevel) && console_drivers && start != end) { if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { /* wrapped write */ -- cgit v1.2.3 From d3ed11c35635487d34ad476e1d6a66dd298ab2de Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 6 Dec 2006 20:41:37 -0800 Subject: [PATCH] cpuset: allow a larger buffer for writes to cpuset files When using fake NUMA setup, the number of memory nodes can greatly exceed the number of CPUs. So the current limit in cpuset_common_file_write() is insufficient. Signed-off-by: Paul Menage Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4ef1d29297ca..0a6b4d89f9a0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1280,7 +1280,8 @@ typedef enum { FILE_TASKLIST, } cpuset_filetype_t; -static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, +static ssize_t cpuset_common_file_write(struct file *file, + const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { struct cpuset *cs = __d_cs(file->f_dentry->d_parent); @@ -1291,7 +1292,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us int retval = 0; /* Crude upper limit on largest legitimate cpulist user might write. */ - if (nbytes > 100 + 6 * NR_CPUS) + if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES)) return -E2BIG; /* +1 for nul-terminator */ -- cgit v1.2.3 From 68380b581383c028830f79ec2670f4a193854aa6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 7 Dec 2006 09:28:19 -0800 Subject: Add "run_scheduled_work()" workqueue function This allows workqueue users to run just their own pending work, rather than wait for the whole workqueue to finish running. This solves the deadlock with networking libphy that was due to other workqueue entries possibly needing a lock that was held by the routine that wanted to flush its own work. It's not wonderful: if you absolutely need to synchronize with the work function having been executed, any user strictly speaking should have its own completion tracking logic, since when we run things explicitly by hand, the generic workqueue layer can no longer help us synchronize. Also, this is strictly only usable for work that has been scheduled without any delayed timers. You can not mix the new interface with schedule_delayed_work(). But it's better than what we had currently. Acked-by: Maciej W. Rozycki Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c5257316f4b9..6b186750e9be 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -108,6 +108,79 @@ static inline void *get_wq_data(struct work_struct *work) return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK); } +static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&cwq->lock, flags); + /* + * We need to re-validate the work info after we've gotten + * the cpu_workqueue lock. We can run the work now iff: + * + * - the wq_data still matches the cpu_workqueue_struct + * - AND the work is still marked pending + * - AND the work is still on a list (which will be this + * workqueue_struct list) + * + * All these conditions are important, because we + * need to protect against the work being run right + * now on another CPU (all but the last one might be + * true if it's currently running and has not been + * released yet, for example). + */ + if (get_wq_data(work) == cwq + && work_pending(work) + && !list_empty(&work->entry)) { + work_func_t f = work->func; + list_del_init(&work->entry); + spin_unlock_irqrestore(&cwq->lock, flags); + + if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management)) + work_release(work); + f(work); + + spin_lock_irqsave(&cwq->lock, flags); + cwq->remove_sequence++; + wake_up(&cwq->work_done); + ret = 1; + } + spin_unlock_irqrestore(&cwq->lock, flags); + return ret; +} + +/** + * run_scheduled_work - run scheduled work synchronously + * @work: work to run + * + * This checks if the work was pending, and runs it + * synchronously if so. It returns a boolean to indicate + * whether it had any scheduled work to run or not. + * + * NOTE! This _only_ works for normal work_structs. You + * CANNOT use this for delayed work, because the wq data + * for delayed work will not point properly to the per- + * CPU workqueue struct, but will change! + */ +int fastcall run_scheduled_work(struct work_struct *work) +{ + for (;;) { + struct cpu_workqueue_struct *cwq; + + if (!work_pending(work)) + return 0; + if (list_empty(&work->entry)) + return 0; + /* NOTE! This depends intimately on __queue_work! */ + cwq = get_wq_data(work); + if (!cwq) + return 0; + if (__run_work(cwq, work)) + return 1; + } +} +EXPORT_SYMBOL(run_scheduled_work); + /* Preempt must be disabled. */ static void __queue_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) -- cgit v1.2.3 From a79561134f38de12dce14ed72138f38e55ef53fc Mon Sep 17 00:00:00 2001 From: Zou Nan hai Date: Thu, 7 Dec 2006 09:51:35 -0800 Subject: [IA64] IA64 Kexec/kdump Changes and updates. 1. Remove fake rendz path and related code according to discuss with Khalid Aziz. 2. fc.i offset fix in relocate_kernel.S. 3. iospic shutdown code eoi and mask race fix from Fujitsu. 4. Warm boot hook in machine_kexec to SN SAL code from Jack Steiner. 5. Send slave to SAL slave loop patch from Jay Lan. 6. Kdump on non-recoverable MCA event patch from Jay Lan 7. Use CTL_UNNUMBERED in kdump_on_init sysctl. Signed-off-by: Zou Nan hai Signed-off-by: Tony Luck --- kernel/kexec.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index fcdd5d2bc3f4..05aada293592 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -851,6 +851,7 @@ static int kimage_load_crash_segment(struct kimage *image, memset(ptr + uchunk, 0, mchunk - uchunk); } result = copy_from_user(ptr, buf, uchunk); + kexec_flush_icache_page(page); kunmap(page); if (result) { result = (result < 0) ? result : -EIO; -- cgit v1.2.3 From aad094701c6355cb2b3d74a07ec0496f4a48c787 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 8 Dec 2006 02:35:57 -0800 Subject: [PATCH] move kallsyms data to .rodata Kallsyms data is never written to, so it can as well benefit from CONFIG_DEBUG_RODATA. Signed-off-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ab63cfc42992..6f294ff4f9ee 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -31,14 +31,14 @@ #endif /* These will be re-linked against their real values during the second link stage */ -extern unsigned long kallsyms_addresses[] __attribute__((weak)); -extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); -extern u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[] __attribute__((weak)); +extern const unsigned long kallsyms_num_syms __attribute__((weak)); +extern const u8 kallsyms_names[] __attribute__((weak)); -extern u8 kallsyms_token_table[] __attribute__((weak)); -extern u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __attribute__((weak)); +extern const u16 kallsyms_token_index[] __attribute__((weak)); -extern unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __attribute__((weak)); static inline int is_kernel_inittext(unsigned long addr) { @@ -84,7 +84,7 @@ static int is_ksym_addr(unsigned long addr) static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) { int len, skipped_first = 0; - u8 *tptr, *data; + const u8 *tptr, *data; /* get the compressed symbol length from the first symbol byte */ data = &kallsyms_names[off]; @@ -132,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off) * kallsyms array */ static unsigned int get_symbol_offset(unsigned long pos) { - u8 *name; + const u8 *name; int i; /* use the closest marker we have. We have markers every 256 positions, -- cgit v1.2.3 From 6e2ac66470976ad7f57e0948572669b2bdfea2d0 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Fri, 8 Dec 2006 02:35:58 -0800 Subject: [PATCH] CPEI gets warning at kernel/irq/migration.c:27/move_masked_irq() While running my MCA test (hardware error injection) on 2.6.19, I got some warning like following: > BUG: warning at kernel/irq/migration.c:27/move_masked_irq() > > Call Trace: > [] show_stack+0x40/0xa0 > sp=e00000006b2578d0 bsp=e00000006b2510b0 > [] dump_stack+0x30/0x60 > sp=e00000006b257aa0 bsp=e00000006b251098 > [] move_masked_irq+0xb0/0x240 > sp=e00000006b257aa0 bsp=e00000006b251070 > [] move_native_irq+0xe0/0x180 > sp=e00000006b257aa0 bsp=e00000006b251040 > [] iosapic_end_level_irq+0x30/0xe0 > sp=e00000006b257aa0 bsp=e00000006b251020 > [] __do_IRQ+0x170/0x400 > sp=e00000006b257aa0 bsp=e00000006b250fd8 > [] ia64_handle_irq+0x1b0/0x260 > sp=e00000006b257aa0 bsp=e00000006b250fa8 > [] ia64_leave_kernel+0x0/0x280 > sp=e00000006b257aa0 bsp=e00000006b250fa8 > [] _spin_unlock_irqrestore+0x30/0x60 > sp=e00000006b257c70 bsp=e00000006b250f90 It comes from: [kernel/irq/migration.c] 26 if (CHECK_IRQ_PER_CPU(desc->status)) { 27 WARN_ON(1); 28 return; 29 } By putting some printk in kernel, I found that irqbalance is trying to move CPEI which is handled as PER_CPU irq. That's why. CPEI(Corrected Platform Error Interrupt) is ia64 specific irq, is allowed to pin to particular processor which selected by the platform, and even it is PER_CPU but it has set_affinity handler (=iosapic_set_affinity) as same as other IO-SAPIC-level interrupts. (I don't know why, but I guess that there would be typical situation where the handler for migration is needed, such as hotplug - the processor going to be offline/hot-removed.) To shut up this warning, there are 2 way at least: a) fix CPEI stuff b) prohibit setting affinity to PER_CPU irq I'm not sure what stuff of CPEI need to be fixed, but I think that returning error to attempting move PER_CPU irq is useful for all applications since it will never work. Following small patch takes b) style. It works, the warning disappeared and irqbalance still runs well. Signed-off-by: Hidetoshi Seto Cc: Arjan van de Ven Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9a352667007c..61f5c717a8f5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned int irq = (int)(long)data, full_count = count, err; cpumask_t new_value, tmp; - if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) + if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || + CHECK_IRQ_PER_CPU(irq_desc[irq].status)) return -EIO; err = cpumask_parse_user(buffer, count, new_value); -- cgit v1.2.3 From 24ec839c431eb79bb8f6abc00c4e1eb3b8c4d517 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Dec 2006 02:36:04 -0800 Subject: [PATCH] tty: ->signal->tty locking Fix the locking of signal->tty. Use ->sighand->siglock to protect ->signal->tty; this lock is already used by most other members of ->signal/->sighand. And unless we are 'current' or the tasklist_lock is held we need ->siglock to access ->signal anyway. (NOTE: sys_unshare() is broken wrt ->sighand locking rules) Note that tty_mutex is held over tty destruction, so while holding tty_mutex any tty pointer remains valid. Otherwise the lifetime of ttys are governed by their open file handles. This leaves some holes for tty access from signal->tty (or any other non file related tty access). It solves the tty SLAB scribbles we were seeing. (NOTE: the change from group_send_sig_info to __group_send_sig_info needs to be examined by someone familiar with the security framework, I think it is safe given the SEND_SIG_PRIV from other __group_send_sig_info invocations) [schwidefsky@de.ibm.com: 3270 fix] [akpm@osdl.org: various post-viro fixes] Signed-off-by: Peter Zijlstra Acked-by: Alan Cox Cc: Oleg Nesterov Cc: Prarit Bhargava Cc: Chris Wright Cc: Roland McGrath Cc: Stephen Smalley Cc: James Morris Cc: "David S. Miller" Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Jan Kara Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 9 +++------ kernel/auditsc.c | 2 ++ kernel/exit.c | 4 +--- kernel/sys.c | 6 ++++-- 4 files changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index dc12db8600e7..ca5619039367 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -428,6 +428,7 @@ static void do_acct_process(struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + struct tty_struct *tty; /* * First check to see if there is enough free_space to continue @@ -485,12 +486,8 @@ static void do_acct_process(struct file *file) #endif mutex_lock(&tty_mutex); - /* FIXME: Whoever is responsible for current->signal locking needs - to use the same locking all over the kernel and document it */ - read_lock(&tasklist_lock); - ac.ac_tty = current->signal->tty ? - old_encode_dev(tty_devnum(current->signal->tty)) : 0; - read_unlock(&tasklist_lock); + tty = get_current_tty(); + ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; mutex_unlock(&tty_mutex); spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 40722e26de98..b6cb802fbcd1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -826,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->return_code); mutex_lock(&tty_mutex); + read_lock(&tasklist_lock); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else tty = "(none)"; + read_unlock(&tasklist_lock); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " ppid=%d pid=%d auid=%u uid=%u gid=%u" diff --git a/kernel/exit.c b/kernel/exit.c index 4e3f919edc48..fa235779b6a3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -384,9 +384,7 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); - mutex_lock(&tty_mutex); - current->signal->tty = NULL; - mutex_unlock(&tty_mutex); + proc_clear_tty(current); /* Block and flush all signals */ sigfillset(&blocked); diff --git a/kernel/sys.c b/kernel/sys.c index a0c1a29a507f..1ac2d1c5d84e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1484,7 +1484,6 @@ asmlinkage long sys_setsid(void) pid_t session; int err = -EPERM; - mutex_lock(&tty_mutex); write_lock_irq(&tasklist_lock); /* Fail if I am already a session leader */ @@ -1504,12 +1503,15 @@ asmlinkage long sys_setsid(void) group_leader->signal->leader = 1; __set_special_pids(session, session); + + spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; group_leader->signal->tty_old_pgrp = 0; + spin_unlock(&group_leader->sighand->siglock); + err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); - mutex_unlock(&tty_mutex); return err; } -- cgit v1.2.3 From 7bcfa95e561f11a17720162935e4f704c5d6fda3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 8 Dec 2006 02:36:07 -0800 Subject: [PATCH] do_acct_process(): don't take tty_mutex No need to take the global tty_mutex, signal->tty->driver can't go away while we are holding ->siglock. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index ca5619039367..69a40c9777a2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -485,12 +485,9 @@ static void do_acct_process(struct file *file) ac.ac_ppid = current->parent->tgid; #endif - mutex_lock(&tty_mutex); - tty = get_current_tty(); - ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; - mutex_unlock(&tty_mutex); - spin_lock_irq(¤t->sighand->siglock); + tty = current->signal->tty; + ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; -- cgit v1.2.3 From ae424ae4b5bcd820ad6ee6f0b986c4e14ed4d6cf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 8 Dec 2006 02:36:08 -0800 Subject: [PATCH] make set_special_pids() static Make set_special_pids() static, the only caller is daemonize(). Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index fa235779b6a3..fa0495e167e9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -314,7 +314,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) } } -void set_special_pids(pid_t session, pid_t pgrp) +static void set_special_pids(pid_t session, pid_t pgrp) { write_lock_irq(&tasklist_lock); __set_special_pids(session, pgrp); -- cgit v1.2.3 From dae3c5a0b7052ad7dd9fa78c51ecfab828c5007b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 8 Dec 2006 02:36:09 -0800 Subject: [PATCH] sys_unshare: remove a broken CLONE_SIGHAND code sys_unshare(CLONE_SIGHAND) is broken, the code under 'if (new_sigh)' is never executed but very wrong. Just remove it to avoid a confusion, task_lock() has nothing to do with ->sighand changing. Also, change the comment in unshare_sighand(). Yes, CLONE_THREAD implies CLONE_SIGHAND, but still it looks confusing. Also, we don't need to check current->sighand != NULL. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7f2e31ba33af..f387a1393ca5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1544,15 +1544,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new } /* - * Unsharing of sighand for tasks created with CLONE_SIGHAND is not - * supported yet + * Unsharing of sighand is not supported yet */ static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) { struct sighand_struct *sigh = current->sighand; - if ((unshare_flags & CLONE_SIGHAND) && - (sigh && atomic_read(&sigh->count) > 1)) + if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) return -EINVAL; else return 0; @@ -1626,7 +1624,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) int err = 0; struct fs_struct *fs, *new_fs = NULL; struct namespace *ns, *new_ns = NULL; - struct sighand_struct *sigh, *new_sigh = NULL; + struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; @@ -1671,7 +1669,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } } - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || + if (new_fs || new_ns || new_mm || new_fd || new_ulist || new_uts || new_ipc) { task_lock(current); @@ -1693,12 +1691,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_ns = ns; } - if (new_sigh) { - sigh = current->sighand; - rcu_assign_pointer(current->sighand, new_sigh); - new_sigh = sigh; - } - if (new_mm) { mm = current->mm; active_mm = current->active_mm; -- cgit v1.2.3 From d63a5a74dee87883fda6b7d170244acaac5b05e8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 8 Dec 2006 02:36:17 -0800 Subject: [PATCH] lockdep: avoid lockdep warning in md md_open takes ->reconfig_mutex which causes lockdep to complain. This (normally) doesn't have deadlock potential as the possible conflict is with a reconfig_mutex in a different device. I say "normally" because if a loop were created in the array->member hierarchy a deadlock could happen. However that causes bigger problems than a deadlock and should be fixed independently. So we flag the lock in md_open as a nested lock. This requires defining mutex_lock_interruptible_nested. Cc: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/mutex.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 8c71cf72a497..e7cbbb82765b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_nested); + +int __sched +mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass); +} + +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); #endif /* -- cgit v1.2.3 From f3a43f3f64bff8e205c3702f6b4804d66e306848 Mon Sep 17 00:00:00 2001 From: "Josef \"Jeff\" Sipek" Date: Fri, 8 Dec 2006 02:36:43 -0800 Subject: [PATCH] kernel: change uses of f_{dentry, vfsmnt} to use f_path Change all the uses of f_{dentry,vfsmnt} to f_path.{dentry,mnt} in linux/kernel/. Signed-off-by: Josef "Jeff" Sipek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 14 +++++++------- kernel/fork.c | 2 +- kernel/futex.c | 10 +++++----- kernel/relay.c | 4 ++-- kernel/sysctl.c | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 69a40c9777a2..70d0d88e5554 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry, &sbuf)) + if (vfs_statfs(file->f_path.dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; @@ -194,7 +194,7 @@ static void acct_file_reopen(struct file *file) add_timer(&acct_globals.timer); } if (old_acct) { - mnt_unpin(old_acct->f_vfsmnt); + mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_globals.lock); do_acct_process(old_acct); filp_close(old_acct, NULL); @@ -212,7 +212,7 @@ static int acct_on(char *name) if (IS_ERR(file)) return PTR_ERR(file); - if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { filp_close(file, NULL); return -EACCES; } @@ -229,11 +229,11 @@ static int acct_on(char *name) } spin_lock(&acct_globals.lock); - mnt_pin(file->f_vfsmnt); + mnt_pin(file->f_path.mnt); acct_file_reopen(file); spin_unlock(&acct_globals.lock); - mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ + mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ return 0; } @@ -283,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name) void acct_auto_close_mnt(struct vfsmount *m) { spin_lock(&acct_globals.lock); - if (acct_globals.file && acct_globals.file->f_vfsmnt == m) + if (acct_globals.file && acct_globals.file->f_path.mnt == m) acct_file_reopen(NULL); spin_unlock(&acct_globals.lock); } @@ -299,7 +299,7 @@ void acct_auto_close(struct super_block *sb) { spin_lock(&acct_globals.lock); if (acct_globals.file && - acct_globals.file->f_vfsmnt->mnt_sb == sb) { + acct_globals.file->f_path.mnt->mnt_sb == sb) { acct_file_reopen(NULL); } spin_unlock(&acct_globals.lock); diff --git a/kernel/fork.c b/kernel/fork.c index f387a1393ca5..597707819327 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -252,7 +252,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) anon_vma_link(tmp); file = tmp->vm_file; if (file) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); diff --git a/kernel/futex.c b/kernel/futex.c index 95989a3b4168..5a737de857d3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) /* * Get parameters which are the keys for a futex. * - * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, + * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * @@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) /* * Linear file mappings are also simple. */ - key->shared.inode = vma->vm_file->f_dentry->d_inode; + key->shared.inode = vma->vm_file->f_path.dentry->d_inode; key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) @@ -1528,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal) goto out; } filp->f_op = &futex_fops; - filp->f_vfsmnt = mntget(futex_mnt); - filp->f_dentry = dget(futex_mnt->mnt_root); - filp->f_mapping = filp->f_dentry->d_inode->i_mapping; + filp->f_path.mnt = mntget(futex_mnt); + filp->f_path.dentry = dget(futex_mnt->mnt_root); + filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; if (signal) { err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); diff --git a/kernel/relay.c b/kernel/relay.c index 75a3a9a7efc2..818e514729cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -959,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, if (!desc->count) return 0; - mutex_lock(&filp->f_dentry->d_inode->i_mutex); + mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -979,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&filp->f_dentry->d_inode->i_mutex); + mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); return desc->written; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8e9f00fd6d18..9846db93a595 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1614,7 +1614,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, size_t count, loff_t *ppos) { int op; - struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); + struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); struct ctl_table *table; size_t res; ssize_t error = -ENOTDIR; -- cgit v1.2.3 From a7a005fd12b84392becca311f2a20d5bf2a1b7af Mon Sep 17 00:00:00 2001 From: Josef Sipek Date: Fri, 8 Dec 2006 02:37:17 -0800 Subject: [PATCH] struct path: convert kernel Signed-off-by: Josef Sipek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 4 ++-- kernel/cpuset.c | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b6cb802fbcd1..298897559ca4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -781,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { audit_log_d_path(ab, "exe=", - vma->vm_file->f_dentry, - vma->vm_file->f_vfsmnt); + vma->vm_file->f_path.dentry, + vma->vm_file->f_path.mnt); break; } vma = vma->vm_next; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0a6b4d89f9a0..2c3b4431472b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = { * * * When reading/writing to a file: - * - the cpuset to use in file->f_dentry->d_parent->d_fsdata - * - the 'cftype' of the file is file->f_dentry->d_fsdata + * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_path.dentry->d_fsdata */ struct cftype { @@ -1284,8 +1284,8 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); - struct cftype *cft = __d_cft(file->f_dentry); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); + struct cftype *cft = __d_cft(file->f_path.dentry); cpuset_filetype_t type = cft->private; char *buffer; char *pathbuf = NULL; @@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; @@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct cftype *cft = __d_cft(file->f_dentry); - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cftype *cft = __d_cft(file->f_path.dentry); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt loff_t *ppos) { ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; @@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) if (err) return err; - cft = __d_cft(file->f_dentry); + cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; if (cft->open) @@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) static int cpuset_file_release(struct inode *inode, struct file *file) { - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (cft->release) return cft->release(inode, file); return 0; @@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); struct ctr_struct *ctr; pid_t *pidarray; int npids; -- cgit v1.2.3 From 937949d9edbf4049bd41af6c9f92c26280584564 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Fri, 8 Dec 2006 02:37:54 -0800 Subject: [PATCH] add process_session() helper routine Replace occurences of task->signal->session by a new process_session() helper routine. It will be useful for pid namespaces to abstract the session pid number. Signed-off-by: Cedric Le Goater Cc: Kirill Korotaev Cc: Eric W. Biederman Cc: Herbert Poetzl Cc: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 19 ++++++++++--------- kernel/fork.c | 4 ++-- kernel/signal.c | 2 +- kernel/sys.c | 8 ++++---- 4 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index fa0495e167e9..8d289bfc13d1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -193,14 +193,14 @@ int session_of_pgrp(int pgrp) read_lock(&tasklist_lock); do_each_task_pid(pgrp, PIDTYPE_PGID, p) { - if (p->signal->session > 0) { - sid = p->signal->session; + if (process_session(p) > 0) { + sid = process_session(p); goto out; } } while_each_task_pid(pgrp, PIDTYPE_PGID, p); p = find_task_by_pid(pgrp); if (p) - sid = p->signal->session; + sid = process_session(p); out: read_unlock(&tasklist_lock); @@ -225,8 +225,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) || p->exit_state || is_init(p->real_parent)) continue; - if (process_group(p->real_parent) != pgrp - && p->real_parent->signal->session == p->signal->session) { + if (process_group(p->real_parent) != pgrp && + process_session(p->real_parent) == process_session(p)) { ret = 0; break; } @@ -302,7 +302,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current->group_leader; - if (curr->signal->session != session) { + if (process_session(curr) != session) { detach_pid(curr, PIDTYPE_SID); curr->signal->session = session; attach_pid(curr, PIDTYPE_SID, session); @@ -647,10 +647,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * outside, so the child pgrp is now orphaned. */ if ((process_group(p) != process_group(father)) && - (p->signal->session == father->signal->session)) { + (process_session(p) == process_session(father))) { int pgrp = process_group(p); - if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { + if (will_become_orphaned_pgrp(pgrp, NULL) && + has_stopped_jobs(pgrp)) { __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } @@ -784,7 +785,7 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; if ((process_group(t) != process_group(tsk)) && - (t->signal->session == tsk->signal->session) && + (process_session(t) == process_session(tsk)) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); diff --git a/kernel/fork.c b/kernel/fork.c index 597707819327..298c4d6ab512 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1259,9 +1259,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { p->signal->tty = current->signal->tty; p->signal->pgrp = process_group(current); - p->signal->session = current->signal->session; + p->signal->session = process_session(current); attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, p->signal->session); + attach_pid(p, PIDTYPE_SID, process_session(p)); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; diff --git a/kernel/signal.c b/kernel/signal.c index ec81defde339..9eac4db60eda 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -583,7 +583,7 @@ static int check_kill_permission(int sig, struct siginfo *info, error = -EPERM; if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || - (current->signal->session != t->signal->session)) + (process_session(current) != process_session(t))) && (current->euid ^ t->suid) && (current->euid ^ t->uid) && (current->uid ^ t->suid) && (current->uid ^ t->uid) && !capable(CAP_KILL)) diff --git a/kernel/sys.c b/kernel/sys.c index 1ac2d1c5d84e..4f9d23a3095f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (p->real_parent == group_leader) { err = -EPERM; - if (p->signal->session != group_leader->signal->session) + if (process_session(p) != process_session(group_leader)) goto out; err = -EACCES; if (p->did_exec) @@ -1400,7 +1400,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) struct task_struct *p; do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (p->signal->session == group_leader->signal->session) + if (process_session(p) == process_session(group_leader)) goto ok_pgid; } while_each_task_pid(pgid, PIDTYPE_PGID, p); goto out; @@ -1459,7 +1459,7 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { if (!pid) - return current->signal->session; + return process_session(current); else { int retval; struct task_struct *p; @@ -1471,7 +1471,7 @@ asmlinkage long sys_getsid(pid_t pid) if (p) { retval = security_task_getsid(p); if (!retval) - retval = p->signal->session; + retval = process_session(p); } read_unlock(&tasklist_lock); return retval; -- cgit v1.2.3 From 1ec320afdc9552c92191d5f89fcd1ebe588334ca Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Fri, 8 Dec 2006 02:37:55 -0800 Subject: [PATCH] add process_session() helper routine: deprecate old field Add an anonymous union and ((deprecated)) to catch direct usage of the session field. [akpm@osdl.org: fix various missed conversions] [jdike@addtoit.com: fix UML bug] Signed-off-by: Jeff Dike Cc: Cedric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/fork.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 8d289bfc13d1..6267a6cc6113 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -304,7 +304,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) if (process_session(curr) != session) { detach_pid(curr, PIDTYPE_SID); - curr->signal->session = session; + set_signal_session(curr->signal, session); attach_pid(curr, PIDTYPE_SID, session); } if (process_group(curr) != pgrp) { diff --git a/kernel/fork.c b/kernel/fork.c index 298c4d6ab512..60d2644bfe85 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1259,7 +1259,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { p->signal->tty = current->signal->tty; p->signal->pgrp = process_group(current); - p->signal->session = process_session(current); + set_signal_session(p->signal, process_session(current)); attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, process_session(p)); -- cgit v1.2.3 From 6b3286ed1169d74fea401367d6d4d6c6ec758a81 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Fri, 8 Dec 2006 02:37:56 -0800 Subject: [PATCH] rename struct namespace to struct mnt_namespace Rename 'struct namespace' to 'struct mnt_namespace' to avoid confusion with other namespaces being developped for the containers : pid, uts, ipc, etc. 'namespace' variables and attributes are also renamed to 'mnt_ns' Signed-off-by: Kirill Korotaev Signed-off-by: Cedric Le Goater Cc: Eric W. Biederman Cc: Herbert Poetzl Cc: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/fork.c | 21 +++++++++++---------- kernel/kmod.c | 2 +- kernel/nsproxy.c | 16 ++++++++-------- 4 files changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 6267a6cc6113..28d9feedfd27 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 60d2644bfe85..8c859eef8e6a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -1525,17 +1525,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unshare the namespace structure if it is being shared + * Unshare the mnt_namespace structure if it is being shared */ -static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) +static int unshare_mnt_namespace(unsigned long unshare_flags, + struct mnt_namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->nsproxy->namespace; + struct mnt_namespace *ns = current->nsproxy->mnt_ns; if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); + *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); if (!*new_nsp) return -ENOMEM; } @@ -1623,7 +1624,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct namespace *ns, *new_ns = NULL; + struct mnt_namespace *ns, *new_ns = NULL; struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; @@ -1645,7 +1646,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) + if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) goto bad_unshare_cleanup_ns; @@ -1686,8 +1687,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } if (new_ns) { - ns = current->nsproxy->namespace; - current->nsproxy->namespace = new_ns; + ns = current->nsproxy->mnt_ns; + current->nsproxy->mnt_ns = new_ns; new_ns = ns; } @@ -1748,7 +1749,7 @@ bad_unshare_cleanup_sigh: bad_unshare_cleanup_ns: if (new_ns) - put_namespace(new_ns); + put_mnt_ns(new_ns); bad_unshare_cleanup_fs: if (new_fs) diff --git a/kernel/kmod.c b/kernel/kmod.c index 8d2bea09a4ec..3a7379aa31ca 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 674aceb7335a..bd9cb435dfe0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); @@ -60,8 +60,8 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) struct nsproxy *ns = clone_namespaces(orig); if (ns) { - if (ns->namespace) - get_namespace(ns->namespace); + if (ns->mnt_ns) + get_mnt_ns(ns->mnt_ns); if (ns->uts_ns) get_uts_ns(ns->uts_ns); if (ns->ipc_ns) @@ -97,7 +97,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) tsk->nsproxy = new_ns; - err = copy_namespace(flags, tsk); + err = copy_mnt_ns(flags, tsk); if (err) goto out_ns; @@ -117,8 +117,8 @@ out_ipc: if (new_ns->uts_ns) put_uts_ns(new_ns->uts_ns); out_uts: - if (new_ns->namespace) - put_namespace(new_ns->namespace); + if (new_ns->mnt_ns) + put_mnt_ns(new_ns->mnt_ns); out_ns: tsk->nsproxy = old_ns; kfree(new_ns); @@ -127,8 +127,8 @@ out_ns: void free_nsproxy(struct nsproxy *ns) { - if (ns->namespace) - put_namespace(ns->namespace); + if (ns->mnt_ns) + put_mnt_ns(ns->mnt_ns); if (ns->uts_ns) put_uts_ns(ns->uts_ns); if (ns->ipc_ns) -- cgit v1.2.3 From 373beb35cd6b625e0ba4ad98baace12310a26aa8 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Fri, 8 Dec 2006 02:37:57 -0800 Subject: [PATCH] identifier to nsproxy Add an identifier to nsproxy. The default init_ns_proxy has identifier 0 and allocated nsproxies are given -1. This identifier will be used by a new syscall sys_bind_ns. Signed-off-by: Cedric Le Goater Cc: Kirill Korotaev Cc: Eric W. Biederman Cc: Herbert Poetzl Cc: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index bd9cb435dfe0..f223c15c18e9 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -45,8 +45,10 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) struct nsproxy *ns; ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); - if (ns) + if (ns) { atomic_set(&ns->count, 1); + ns->id = -1; + } return ns; } -- cgit v1.2.3 From 61a58c6c238cc81f7742b8cc84212cc55fb57747 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 8 Dec 2006 02:37:58 -0800 Subject: [PATCH] rename struct pspace to struct pid_namespace Rename struct pspace to struct pid_namespace for consistency with other namespaces (uts_namespace and ipc_namespace). Also rename include/linux/pspace.h to include/linux/pid_namespace.h and variables from pspace to pid_ns. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Cedric Le Goater Cc: Kirill Korotaev Cc: Eric W. Biederman Cc: Herbert Poetzl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index a48879b0b921..25807e1b98dd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; @@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT; #define BITS_PER_PAGE (PAGE_SIZE*8) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) +static inline int mk_pid(struct pid_namespace *pid_ns, + struct pidmap *map, int off) { - return (map - pspace->pidmap)*BITS_PER_PAGE + off; + return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; } #define find_next_offset(map, off) \ @@ -57,7 +58,7 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ -struct pspace init_pspace = { +struct pid_namespace init_pid_ns = { .pidmap = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, @@ -80,25 +81,25 @@ struct pspace init_pspace = { static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(struct pspace *pspace, int pid) +static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) { - struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; + struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; clear_bit(offset, map->page); atomic_inc(&map->nr_free); } -static int alloc_pidmap(struct pspace *pspace) +static int alloc_pidmap(struct pid_namespace *pid_ns) { - int i, offset, max_scan, pid, last = pspace->last_pid; + int i, offset, max_scan, pid, last = pid_ns->last_pid; struct pidmap *map; pid = last + 1; if (pid >= pid_max) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; - map = &pspace->pidmap[pid/BITS_PER_PAGE]; + map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { @@ -120,11 +121,11 @@ static int alloc_pidmap(struct pspace *pspace) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - pspace->last_pid = pid; + pid_ns->last_pid = pid; return pid; } offset = find_next_offset(map, offset); - pid = mk_pid(pspace, map, offset); + pid = mk_pid(pid_ns, map, offset); /* * find_next_offset() found a bit, the pid from it * is in-bounds, and if we fell back to the last @@ -135,34 +136,34 @@ static int alloc_pidmap(struct pspace *pspace) (i != max_scan || pid < last || !((last+1) & BITS_PER_PAGE_MASK))); } - if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { + if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; offset = 0; } else { - map = &pspace->pidmap[0]; + map = &pid_ns->pidmap[0]; offset = RESERVED_PIDS; if (unlikely(last == offset)) break; } - pid = mk_pid(pspace, map, offset); + pid = mk_pid(pid_ns, map, offset); } return -1; } -static int next_pidmap(struct pspace *pspace, int last) +static int next_pidmap(struct pid_namespace *pid_ns, int last) { int offset; struct pidmap *map, *end; offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pspace->pidmap[PIDMAP_ENTRIES]; + map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; + end = &pid_ns->pidmap[PIDMAP_ENTRIES]; for (; map < end; map++, offset = 0) { if (unlikely(!map->page)) continue; offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); if (offset < BITS_PER_PAGE) - return mk_pid(pspace, map, offset); + return mk_pid(pid_ns, map, offset); } return -1; } @@ -192,7 +193,7 @@ fastcall void free_pid(struct pid *pid) hlist_del_rcu(&pid->pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - free_pidmap(&init_pspace, pid->nr); + free_pidmap(&init_pid_ns, pid->nr); call_rcu(&pid->rcu, delayed_put_pid); } @@ -206,7 +207,7 @@ struct pid *alloc_pid(void) if (!pid) goto out; - nr = alloc_pidmap(&init_pspace); + nr = alloc_pidmap(&init_pid_ns); if (nr < 0) goto out_free; @@ -348,7 +349,7 @@ struct pid *find_ge_pid(int nr) pid = find_pid(nr); if (pid) break; - nr = next_pidmap(&init_pspace, nr); + nr = next_pidmap(&init_pid_ns, nr); } while (nr > 0); return pid; @@ -382,10 +383,10 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); + init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, init_pspace.pidmap[0].page); - atomic_dec(&init_pspace.pidmap[0].nr_free); + set_bit(0, init_pid_ns.pidmap[0].page); + atomic_dec(&init_pid_ns.pidmap[0].nr_free); pid_cachep = kmem_cache_create("pid", sizeof(struct pid), __alignof__(struct pid), -- cgit v1.2.3 From 9a575a92db3312a40cdf0b0406d88de88ad9741e Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Fri, 8 Dec 2006 02:37:59 -0800 Subject: [PATCH] to nsproxy Add the pid namespace framework to the nsproxy object. The copy of the pid namespace only increases the refcount on the global pid namespace, init_pid_ns, and unshare is not implemented. There is no configuration option to activate or deactivate this feature because this not relevant for the moment. Signed-off-by: Cedric Le Goater Cc: Kirill Korotaev Cc: Eric W. Biederman Cc: Herbert Poetzl Cc: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 26 +++++++++++++++++++------- kernel/pid.c | 23 +++++++++++++++++++++++ 2 files changed, 42 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f223c15c18e9..e2ce748e96af 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -19,6 +19,7 @@ #include #include #include +#include struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); @@ -68,6 +69,8 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) get_uts_ns(ns->uts_ns); if (ns->ipc_ns) get_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + get_pid_ns(ns->pid_ns); } return ns; @@ -111,10 +114,17 @@ int copy_namespaces(int flags, struct task_struct *tsk) if (err) goto out_ipc; + err = copy_pid_ns(flags, tsk); + if (err) + goto out_pid; + out: put_nsproxy(old_ns); return err; +out_pid: + if (new_ns->ipc_ns) + put_ipc_ns(new_ns->ipc_ns); out_ipc: if (new_ns->uts_ns) put_uts_ns(new_ns->uts_ns); @@ -129,11 +139,13 @@ out_ns: void free_nsproxy(struct nsproxy *ns) { - if (ns->mnt_ns) - put_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - put_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - put_ipc_ns(ns->ipc_ns); - kfree(ns); + if (ns->mnt_ns) + put_mnt_ns(ns->mnt_ns); + if (ns->uts_ns) + put_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + put_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + put_pid_ns(ns->pid_ns); + kfree(ns); } diff --git a/kernel/pid.c b/kernel/pid.c index 25807e1b98dd..5319b9f2fc5e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -59,6 +59,9 @@ static inline int mk_pid(struct pid_namespace *pid_ns, * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, .pidmap = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, @@ -356,6 +359,26 @@ struct pid *find_ge_pid(int nr) } EXPORT_SYMBOL_GPL(find_get_pid); +int copy_pid_ns(int flags, struct task_struct *tsk) +{ + struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_pid_ns(old_ns); + return err; +} + +void free_pid_ns(struct kref *kref) +{ + struct pid_namespace *ns; + + ns = container_of(kref, struct pid_namespace, kref); + kfree(ns); +} + /* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or -- cgit v1.2.3 From 6cc1b22a4acef3816eaa5f8c227d93d749b23195 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Fri, 8 Dec 2006 02:38:00 -0800 Subject: [PATCH] use current->nsproxy->pid_ns Signed-off-by: Cedric Le Goater Cc: Kirill Korotaev Cc: Eric W. Biederman Cc: Herbert Poetzl Cc: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 5319b9f2fc5e..1d9cc268b499 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -196,7 +196,7 @@ fastcall void free_pid(struct pid *pid) hlist_del_rcu(&pid->pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - free_pidmap(&init_pid_ns, pid->nr); + free_pidmap(current->nsproxy->pid_ns, pid->nr); call_rcu(&pid->rcu, delayed_put_pid); } @@ -210,7 +210,7 @@ struct pid *alloc_pid(void) if (!pid) goto out; - nr = alloc_pidmap(&init_pid_ns); + nr = alloc_pidmap(current->nsproxy->pid_ns); if (nr < 0) goto out_free; @@ -352,7 +352,7 @@ struct pid *find_ge_pid(int nr) pid = find_pid(nr); if (pid) break; - nr = next_pidmap(&init_pid_ns, nr); + nr = next_pidmap(current->nsproxy->pid_ns, nr); } while (nr > 0); return pid; -- cgit v1.2.3 From 84d737866e2babdeab0c6b18ea155c6a649663b8 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 8 Dec 2006 02:38:01 -0800 Subject: [PATCH] add child reaper to pid_namespace Add a per pid_namespace child-reaper. This is needed so processes are reaped within the same pid space and do not spill over to the parent pid space. Its also needed so containers preserve existing semantic that pid == 1 would reap orphaned children. This is based on Eric Biederman's patch: http://lkml.org/lkml/2006/2/6/285 Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Cedric Le Goater Cc: Kirill Korotaev Cc: Eric W. Biederman Cc: Herbert Poetzl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 23 +++++++++++++++-------- kernel/pid.c | 3 ++- kernel/signal.c | 11 +++++++++-- 3 files changed, 26 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 28d9feedfd27..fd0e067952ab 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,7 +49,6 @@ #include extern void sem_exit (void); -extern struct task_struct *child_reaper; static void exit_mm(struct task_struct * tsk); @@ -260,7 +260,8 @@ static int has_stopped_jobs(int pgrp) } /** - * reparent_to_init - Reparent the calling kernel thread to the init task. + * reparent_to_init - Reparent the calling kernel thread to the init task + * of the pid space that the thread belongs to. * * If a kernel thread is launched as a result of a system call, or if * it ever exits, it should generally reparent itself to init so that @@ -278,8 +279,8 @@ static void reparent_to_init(void) ptrace_unlink(current); /* Reparent to init */ remove_parent(current); - current->parent = child_reaper; - current->real_parent = child_reaper; + current->parent = child_reaper(current); + current->real_parent = child_reaper(current); add_parent(current); /* Set the exit signal to SIGCHLD so we signal init on exit */ @@ -662,7 +663,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * When we die, we re-parent all our children. * Try to give them to another thread in our thread * group, and if no such member exists, give it to - * the global child reaper process (ie "init") + * the child reaper process (ie "init") in our pid + * space. */ static void forget_original_parent(struct task_struct *father, struct list_head *to_release) @@ -673,7 +675,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release) do { reaper = next_thread(reaper); if (reaper == father) { - reaper = child_reaper; + reaper = child_reaper(father); break; } } while (reaper->exit_state); @@ -859,8 +861,13 @@ fastcall NORET_TYPE void do_exit(long code) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(tsk == child_reaper)) - panic("Attempted to kill init!"); + if (unlikely(tsk == child_reaper(tsk))) { + if (tsk->nsproxy->pid_ns != &init_pid_ns) + tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper; + else + panic("Attempted to kill init!"); + } + if (unlikely(current->ptrace & PT_TRACE_EXIT)) { current->ptrace_message = code; diff --git a/kernel/pid.c b/kernel/pid.c index 1d9cc268b499..2efe9d8d367b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -65,7 +65,8 @@ struct pid_namespace init_pid_ns = { .pidmap = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, - .last_pid = 0 + .last_pid = 0, + .child_reaper = &init_task }; /* diff --git a/kernel/signal.c b/kernel/signal.c index 9eac4db60eda..1921ffdc5e77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -24,6 +24,9 @@ #include #include #include +#include +#include + #include #include #include @@ -1877,8 +1880,12 @@ relock: if (sig_kernel_ignore(signr)) /* Default is nothing. */ continue; - /* Init gets no signals it doesn't want. */ - if (current == child_reaper) + /* + * Init of a pid space gets no signals it doesn't want from + * within that pid space. It can of course get signals from + * its parent pid space. + */ + if (current == child_reaper(current)) continue; if (sig_kernel_stop(signr)) { -- cgit v1.2.3 From f020bc468fe4a91d32046d448511978c7b611315 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 8 Dec 2006 02:38:02 -0800 Subject: [PATCH] sys_setpgid: eliminate unnecessary do_each_task_pid(PIDTYPE_PGID) All tasks in the process group have the same sid, we don't need to iterate them all to check that the caller of sys_setpgid() doesn't change its session. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 4f9d23a3095f..c7675c1bfdf2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) goto out; if (pgid != pid) { - struct task_struct *p; + struct task_struct *g = + find_task_by_pid_type(PIDTYPE_PGID, pgid); - do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (process_session(p) == process_session(group_leader)) - goto ok_pgid; - } while_each_task_pid(pgid, PIDTYPE_PGID, p); - goto out; + if (!g || process_session(g) != process_session(group_leader)) + goto out; } -ok_pgid: err = security_task_setpgid(p, pgid); if (err) goto out; -- cgit v1.2.3 From 62dfb5541a025b47df9405ff0219c7829a97d83b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 8 Dec 2006 02:38:03 -0800 Subject: [PATCH] session_of_pgrp: kill unnecessary do_each_task_pid(PIDTYPE_PGID) All members of the process group have the same sid and it can't be == 0. NOTE: this code (and a similar one in sys_setpgid) was needed because it was possibe to have ->session == 0. It's not possible any longer since [PATCH] pidhash: don't use zero pids Commit: c7c6464117a02b0d54feb4ebeca4db70fa493678 Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index fd0e067952ab..03e64fe4a14a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -189,21 +189,18 @@ repeat: int session_of_pgrp(int pgrp) { struct task_struct *p; - int sid = -1; + int sid = 0; read_lock(&tasklist_lock); - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { - if (process_session(p) > 0) { - sid = process_session(p); - goto out; - } - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); - p = find_task_by_pid(pgrp); - if (p) + + p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); + if (p == NULL) + p = find_task_by_pid(pgrp); + if (p != NULL) sid = process_session(p); -out: + read_unlock(&tasklist_lock); - + return sid; } -- cgit v1.2.3 From cf9f151c7257683f489df85f94baf408d1d5694a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 Dec 2006 02:39:55 -0800 Subject: [PATCH] sysctl: simplify sysctl_uts_string The binary interface to the namespace sysctls was never implemented resulting in some really weird things if you attempted to use sys_sysctl to read your hostname for example. This patch series simples the code a little and implements the binary sysctl interface. In testing this patch series I discovered that our 32bit compatibility for the binary sysctl interface is imperfect. In particular KERN_SHMMAX and KERN_SMMALL are size_t sized quantities and are returned as 8 bytes on to 32bit binaries using a x86_64 kernel. However this has existing for a long time so it is not a new regression with the namespace work. Gads the whole sysctl thing needs work before it stops being easy to shoot yourself in the foot. Looking forward a little bit we need a better way to handle sysctls and namespaces as our current technique will not work for the network namespace. I think something based on the current overlapping sysctl trees will work but the proc side needs to be redone before we can use it. This patch: Introduce get_uts() and put_uts() (used later) and remove most of the special cases for when UTS namespace is compiled in. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 128 ++++++++++++-------------------------------------------- 1 file changed, 26 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9846db93a595..c8bcbfb2e069 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -163,6 +163,28 @@ extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif +static void *get_uts(ctl_table *table, int write) +{ + char *which = table->data; +#ifdef CONFIG_UTS_NS + struct uts_namespace *uts_ns = current->nsproxy->uts_ns; + which = (which - (char *)&init_uts_ns) + (char *)uts_ns; +#endif + if (!write) + down_read(&uts_sem); + else + down_write(&uts_sem); + return which; +} + +static void put_uts(ctl_table *table, int write, void *which) +{ + if (!write) + up_read(&uts_sem); + else + up_write(&uts_sem); +} + /* /proc declarations: */ #ifdef CONFIG_PROC_SYSCTL @@ -229,7 +251,6 @@ static ctl_table root_table[] = { }; static ctl_table kern_table[] = { -#ifndef CONFIG_UTS_NS { .ctl_name = KERN_OSTYPE, .procname = "ostype", @@ -275,54 +296,6 @@ static ctl_table kern_table[] = { .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, -#else /* !CONFIG_UTS_NS */ - { - .ctl_name = KERN_OSTYPE, - .procname = "ostype", - .data = NULL, - /* could maybe use __NEW_UTS_LEN here? */ - .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_OSRELEASE, - .procname = "osrelease", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, release), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_VERSION, - .procname = "version", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, version), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_NODENAME, - .procname = "hostname", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), - .mode = 0644, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_DOMAINNAME, - .procname = "domainname", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), - .mode = 0644, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, -#endif /* !CONFIG_UTS_NS */ { .ctl_name = KERN_PANIC, .procname = "panic", @@ -1753,66 +1726,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ - -#ifndef CONFIG_UTS_NS -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int r; - if (!write) { - down_read(&uts_sem); - r=proc_dostring(table,0,filp,buffer,lenp, ppos); - up_read(&uts_sem); - } else { - down_write(&uts_sem); - r=proc_dostring(table,1,filp,buffer,lenp, ppos); - up_write(&uts_sem); - } - return r; -} -#else /* !CONFIG_UTS_NS */ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int r; - struct uts_namespace* uts_ns = current->nsproxy->uts_ns; - char* which; - - switch (table->ctl_name) { - case KERN_OSTYPE: - which = uts_ns->name.sysname; - break; - case KERN_NODENAME: - which = uts_ns->name.nodename; - break; - case KERN_OSRELEASE: - which = uts_ns->name.release; - break; - case KERN_VERSION: - which = uts_ns->name.version; - break; - case KERN_DOMAINNAME: - which = uts_ns->name.domainname; - break; - default: - r = -EINVAL; - goto out; - } - - if (!write) { - down_read(&uts_sem); - r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); - up_read(&uts_sem); - } else { - down_write(&uts_sem); - r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); - up_write(&uts_sem); - } - out: + void *which; + which = get_uts(table, write); + r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); + put_uts(table, write, which); return r; } -#endif /* !CONFIG_UTS_NS */ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, int *valp, -- cgit v1.2.3 From c4b8b769fa9051838d2772886ecd0ee2a926ddc3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 Dec 2006 02:39:55 -0800 Subject: [PATCH] sysctl: implement sysctl_uts_string() The problem: When using sys_sysctl we don't read the proper values for the variables exported from the uts namespace, nor do we do the proper locking. This patch introduces sysctl_uts_string which properly fetches the values and does the proper locking. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8bcbfb2e069..77ea4fc386ef 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -137,6 +137,10 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context); + #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -258,7 +262,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.sysname), .mode = 0444, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_OSRELEASE, @@ -267,7 +271,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.release), .mode = 0444, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_VERSION, @@ -276,7 +280,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.version), .mode = 0444, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_NODENAME, @@ -285,7 +289,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_DOMAINNAME, @@ -294,7 +298,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_PANIC, @@ -2598,6 +2602,23 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return 1; } + +/* The generic string strategy routine: */ +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + struct ctl_table uts_table; + int r, write; + write = newval && newlen; + memcpy(&uts_table, table, sizeof(uts_table)); + uts_table.data = get_uts(table, write); + r = sysctl_string(&uts_table, name, nlen, + oldval, oldlenp, newval, newlen, context); + put_uts(table, write, uts_table.data); + return r; +} + #else /* CONFIG_SYSCTL_SYSCALL */ @@ -2662,6 +2683,12 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return -ENOSYS; } +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} #endif /* CONFIG_SYSCTL_SYSCALL */ /* -- cgit v1.2.3 From 9bc9a6bd3cf559bffe962c51efb062e8b5270ca9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 Dec 2006 02:39:56 -0800 Subject: [PATCH] sysctl: simplify ipc ns specific sysctls Refactor the ipc sysctl support so that it is simpler, more readable, and prepares for fixing the bug with the wrong values being returned in the sys_sysctl interface. The function proc_do_ipc_string() was misnamed as it never handled strings. It's magic of when to work with strings and when to work with longs belonged in the sysctl table. I couldn't tell if the code would work if you disabled the ipc namespace but it certainly looked like it would have problems. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 106 ++++++++++++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 77ea4fc386ef..10474a63a111 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -92,7 +92,9 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif #ifdef CONFIG_SYSVIPC -static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -189,6 +191,18 @@ static void put_uts(ctl_table *table, int write, void *which) up_write(&uts_sem); } +#ifdef CONFIG_SYSVIPC +static void *get_ipc(ctl_table *table, int write) +{ + char *which = table->data; + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; + which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; + return which; +} +#else +#define get_ipc(T,W) ((T)->data) +#endif + /* /proc declarations: */ #ifdef CONFIG_PROC_SYSCTL @@ -458,58 +472,58 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_SHMMAX, .procname = "shmmax", - .data = NULL, - .maxlen = sizeof (size_t), + .data = &init_ipc_ns.shm_ctlmax, + .maxlen = sizeof (init_ipc_ns.shm_ctlmax), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_doulongvec_minmax, }, { .ctl_name = KERN_SHMALL, .procname = "shmall", - .data = NULL, - .maxlen = sizeof (size_t), + .data = &init_ipc_ns.shm_ctlall, + .maxlen = sizeof (init_ipc_ns.shm_ctlall), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_doulongvec_minmax, }, { .ctl_name = KERN_SHMMNI, .procname = "shmmni", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.shm_ctlmni, + .maxlen = sizeof (init_ipc_ns.shm_ctlmni), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, }, { .ctl_name = KERN_MSGMAX, .procname = "msgmax", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmax, + .maxlen = sizeof (init_ipc_ns.msg_ctlmax), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, }, { .ctl_name = KERN_MSGMNI, .procname = "msgmni", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmni, + .maxlen = sizeof (init_ipc_ns.msg_ctlmni), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, }, { .ctl_name = KERN_MSGMNB, .procname = "msgmnb", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmnb, + .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, }, { .ctl_name = KERN_SEM, .procname = "sem", - .data = NULL, + .data = &init_ipc_ns.sem_ctls, .maxlen = 4*sizeof (int), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, }, #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -2319,46 +2333,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, } #ifdef CONFIG_SYSVIPC -static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - void *data; - struct ipc_namespace *ns; - - ns = current->nsproxy->ipc_ns; - - switch (table->ctl_name) { - case KERN_SHMMAX: - data = &ns->shm_ctlmax; - goto proc_minmax; - case KERN_SHMALL: - data = &ns->shm_ctlall; - goto proc_minmax; - case KERN_SHMMNI: - data = &ns->shm_ctlmni; - break; - case KERN_MSGMAX: - data = &ns->msg_ctlmax; - break; - case KERN_MSGMNI: - data = &ns->msg_ctlmni; - break; - case KERN_MSGMNB: - data = &ns->msg_ctlmnb; - break; - case KERN_SEM: - data = &ns->sem_ctls; - break; - default: - return -EINVAL; - } - - return __do_proc_dointvec(data, table, write, filp, buffer, + void *which; + which = get_ipc(table, write); + return __do_proc_dointvec(which, table, write, filp, buffer, lenp, ppos, NULL, NULL); -proc_minmax: - return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, +} + +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) +{ + void *which; + which = get_ipc(table, write); + return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, lenp, ppos, 1l, 1l); } + #endif static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, -- cgit v1.2.3 From 6b49a257850fb8ad91f4c76bb712e9213141a34a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 Dec 2006 02:39:57 -0800 Subject: [PATCH] sysctl: fix sys_sysctl interface of ipc sysctls Currently there is a regression and the ipc sysctls don't show up in the binary sysctl namespace. This patch adds sysctl_ipc_data to read data/write from the appropriate namespace and deliver it in the expected manner. [akpm@osdl.org: warning fix] Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 10474a63a111..025fcb3c66f8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -143,6 +143,12 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, void **context); +#ifdef CONFIG_SYSVIPC +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context); +#endif + #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -476,6 +482,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof (init_ipc_ns.shm_ctlmax), .mode = 0644, .proc_handler = &proc_ipc_doulongvec_minmax, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SHMALL, @@ -484,6 +491,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof (init_ipc_ns.shm_ctlall), .mode = 0644, .proc_handler = &proc_ipc_doulongvec_minmax, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SHMMNI, @@ -492,6 +500,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof (init_ipc_ns.shm_ctlmni), .mode = 0644, .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMAX, @@ -500,6 +509,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof (init_ipc_ns.msg_ctlmax), .mode = 0644, .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMNI, @@ -508,6 +518,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof (init_ipc_ns.msg_ctlmni), .mode = 0644, .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMNB, @@ -516,6 +527,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), .mode = 0644, .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SEM, @@ -524,6 +536,7 @@ static ctl_table kern_table[] = { .maxlen = 4*sizeof (int), .mode = 0644, .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -2611,6 +2624,47 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, return r; } +#ifdef CONFIG_SYSVIPC +/* The generic sysctl ipc data routine. */ +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + size_t len; + void *data; + + /* Get out of I don't have a variable */ + if (!table->data || !table->maxlen) + return -ENOTDIR; + + data = get_ipc(table, 1); + if (!data) + return -ENOTDIR; + + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, data, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + if (newval && newlen) { + if (newlen > table->maxlen) + newlen = table->maxlen; + + if (copy_from_user(data, newval, newlen)) + return -EFAULT; + } + return 1; +} +#endif + #else /* CONFIG_SYSCTL_SYSCALL */ @@ -2681,6 +2735,12 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, { return -ENOSYS; } +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} #endif /* CONFIG_SYSCTL_SYSCALL */ /* -- cgit v1.2.3 From 4594bf159f1962cec3b727954b7c598b07e2e737 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 7 Dec 2006 11:33:26 +0000 Subject: [PATCH] WorkStruct: Use direct assignment rather than cmpxchg() Use direct assignment rather than cmpxchg() as the latter is unavailable and unimplementable on some platforms and is actually unnecessary. The use of cmpxchg() was to guard against two possibilities, neither of which can actually occur: (1) The pending flag may have been unset or may be cleared. However, given where it's called, the pending flag is _always_ set. I don't think it can be unset whilst we're in set_wq_data(). Once the work is enqueued to be actually run, the only way off the queue is for it to be actually run. If it's a delayed work item, then the bit can't be cleared by the timer because we haven't started the timer yet. Also, the pending bit can't be cleared by cancelling the delayed work _until_ the work item has had its timer started. (2) The workqueue pointer might change. This can only happen in two cases: (a) The work item has just been queued to actually run, and so we're protected by the appropriate workqueue spinlock. (b) A delayed work item is being queued, and so the timer hasn't been started yet, and so no one else knows about the work item or can access it (the pending bit protects us). Besides, set_wq_data() _sets_ the workqueue pointer unconditionally, so it can be assigned instead. So, replacing the set_wq_data() with a straight assignment would be okay in most cases. The problem is where we end up tangling with test_and_set_bit() emulated using spinlocks, and even then it's not a problem _provided_ test_and_set_bit() doesn't attempt to modify the word if the bit was set. If that's a problem, then a bitops-proofed assignment will be required - equivalent to atomic_set() vs other atomic_xxx() ops. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6b186750e9be..db49886bfae1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -85,22 +85,19 @@ static inline int is_single_threaded(struct workqueue_struct *wq) return list_empty(&wq->list); } +/* + * Set the workqueue on which a work item is to be run + * - Must *only* be called if the pending flag is set + */ static inline void set_wq_data(struct work_struct *work, void *wq) { - unsigned long new, old, res; + unsigned long new; + + BUG_ON(!work_pending(work)); - /* assume the pending flag is already set and that the task has already - * been queued on this workqueue */ new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); - res = work->management; - if (res != new) { - do { - old = res; - new = (unsigned long) wq; - new |= (old & WORK_STRUCT_FLAG_MASK); - res = cmpxchg(&work->management, old, new); - } while (res != old); - } + new |= work->management & WORK_STRUCT_FLAG_MASK; + work->management = new; } static inline void *get_wq_data(struct work_struct *work) -- cgit v1.2.3 From d53ef07ab45085c0b06b652d588aa49b8ba41458 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 10 Dec 2006 02:18:36 -0800 Subject: [PATCH] ipc-procfs-sysctl mixups When CONFIG_PROC_FS=n and CONFIG_PROC_SYSCTL=n but CONFIG_SYSVIPC=y, we get this build error: kernel/built-in.o:(.data+0xc38): undefined reference to `proc_ipc_doulongvec_minmax' kernel/built-in.o:(.data+0xc88): undefined reference to `proc_ipc_doulongvec_minmax' kernel/built-in.o:(.data+0xcd8): undefined reference to `proc_ipc_dointvec' kernel/built-in.o:(.data+0xd28): undefined reference to `proc_ipc_dointvec' kernel/built-in.o:(.data+0xd78): undefined reference to `proc_ipc_dointvec' kernel/built-in.o:(.data+0xdc8): undefined reference to `proc_ipc_dointvec' kernel/built-in.o:(.data+0xe18): undefined reference to `proc_ipc_dointvec' make: *** [vmlinux] Error 1 Signed-off-by: Randy Dunlap Acked-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 025fcb3c66f8..1c5697e3521e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2408,6 +2408,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, { return -ENOSYS; } +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif int proc_dointvec(ctl_table *table, int write, struct file *filp, -- cgit v1.2.3 From 98d7340c360993fdd703609ff7462051e03cc2fb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 10 Dec 2006 02:19:09 -0800 Subject: [PATCH] sysctl: remove some OPs kernel.cap-bound uses only OP_SET and OP_AND Signed-off-by: Alexey Dobriyan Cc: "Eric W. Biederman" Cc: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1c5697e3521e..c86445a62af2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1931,9 +1931,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, #define OP_SET 0 #define OP_AND 1 -#define OP_OR 2 -#define OP_MAX 3 -#define OP_MIN 4 static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1945,13 +1942,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, switch(op) { case OP_SET: *valp = val; break; case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - case OP_MAX: if(*valp < val) - *valp = val; - break; - case OP_MIN: if(*valp > val) - *valp = val; - break; } } else { int val = *valp; -- cgit v1.2.3 From 1f29bcd739972f71f2fd5d5d265daf3e1208fa5e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 10 Dec 2006 02:19:10 -0800 Subject: [PATCH] sysctl: remove unused "context" param Signed-off-by: Alexey Dobriyan Cc: Andi Kleen Cc: "David S. Miller" Cc: David Howells Cc: Ralf Baechle Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c86445a62af2..130c5ec9ee0b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -133,7 +133,7 @@ extern int max_lock_depth; #ifdef CONFIG_SYSCTL_SYSCALL static int parse_table(int __user *, int, void __user *, size_t __user *, - void __user *, size_t, ctl_table *, void **); + void __user *, size_t, ctl_table *); #endif static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, @@ -141,12 +141,12 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context); + void __user *newval, size_t newlen); #ifdef CONFIG_SYSVIPC static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context); + void __user *newval, size_t newlen); #endif #ifdef CONFIG_PROC_SYSCTL @@ -1243,7 +1243,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol do { struct ctl_table_header *head = list_entry(tmp, struct ctl_table_header, ctl_entry); - void *context = NULL; if (!use_table(head)) continue; @@ -1251,9 +1250,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol spin_unlock(&sysctl_lock); error = parse_table(name, nlen, oldval, oldlenp, - newval, newlen, head->ctl_table, - &context); - kfree(context); + newval, newlen, head->ctl_table); spin_lock(&sysctl_lock); unuse_table(head); @@ -1309,7 +1306,7 @@ static inline int ctl_perm(ctl_table *table, int op) static int parse_table(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, - ctl_table *table, void **context) + ctl_table *table) { int n; repeat: @@ -1329,7 +1326,7 @@ repeat: error = table->strategy( table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); if (error) return error; } @@ -1340,7 +1337,7 @@ repeat: } error = do_sysctl_strategy(table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); return error; } } @@ -1351,7 +1348,7 @@ repeat: int do_sysctl_strategy (ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { int op = 0, rc; size_t len; @@ -1365,7 +1362,7 @@ int do_sysctl_strategy (ctl_table *table, if (table->strategy) { rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); if (rc < 0) return rc; if (rc > 0) @@ -2473,7 +2470,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, /* The generic string strategy routine: */ int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (!table->data || !table->maxlen) return -ENOTDIR; @@ -2519,7 +2516,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, */ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (newval && newlen) { @@ -2555,7 +2552,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, /* Strategy function to convert jiffies to seconds */ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (oldval) { size_t olen; @@ -2583,7 +2580,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, /* Strategy function to convert jiffies to seconds */ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (oldval) { size_t olen; @@ -2612,7 +2609,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, /* The generic string strategy routine: */ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { struct ctl_table uts_table; int r, write; @@ -2620,7 +2617,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); r = sysctl_string(&uts_table, name, nlen, - oldval, oldlenp, newval, newlen, context); + oldval, oldlenp, newval, newlen); put_uts(table, write, uts_table.data); return r; } @@ -2629,7 +2626,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, /* The generic sysctl ipc data routine. */ static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { size_t len; void *data; @@ -2704,41 +2701,41 @@ out: int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } int sysctl_intvec(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } -- cgit v1.2.3 From 7c3ab7381e79dfc7db14a67c6f4f3285664e1ec2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 10 Dec 2006 02:19:19 -0800 Subject: [PATCH] io-accounting: core statistics The present per-task IO accounting isn't very useful. It simply counts the number of bytes passed into read() and write(). So if a process reads 1MB from an already-cached file, it is accused of having performed 1MB of I/O, which is wrong. (David Wright had some comments on the applicability of the present logical IO accounting: For billing purposes it is useless but for workload analysis it is very useful read_bytes/read_calls average read request size write_bytes/write_calls average write request size read_bytes/read_blocks ie logical/physical can indicate hit rate or thrashing write_bytes/write_blocks ie logical/physical guess since pdflush writes can be missed I often look for logical larger than physical to see filesystem cache problems. And the bytes/cpusec can help find applications that are dominating the cache and causing slow interactive response from page cache contention. I want to find the IO intensive applications and make sure they are doing efficient IO. Thus the acctcms(sysV) or csacms command would give the high IO commands). This patchset adds new accounting which tries to be more accurate. We account for three things: reads: attempt to count the number of bytes which this process really did cause to be fetched from the storage layer. Done at the submit_bio() level, so it is accurate for block-backed filesystems. I also attempt to wire up NFS and CIFS. writes: attempt to count the number of bytes which this process caused to be sent to the storage layer. This is done at page-dirtying time. The big inaccuracy here is truncate. If a process writes 1MB to a file and then deletes the file, it will in fact perform no writeout. But it will have been accounted as having caused 1MB of write. So... cancelled_writes: account the number of bytes which this process caused to not happen, by truncating pagecache. We _could_ just subtract this from the process's `write' accounting. But that means that some processes would be reported to have done negative amounts of write IO, which is silly. So we just report the raw number and punt this decision up to userspace. Now, we _could_ account for writes at the physical I/O level. But - This would require that we track memory-dirtying tasks at the per-page level (would require a new pointer in struct page). - It would mean that IO statistics for a process are usually only available long after that process has exitted. Which means that we probably cannot communicate this info via taskstats. This patch: Wire up the kernel-private data structures and the accessor functions to manipulate them. Cc: Jay Lan Cc: Shailabh Nagar Cc: Balbir Singh Cc: Chris Sturtivant Cc: Tony Ernst Cc: Guillaume Thouvenin Cc: David Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8c859eef8e6a..086e172d0d3d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -1055,6 +1056,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->wchar = 0; /* I/O counter: bytes written */ p->syscr = 0; /* I/O counter: read syscalls */ p->syscw = 0; /* I/O counter: write syscalls */ + task_io_accounting_init(p); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; -- cgit v1.2.3 From 4a7864ca638e0a38307962ee8ef122822a351b65 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 10 Dec 2006 02:19:53 -0800 Subject: [PATCH] io-accounting: via taskstats Deliver IO accounting via taskstats. Cc: Jay Lan Cc: Shailabh Nagar Cc: Balbir Singh Cc: Chris Sturtivant Cc: Tony Ernst Cc: Guillaume Thouvenin Cc: David Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 96f77013d3f0..baacc3691415 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -96,6 +96,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->write_char = p->wchar; stats->read_syscalls = p->syscr; stats->write_syscalls = p->syscw; +#ifdef CONFIG_TASK_IO_ACCOUNTING + stats->read_bytes = p->ioac.read_bytes; + stats->write_bytes = p->ioac.write_bytes; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; +#else + stats->read_bytes = 0; + stats->write_bytes = 0; + stats->cancelled_write_bytes = 0; +#endif } #undef KB #undef MB -- cgit v1.2.3 From cc2a73b5caf065f8612fcb5df5bd2f5e25881d99 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sun, 10 Dec 2006 02:20:00 -0800 Subject: [PATCH] sched.c: correct comment for this_rq_lock() Acked-by: Ingo Molnar Signed-off-by: Robert P. J. Day Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f385eff4682d..479199ab14ab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -547,7 +547,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) #endif /* - * rq_lock - lock a given runqueue and disable interrupts. + * this_rq_lock - lock this runqueue and disable interrupts. */ static inline struct rq *this_rq_lock(void) __acquires(rq->lock) -- cgit v1.2.3 From 6711cab43ed5e60bf51e3dbbce6395e87d4e9805 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Sun, 10 Dec 2006 02:20:07 -0800 Subject: [PATCH] ched domain: move sched group allocations to percpu area Move the sched group allocations to percpu area. This will minimize cross node memory references and also cleans up the sched groups allocation for allnodes sched domain. Signed-off-by: Suresh Siddha Acked-by: Ingo Molnar Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 132 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 64 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 479199ab14ab..a08387b5f7fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5528,28 +5528,27 @@ static int __init isolated_cpu_setup(char *str) __setup ("isolcpus=", isolated_cpu_setup); /* - * init_sched_build_groups takes an array of groups, the cpumask we wish - * to span, and a pointer to a function which identifies what group a CPU - * belongs to. The return value of group_fn must be a valid index into the - * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we - * keep track of groups covered with a cpumask_t). + * init_sched_build_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS + * (due to the fact that we keep track of groups covered with a cpumask_t). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ static void -init_sched_build_groups(struct sched_group groups[], cpumask_t span, - const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map)) +init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, + int (*group_fn)(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; int i; for_each_cpu_mask(i, span) { - int group = group_fn(i, cpu_map); - struct sched_group *sg = &groups[group]; + struct sched_group *sg; + int group = group_fn(i, cpu_map, &sg); int j; if (cpu_isset(i, covered)) @@ -5559,7 +5558,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span, sg->cpu_power = 0; for_each_cpu_mask(j, span) { - if (group_fn(j, cpu_map) != group) + if (group_fn(j, cpu_map, NULL) != group) continue; cpu_set(j, covered); @@ -6135,10 +6134,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); -static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + if (sg) + *sg = &per_cpu(sched_group_cpus, cpu); return cpu; } #endif @@ -6148,39 +6150,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) */ #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); -static struct sched_group sched_group_core[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_core); #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) -static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + int group; cpumask_t mask = cpu_sibling_map[cpu]; cpus_and(mask, mask, *cpu_map); - return first_cpu(mask); + group = first_cpu(mask); + if (sg) + *sg = &per_cpu(sched_group_core, group); + return group; } #elif defined(CONFIG_SCHED_MC) -static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + if (sg) + *sg = &per_cpu(sched_group_core, cpu); return cpu; } #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_phys); -static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + int group; #ifdef CONFIG_SCHED_MC cpumask_t mask = cpu_coregroup_map(cpu); cpus_and(mask, mask, *cpu_map); - return first_cpu(mask); + group = first_cpu(mask); #elif defined(CONFIG_SCHED_SMT) cpumask_t mask = cpu_sibling_map[cpu]; cpus_and(mask, mask, *cpu_map); - return first_cpu(mask); + group = first_cpu(mask); #else - return cpu; + group = cpu; #endif + if (sg) + *sg = &per_cpu(sched_group_phys, group); + return group; } #ifdef CONFIG_NUMA @@ -6193,12 +6208,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { - return cpu_to_node(cpu); + cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); + int group; + + cpus_and(nodemask, nodemask, *cpu_map); + group = first_cpu(nodemask); + + if (sg) + *sg = &per_cpu(sched_group_allnodes, group); + return group; } + static void init_numa_sched_groups_power(struct sched_group *group_head) { struct sched_group *sg = group_head; @@ -6234,16 +6259,9 @@ static void free_sched_groups(const cpumask_t *cpu_map) int cpu, i; for_each_cpu_mask(cpu, *cpu_map) { - struct sched_group *sched_group_allnodes - = sched_group_allnodes_bycpu[cpu]; struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; - if (sched_group_allnodes) { - kfree(sched_group_allnodes); - sched_group_allnodes_bycpu[cpu] = NULL; - } - if (!sched_group_nodes) continue; @@ -6337,7 +6355,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; - struct sched_group *sched_group_allnodes = NULL; + int sd_allnodes = 0; /* * Allocate the per-node list of sched groups @@ -6355,7 +6373,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) * Set up domains for cpus specified by the cpu_map. */ for_each_cpu_mask(i, *cpu_map) { - int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); @@ -6364,26 +6381,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { - if (!sched_group_allnodes) { - sched_group_allnodes - = kmalloc_node(sizeof(struct sched_group) - * MAX_NUMNODES, - GFP_KERNEL, - cpu_to_node(i)); - if (!sched_group_allnodes) { - printk(KERN_WARNING - "Can not alloc allnodes sched group\n"); - goto error; - } - sched_group_allnodes_bycpu[i] - = sched_group_allnodes; - } sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; - group = cpu_to_allnodes_group(i, cpu_map); - sd->groups = &sched_group_allnodes[group]; + cpu_to_allnodes_group(i, cpu_map, &sd->groups); p = sd; + sd_allnodes = 1; } else p = NULL; @@ -6398,36 +6401,33 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); - group = cpu_to_phys_group(i, cpu_map); *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; if (p) p->child = sd; - sd->groups = &sched_group_phys[group]; + cpu_to_phys_group(i, cpu_map, &sd->groups); #ifdef CONFIG_SCHED_MC p = sd; sd = &per_cpu(core_domains, i); - group = cpu_to_core_group(i, cpu_map); *sd = SD_MC_INIT; sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - sd->groups = &sched_group_core[group]; + cpu_to_core_group(i, cpu_map, &sd->groups); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - group = cpu_to_cpu_group(i, cpu_map); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - sd->groups = &sched_group_cpus[group]; + cpu_to_cpu_group(i, cpu_map, &sd->groups); #endif } @@ -6439,8 +6439,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (i != first_cpu(this_sibling_map)) continue; - init_sched_build_groups(sched_group_cpus, this_sibling_map, - cpu_map, &cpu_to_cpu_group); + init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); } #endif @@ -6451,8 +6450,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(this_core_map, this_core_map, *cpu_map); if (i != first_cpu(this_core_map)) continue; - init_sched_build_groups(sched_group_core, this_core_map, - cpu_map, &cpu_to_core_group); + init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); } #endif @@ -6465,15 +6463,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (cpus_empty(nodemask)) continue; - init_sched_build_groups(sched_group_phys, nodemask, - cpu_map, &cpu_to_phys_group); + init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sched_group_allnodes) - init_sched_build_groups(sched_group_allnodes, *cpu_map, - cpu_map, &cpu_to_allnodes_group); + if (sd_allnodes) + init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -6565,10 +6561,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) for (i = 0; i < MAX_NUMNODES; i++) init_numa_sched_groups_power(sched_group_nodes[i]); - if (sched_group_allnodes) { - int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); - struct sched_group *sg = &sched_group_allnodes[group]; + if (sd_allnodes) { + struct sched_group *sg; + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); init_numa_sched_groups_power(sg); } #endif -- cgit v1.2.3 From 054b9108e01ef27e2e6b32b4226abb6024626f06 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Sun, 10 Dec 2006 02:20:11 -0800 Subject: [PATCH] move_task_off_dead_cpu() should be called with disabled ints move_task_off_dead_cpu() requires interrupts to be disabled, while migrate_dead() calls it with enabled interrupts. Added appropriate comments to functions and added BUG_ON(!irqs_disabled()) into double_rq_lock() and double_lock_balance() which are the origin sources of such bugs. Signed-off-by: Kirill Korotaev Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a08387b5f7fa..f04add905bdf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1952,6 +1952,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { + BUG_ON(!irqs_disabled()); if (rq1 == rq2) { spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ @@ -1991,6 +1992,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); @@ -5067,7 +5073,10 @@ wait_to_die: } #ifdef CONFIG_HOTPLUG_CPU -/* Figure out where task on dead CPU should go, use force if neccessary. */ +/* + * Figure out where task on dead CPU should go, use force if neccessary. + * NOTE: interrupts should be disabled by the caller + */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { unsigned long flags; @@ -5187,6 +5196,7 @@ void idle_task_exit(void) mmdrop(mm); } +/* called under rq->lock with disabled interrupts */ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) { struct rq *rq = cpu_rq(dead_cpu); @@ -5203,10 +5213,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) * Drop lock around migration; if someone else moves it, * that's OK. No task can be added to this CPU, so iteration is * fine. + * NOTE: interrupts should be left disabled --dev@ */ - spin_unlock_irq(&rq->lock); + spin_unlock(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); + spin_lock(&rq->lock); put_task_struct(p); } -- cgit v1.2.3 From 571f6d2fb0b1c04798df783db2ba85e96bcce43d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 10 Dec 2006 02:20:13 -0800 Subject: [PATCH] sched: avoid taking rq lock in wake_priority_sleeper Avoid taking the request queue lock in wake_priority_sleeper if there are no running processes. Signed-off-by: Christoph Lameter Cc: Peter Williams Cc: Nick Piggin Cc: Christoph Lameter Cc: "Siddha, Suresh B" Cc: "Chen, Kenneth W" Cc: KAMEZAWA Hiroyuki Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f04add905bdf..fdd26fffaa20 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2915,6 +2915,9 @@ static inline int wake_priority_sleeper(struct rq *rq) int ret = 0; #ifdef CONFIG_SCHED_SMT + if (!rq->nr_running) + return 0; + spin_lock(&rq->lock); /* * If an SMT sibling task has been put to sleep for priority -- cgit v1.2.3 From 4211a9a2e94a34df8c02bc39b7ec10678ad5c2ab Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 10 Dec 2006 02:20:19 -0800 Subject: [PATCH] sched: remove staggering of load balancing Timer interrupts already are staggered. We do not need an additional layer of time staggering for short load balancing actions that take a reasonably small portion of the time slice. For load balancing on large sched_domains we will add a serialization later that avoids concurrent load balance operations and thus has the same effect as load staggering. Signed-off-by: Christoph Lameter Cc: Peter Williams Cc: Nick Piggin Cc: Christoph Lameter Cc: "Siddha, Suresh B" Cc: "Chen, Kenneth W" Acked-by: Ingo Molnar Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index fdd26fffaa20..b5b350135002 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2841,16 +2841,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) * Balancing parameters are set up in arch_init_sched_domains. */ -/* Don't have all balancing operations going off at once: */ -static inline unsigned long cpu_offset(int cpu) -{ - return jiffies + cpu * HZ / NR_CPUS; -} - static void rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) { - unsigned long this_load, interval, j = cpu_offset(this_cpu); + unsigned long this_load, interval; struct sched_domain *sd; int i, scale; @@ -2885,7 +2879,7 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) if (unlikely(!interval)) interval = 1; - if (j - sd->last_balance >= interval) { + if (jiffies - sd->last_balance >= interval) { if (load_balance(this_cpu, this_rq, sd, idle)) { /* * We've pulled tasks over so either we're no -- cgit v1.2.3 From fe2eea3fafb3df2f5b8a55a48bcbb0d23b3b5618 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 10 Dec 2006 02:20:21 -0800 Subject: [PATCH] sched: disable interrupts for locking in load_balance() Interrupts must be disabled for request queue locks if we want to run load_balance() with interrupts enabled. Signed-off-by: Christoph Lameter Cc: Peter Williams Cc: Nick Piggin Cc: Christoph Lameter Cc: "Siddha, Suresh B" Cc: "Chen, Kenneth W" Acked-by: Ingo Molnar Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b5b350135002..d327511d268e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2546,8 +2546,6 @@ static inline unsigned long minus_1_or_zero(unsigned long n) /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. - * - * Called with this_rq unlocked. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum idle_type idle) @@ -2557,6 +2555,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; cpumask_t cpus = CPU_MASK_ALL; + unsigned long flags; /* * When power savings policy is enabled for the parent domain, idle @@ -2596,11 +2595,13 @@ redo: * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ + local_irq_save(flags); double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); + local_irq_restore(flags); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { @@ -2617,13 +2618,13 @@ redo: if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - spin_lock(&busiest->lock); + spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; } @@ -2633,7 +2634,7 @@ redo: busiest->push_cpu = this_cpu; active_balance = 1; } - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); -- cgit v1.2.3 From 7835b98bc6de2ca10afa45572d272304b000b048 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 10 Dec 2006 02:20:22 -0800 Subject: [PATCH] sched: extract load calculation from rebalance_tick A load calculation is always done in rebalance_tick() in addition to the real load balancing activities that only take place when certain jiffie counts have been reached. Move that processing into a separate function and call it directly from scheduler_tick(). Also extract the time slice handling from scheduler_tick and put it into a separate function. Then we can clean up scheduler_tick significantly. It will no longer have any gotos. Signed-off-by: Christoph Lameter Cc: Peter Williams Cc: Nick Piggin Cc: Christoph Lameter Cc: "Siddha, Suresh B" Cc: "Chen, Kenneth W" Acked-by: Ingo Molnar Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 96 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d327511d268e..3c1b74aa1b07 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2833,20 +2833,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) spin_unlock(&target_rq->lock); } -/* - * rebalance_tick will get called every timer tick, on every CPU. - * - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. - */ - -static void -rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) +static void update_load(struct rq *this_rq) { - unsigned long this_load, interval; - struct sched_domain *sd; + unsigned long this_load; int i, scale; this_load = this_rq->raw_weighted_load; @@ -2866,6 +2855,22 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; } +} + +/* + * rebalance_tick will get called every timer tick, on every CPU. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ + +static void +rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) +{ + unsigned long interval; + struct sched_domain *sd; for_each_domain(this_cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -2897,12 +2902,15 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) +static inline void rebalance_tick(int cpu, struct rq *rq) { } static inline void idle_balance(int cpu, struct rq *rq) { } +static inline void update_load(struct rq *this_rq) +{ +} #endif static inline int wake_priority_sleeper(struct rq *rq) @@ -3052,35 +3060,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cpustat->steal = cputime64_add(cpustat->steal, tmp); } -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. - */ -void scheduler_tick(void) +static void task_running_tick(struct rq *rq, struct task_struct *p) { - unsigned long long now = sched_clock(); - struct task_struct *p = current; - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - update_cpu_clock(p, rq, now); - - rq->timestamp_last_tick = now; - - if (p == rq->idle) { - if (wake_priority_sleeper(rq)) - goto out; - rebalance_tick(cpu, rq, SCHED_IDLE); - return; - } - - /* Task might have expired already, but not scheduled off yet */ if (p->array != rq->active) { + /* Task has expired but was not scheduled yet */ set_tsk_need_resched(p); - goto out; + return; } spin_lock(&rq->lock); /* @@ -3148,8 +3133,35 @@ void scheduler_tick(void) } out_unlock: spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +void scheduler_tick(void) +{ + unsigned long long now = sched_clock(); + struct task_struct *p = current; + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + enum idle_type idle = NOT_IDLE; + + update_cpu_clock(p, rq, now); + + rq->timestamp_last_tick = now; + + if (p == rq->idle) { + /* Task on the idle queue */ + if (!wake_priority_sleeper(rq)) + idle = SCHED_IDLE; + } else + task_running_tick(rq, p); + update_load(rq); + rebalance_tick(cpu, rq, idle); } #ifdef CONFIG_SCHED_SMT -- cgit v1.2.3 From e418e1c2bf1a253916b569370653414eb28597b6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 10 Dec 2006 02:20:23 -0800 Subject: [PATCH] sched: move idle status calculation into rebalance_tick() Perform the idle state determination in rebalance_tick. If we separate balancing from sched_tick then we also need to determine the idle state in rebalance_tick. V2->V3 Remove useless idlle != 0 check. Checking nr_running seems to be sufficient. Thanks Suresh. Signed-off-by: Christoph Lameter Cc: Peter Williams Cc: Nick Piggin Cc: Christoph Lameter Cc: "Siddha, Suresh B" Cc: "Chen, Kenneth W" Acked-by: Ingo Molnar Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3c1b74aa1b07..14a8d9050cd4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2867,10 +2867,16 @@ static void update_load(struct rq *this_rq) */ static void -rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) +rebalance_tick(int this_cpu, struct rq *this_rq) { unsigned long interval; struct sched_domain *sd; + /* + * We are idle if there are no processes running. This + * is valid even if we are the idle process (SMT). + */ + enum idle_type idle = !this_rq->nr_running ? + SCHED_IDLE : NOT_IDLE; for_each_domain(this_cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -2902,37 +2908,26 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, struct rq *rq) -{ -} static inline void idle_balance(int cpu, struct rq *rq) { } -static inline void update_load(struct rq *this_rq) -{ -} #endif -static inline int wake_priority_sleeper(struct rq *rq) +static inline void wake_priority_sleeper(struct rq *rq) { - int ret = 0; - #ifdef CONFIG_SCHED_SMT if (!rq->nr_running) - return 0; + return; spin_lock(&rq->lock); /* * If an SMT sibling task has been put to sleep for priority * reasons reschedule the idle task to see if it can now run. */ - if (rq->nr_running) { + if (rq->nr_running) resched_task(rq->idle); - ret = 1; - } spin_unlock(&rq->lock); #endif - return ret; } DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -3148,20 +3143,20 @@ void scheduler_tick(void) struct task_struct *p = current; int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - enum idle_type idle = NOT_IDLE; update_cpu_clock(p, rq, now); rq->timestamp_last_tick = now; - if (p == rq->idle) { + if (p == rq->idle) /* Task on the idle queue */ - if (!wake_priority_sleeper(rq)) - idle = SCHED_IDLE; - } else + wake_priority_sleeper(rq); + else task_running_tick(rq, p); +#ifdef CONFIG_SMP update_load(rq); - rebalance_tick(cpu, rq, idle); + rebalance_tick(cpu, rq); +#endif } #ifdef CONFIG_SCHED_SMT -- cgit v1.2.3 From c9819f4593e8d052b41a89f47140f5c5e7e30582 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 10 Dec 2006 02:20:25 -0800 Subject: [PATCH] sched: use softirq for load balancing Call rebalance_tick (renamed to run_rebalance_domains) from a newly introduced softirq. We calculate the earliest time for each layer of sched domains to be rescanned (this is the rescan time for idle) and use the earliest of those to schedule the softirq via a new field "next_balance" added to struct rq. Signed-off-by: Christoph Lameter Cc: Peter Williams Cc: Nick Piggin Cc: Christoph Lameter Cc: "Siddha, Suresh B" Cc: "Chen, Kenneth W" Acked-by: Ingo Molnar Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 14a8d9050cd4..0a3e748d737d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -227,6 +227,7 @@ struct rq { unsigned long expired_timestamp; unsigned long long timestamp_last_tick; struct task_struct *curr, *idle; + unsigned long next_balance; struct mm_struct *prev_mm; struct prio_array *active, *expired, arrays[2]; int best_expired_prio; @@ -2858,7 +2859,7 @@ static void update_load(struct rq *this_rq) } /* - * rebalance_tick will get called every timer tick, on every CPU. + * run_rebalance_domains is triggered when needed from the scheduler tick. * * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -2866,9 +2867,10 @@ static void update_load(struct rq *this_rq) * Balancing parameters are set up in arch_init_sched_domains. */ -static void -rebalance_tick(int this_cpu, struct rq *this_rq) +static void run_rebalance_domains(struct softirq_action *h) { + int this_cpu = smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); unsigned long interval; struct sched_domain *sd; /* @@ -2877,6 +2879,8 @@ rebalance_tick(int this_cpu, struct rq *this_rq) */ enum idle_type idle = !this_rq->nr_running ? SCHED_IDLE : NOT_IDLE; + /* Earliest time when we have to call run_rebalance_domains again */ + unsigned long next_balance = jiffies + 60*HZ; for_each_domain(this_cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -2891,7 +2895,7 @@ rebalance_tick(int this_cpu, struct rq *this_rq) if (unlikely(!interval)) interval = 1; - if (jiffies - sd->last_balance >= interval) { + if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(this_cpu, this_rq, sd, idle)) { /* * We've pulled tasks over so either we're no @@ -2902,7 +2906,10 @@ rebalance_tick(int this_cpu, struct rq *this_rq) } sd->last_balance += interval; } + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; } + this_rq->next_balance = next_balance; } #else /* @@ -3155,7 +3162,8 @@ void scheduler_tick(void) task_running_tick(rq, p); #ifdef CONFIG_SMP update_load(rq); - rebalance_tick(cpu, rq); + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); #endif } @@ -6859,6 +6867,10 @@ void __init sched_init(void) set_load_weight(&init_task); +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); +#endif + #ifdef CONFIG_RT_MUTEXES plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); #endif -- cgit v1.2.3 From 1bd77f2da58e9cdd1f159217887343dadd9af417 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 10 Dec 2006 02:20:27 -0800 Subject: [PATCH] sched: call tasklet less frequently Trigger softirq less frequently We trigger the softirq before this patch using offset of sd->interval. However, if the queue is busy then it is sufficient to schedule the softirq with sd->interval * busy_factor. So we modify the calculation of the next time to balance by taking the interval added to last_balance again. This is only the right value if the idle/busy situation continues as is. There are two potential trouble spots: - If the queue was idle and now gets busy then we call rebalance early. However, that is not a problem because we will then use the longer interval for the next period. - If the queue was busy and becomes idle then we potentially wait too long before rebalancing. However, when the task goes idle then idle_balance is called. We add another calculation of the next balance time based on sd->interval in idle_balance so that we will rebalance soon. V2->V3: - Calculate rebalance time based on current jiffies and not based on the jiffies at the last time we load balanced. We no longer rely on staggering and therefore we can affort to do this now. V3->V4: - Use functions to do jiffy comparisons. Signed-off-by: Christoph Lameter Cc: Peter Williams Cc: Nick Piggin Cc: Christoph Lameter Cc: "Siddha, Suresh B" Cc: "Chen, Kenneth W" Acked-by: Ingo Molnar Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0a3e748d737d..0a4a26b21f69 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2774,14 +2774,28 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + 60 * HZ; for_each_domain(this_cpu, sd) { if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ - if (load_balance_newidle(this_cpu, this_rq, sd)) + pulled_task = load_balance_newidle(this_cpu, + this_rq, sd); + if (time_after(next_balance, + sd->last_balance + sd->balance_interval)) + next_balance = sd->last_balance + + sd->balance_interval; + if (pulled_task) break; } } + if (!pulled_task) + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. + */ + this_rq->next_balance = next_balance; } /* @@ -2904,7 +2918,7 @@ static void run_rebalance_domains(struct softirq_action *h) */ idle = NOT_IDLE; } - sd->last_balance += interval; + sd->last_balance = jiffies; } if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; -- cgit v1.2.3 From 08c183f31bdbb709f177f6d3110d5f288ea33933 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 10 Dec 2006 02:20:29 -0800 Subject: [PATCH] sched: add option to serialize load balancing Large sched domains can be very expensive to scan. Add an option SD_SERIALIZE to the sched domain flags. If that flag is set then we make sure that no other such domain is being balanced. [akpm@osdl.org: build fix] Signed-off-by: Christoph Lameter Cc: Peter Williams Cc: Nick Piggin Cc: Christoph Lameter Cc: "Siddha, Suresh B" Cc: "Chen, Kenneth W" Acked-by: Ingo Molnar Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0a4a26b21f69..2b2b780939c9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2880,6 +2880,7 @@ static void update_load(struct rq *this_rq) * * Balancing parameters are set up in arch_init_sched_domains. */ +static DEFINE_SPINLOCK(balancing); static void run_rebalance_domains(struct softirq_action *h) { @@ -2909,6 +2910,11 @@ static void run_rebalance_domains(struct softirq_action *h) if (unlikely(!interval)) interval = 1; + if (sd->flags & SD_SERIALIZE) { + if (!spin_trylock(&balancing)) + goto out; + } + if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(this_cpu, this_rq, sd, idle)) { /* @@ -2920,6 +2926,9 @@ static void run_rebalance_domains(struct softirq_action *h) } sd->last_balance = jiffies; } + if (sd->flags & SD_SERIALIZE) + spin_unlock(&balancing); +out: if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; } -- cgit v1.2.3 From b18ec80396834497933d77b81ec0918519f4e2a7 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 10 Dec 2006 02:20:31 -0800 Subject: [PATCH] sched: improve migration accuracy Co-opt rq->timestamp_last_tick to maintain a cache_hot_time evaluation reference timestamp at both tick and sched times to prevent said reference, formerly rq->timestamp_last_tick, from being behind task->last_ran at evaluation time, and to move said reference closer to current time on the remote processor, intent being to improve cache hot evaluation and timestamp adjustment accuracy for task migration. Fix minor sched_time double accounting error which occurs when a task passing through schedule() does not schedule off, and takes the next timer tick. [kenneth.w.chen@intel.com: cleanup] Signed-off-by: Mike Galbraith Acked-by: Ingo Molnar Acked-by: Ken Chen Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2b2b780939c9..15ce772a471a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -225,7 +225,8 @@ struct rq { unsigned long nr_uninterruptible; unsigned long expired_timestamp; - unsigned long long timestamp_last_tick; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; @@ -944,8 +945,8 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) if (!local) { /* Compensate for drifting sched_clock */ struct rq *this_rq = this_rq(); - now = (now - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + now = (now - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; } #endif @@ -1689,8 +1690,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case. */ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -2068,8 +2069,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) - + this_rq->timestamp_last_tick; + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. @@ -2105,10 +2106,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - if (sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->most_recent_timestamp, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif return 1; + } - if (task_hot(p, rq->timestamp_last_tick, sd)) + if (task_hot(p, rq->most_recent_timestamp, sd)) return 0; return 1; } @@ -2206,11 +2212,6 @@ skip_queue: goto skip_bitmap; } -#ifdef CONFIG_SCHEDSTATS - if (task_hot(tmp, busiest->timestamp_last_tick, sd)) - schedstat_inc(sd, lb_hot_gained[idle]); -#endif - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; rem_load_move -= tmp->load_weight; @@ -2971,7 +2972,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); static inline void update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { - p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - p->last_ran; + p->last_ran = rq->most_recent_timestamp = now; } /* @@ -2984,8 +2986,7 @@ unsigned long long current_sched_time(const struct task_struct *p) unsigned long flags; local_irq_save(flags); - ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); - ns = p->sched_time + sched_clock() - ns; + ns = p->sched_time + sched_clock() - p->last_ran; local_irq_restore(flags); return ns; @@ -3176,8 +3177,6 @@ void scheduler_tick(void) update_cpu_clock(p, rq, now); - rq->timestamp_last_tick = now; - if (p == rq->idle) /* Task on the idle queue */ wake_priority_sleeper(rq); @@ -5032,8 +5031,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * afterwards, and pretending it was a local activate. * This way is cleaner and logically correct. */ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; + p->timestamp = p->timestamp - rq_src->most_recent_timestamp + + rq_dest->most_recent_timestamp; deactivate_task(p, rq_src); __activate_task(p, rq_dest); if (TASK_PREEMPTS_CURR(p, rq_dest)) -- cgit v1.2.3 From 783609c6cb4eaa23f2ac5c968a44483584ec133f Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Sun, 10 Dec 2006 02:20:33 -0800 Subject: [PATCH] sched: decrease number of load balances Currently at a particular domain, each cpu in the sched group will do a load balance at the frequency of balance_interval. More the cores and threads, more the cpus will be in each sched group at SMP and NUMA domain. And we endup spending quite a bit of time doing load balancing in those domains. Fix this by making only one cpu(first idle cpu or first cpu in the group if all the cpus are busy) in the sched group do the load balance at that particular sched domain and this load will slowly percolate down to the other cpus with in that group(when they do load balancing at lower domains). Signed-off-by: Suresh Siddha Cc: Christoph Lameter Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 15ce772a471a..4e453431c61a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -428,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 12 +#define SCHEDSTAT_VERSION 13 static int show_schedstat(struct seq_file *seq, void *v) { @@ -466,7 +466,7 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %s", dcnt++, mask_str); for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu", sd->lb_cnt[itype], sd->lb_balanced[itype], sd->lb_failed[itype], @@ -474,7 +474,8 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_gained[itype], sd->lb_hot_gained[itype], sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); + sd->lb_nobusyg[itype], + sd->lb_stopbalance[itype]); } seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, @@ -2249,7 +2250,7 @@ out: static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum idle_type idle, int *sd_idle, - cpumask_t *cpus) + cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2278,10 +2279,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long load, group_capacity; int local_group; int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; local_group = cpu_isset(this_cpu, group->cpumask); + if (local_group) + balance_cpu = first_cpu(group->cpumask); + /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -2297,9 +2302,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, *sd_idle = 0; /* Bias balancing toward cpus of our domain */ - if (local_group) + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + load = target_load(i, load_idx); - else + } else load = source_load(i, load_idx); avg_load += load; @@ -2307,6 +2317,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_weighted_load += rq->raw_weighted_load; } + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. + */ + if (local_group && balance_cpu != this_cpu && balance) { + *balance = 0; + goto ret; + } + total_load += avg_load; total_pwr += group->cpu_power; @@ -2498,8 +2518,8 @@ out_balanced: *imbalance = min_load_per_task; return group_min; } -ret: #endif +ret: *imbalance = 0; return NULL; } @@ -2550,7 +2570,8 @@ static inline unsigned long minus_1_or_zero(unsigned long n) * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, + int *balance) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@ -2573,7 +2594,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus); + &cpus, balance); + + if (*balance == 0) { + schedstat_inc(sd, lb_stopbalance[idle]); + goto out_balanced; + } + if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -2715,7 +2742,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, - &sd_idle, &cpus); + &sd_idle, &cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; @@ -2885,7 +2912,7 @@ static DEFINE_SPINLOCK(balancing); static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); + int this_cpu = smp_processor_id(), balance = 1; struct rq *this_rq = cpu_rq(this_cpu); unsigned long interval; struct sched_domain *sd; @@ -2917,7 +2944,7 @@ static void run_rebalance_domains(struct softirq_action *h) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(this_cpu, this_rq, sd, idle)) { + if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -2932,6 +2959,14 @@ static void run_rebalance_domains(struct softirq_action *h) out: if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; } this_rq->next_balance = next_balance; } -- cgit v1.2.3 From 06066714f6016cffcb249f6ab21b7919de1bc859 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Sun, 10 Dec 2006 02:20:35 -0800 Subject: [PATCH] sched: remove lb_stopbalance counter Remove scheduler stats lb_stopbalance counter. This counter can be calculated by: lb_balanced - lb_nobusyg - lb_nobusyq. There is no need to create gazillion counters while we can derive the value. Signed-off-by: Ken Chen Signed-off-by: Suresh Siddha Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 4e453431c61a..66e44b5b53d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -428,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 13 +#define SCHEDSTAT_VERSION 14 static int show_schedstat(struct seq_file *seq, void *v) { @@ -466,7 +466,7 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %s", dcnt++, mask_str); for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", sd->lb_cnt[itype], sd->lb_balanced[itype], sd->lb_failed[itype], @@ -474,8 +474,7 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_gained[itype], sd->lb_hot_gained[itype], sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype], - sd->lb_stopbalance[itype]); + sd->lb_nobusyg[itype]); } seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, @@ -2596,10 +2595,8 @@ redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, &cpus, balance); - if (*balance == 0) { - schedstat_inc(sd, lb_stopbalance[idle]); + if (*balance == 0) goto out_balanced; - } if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); -- cgit v1.2.3 From 62ab616d54371a65f595c199aad1e1755b837d25 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Sun, 10 Dec 2006 02:20:36 -0800 Subject: [PATCH] sched: optimize activate_task for RT task RT task does not participate in interactiveness priority and thus shouldn't be bothered with timestamp and p->sleep_type manipulation when task is being put on run queue. Bypass all of the them with a single if (rt_task) test. Signed-off-by: Ken Chen Acked-by: Ingo Molnar Cc: Nick Piggin Cc: "Siddha, Suresh B" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 66e44b5b53d2..48e35c916326 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -940,6 +940,9 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) { unsigned long long now; + if (rt_task(p)) + goto out; + now = sched_clock(); #ifdef CONFIG_SMP if (!local) { @@ -961,8 +964,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) (now - p->timestamp) >> 20); } - if (!rt_task(p)) - p->prio = recalc_task_prio(p, now); + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -987,7 +989,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) } } p->timestamp = now; - +out: __activate_task(p, rq); } -- cgit v1.2.3 From 33859f7f9788da2ac9aa23be4dc8e948112809ca Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Sandonis Date: Sun, 10 Dec 2006 02:20:38 -0800 Subject: [PATCH] kernel/sched.c: whitespace cleanups [akpm@osdl.org: additional cleanups] Signed-off-by: Miguel Ojeda Sandonis Acked-by: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 95 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 48e35c916326..8a0afb97af71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -466,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %s", dcnt++, mask_str); for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " + "%lu", sd->lb_cnt[itype], sd->lb_balanced[itype], sd->lb_failed[itype], @@ -476,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" + " %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); } preempt_enable(); #endif @@ -1454,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_AFFINE) { unsigned long tl = this_load; - unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); + unsigned long tl_per_task; + + tl_per_task = cpu_avg_load_per_task(this_cpu); /* * If sync wakeup then subtract the (maximum possible) @@ -2487,18 +2492,21 @@ small_imbalance: pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + busiest->cpu_power; if (max_load > tmp) pwr_move += busiest->cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load*busiest->cpu_power < - busiest_load_per_task*SCHED_LOAD_SCALE) - tmp = max_load*busiest->cpu_power/this->cpu_power; + if (max_load * busiest->cpu_power < + busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = max_load * busiest->cpu_power / this->cpu_power; else - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; - pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + this->cpu_power; + pwr_move += this->cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -3366,7 +3374,8 @@ void fastcall add_preempt_count(int val) /* * Spinlock count overflowing soon? */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); } EXPORT_SYMBOL(add_preempt_count); @@ -5439,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); break; } printk("span %s\n", str); if (!cpu_isset(cpu, sd->span)) - printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); if (!cpu_isset(cpu, group->cpumask)) - printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); printk(KERN_DEBUG); for (i = 0; i < level + 2; i++) @@ -5463,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!group->cpu_power) { printk("\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); } if (!cpus_weight(group->cpumask)) { @@ -5486,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk("\n"); if (!cpus_equal(sd->span, groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + printk(KERN_ERR "ERROR: groups don't span " + "domain->span\n"); level++; sd = sd->parent; + if (!sd) + continue; - if (sd) { - if (!cpus_subset(groupmask, sd->span)) - printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); - } + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); } while (sd); } @@ -5812,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size); */ static void touch_cache(void *__cache, unsigned long __size) { - unsigned long size = __size/sizeof(long), chunk1 = size/3, - chunk2 = 2*size/3; + unsigned long size = __size / sizeof(long); + unsigned long chunk1 = size / 3; + unsigned long chunk2 = 2 * size / 3; unsigned long *cache = __cache; int i; @@ -5922,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) */ measure_one(cache, size, cpu1, cpu2); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); + cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); measure_one(cache, size, cpu2, cpu1); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); + cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); /* * (We measure the non-migrating [cached] cost on both @@ -5936,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) measure_one(cache, size, cpu1, cpu1); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); + cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); measure_one(cache, size, cpu2, cpu2); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); + cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); /* * Get the per-iteration migration cost: */ - do_div(cost1, 2*ITERATIONS); - do_div(cost2, 2*ITERATIONS); + do_div(cost1, 2 * ITERATIONS); + do_div(cost2, 2 * ITERATIONS); return cost1 - cost2; } @@ -5984,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) */ cache = vmalloc(max_size); if (!cache) { - printk("could not vmalloc %d bytes for cache!\n", 2*max_size); + printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); return 1000000; /* return 1 msec on very small boxen */ } @@ -6009,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) avg_fluct = (avg_fluct + fluct)/2; if (migration_debug) - printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " + "(%8Ld %8Ld)\n", cpu1, cpu2, size, (long)cost / 1000000, ((long)cost / 100000) % 10, @@ -6104,20 +6121,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) -1 #endif ); - if (system_state == SYSTEM_BOOTING) { - if (num_online_cpus() > 1) { - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); - } - printk("\n"); + if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); } + printk("\n"); } j1 = jiffies; if (migration_debug) - printk("migration: %ld seconds\n", (j1-j0)/HZ); + printk("migration: %ld seconds\n", (j1-j0) / HZ); /* * Move back to the original CPU. NUMA-Q gets confused -- cgit v1.2.3 From f3d19c90fb117a5f080310a4592929aa8e1ad8e9 Mon Sep 17 00:00:00 2001 From: Vadim Lobanov Date: Sun, 10 Dec 2006 02:21:09 -0800 Subject: [PATCH] fdtable: Delete pointless code in dup_fd() The dup_fd() function creates a new files_struct and fdtable embedded inside that files_struct, and then possibly expands the fdtable using expand_files(). The out_release error path is invoked when expand_files() returns an error code. However, when this attempt to expand fails, the fdtable is left in its original embedded form, so it is pointless to try to free the associated fdarray and fdsets. Signed-off-by: Vadim Lobanov Cc: Dipankar Sarma Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 086e172d0d3d..30eab4f063cd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -711,8 +711,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, + old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, + old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -745,14 +747,11 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } -out: return newf; out_release: - free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); - free_fdset (new_fdt->open_fds, new_fdt->max_fdset); - free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); +out: return NULL; } -- cgit v1.2.3 From bbea9f69668a3d0cf9feba15a724cd02896f8675 Mon Sep 17 00:00:00 2001 From: Vadim Lobanov Date: Sun, 10 Dec 2006 02:21:12 -0800 Subject: [PATCH] fdtable: Make fdarray and fdsets equal in size Currently, each fdtable supports three dynamically-sized arrays of data: the fdarray and two fdsets. The code allows the number of fds supported by the fdarray (fdtable->max_fds) to differ from the number of fds supported by each of the fdsets (fdtable->max_fdset). In practice, it is wasteful for these two sizes to differ: whenever we hit a limit on the smaller-capacity structure, we will reallocate the entire fdtable and all the dynamic arrays within it, so any delta in the memory used by the larger-capacity structure will never be touched at all. Rather than hogging this excess, we shouldn't even allocate it in the first place, and keep the capacities of the fdarray and the fdsets equal. This patch removes fdtable->max_fdset. As an added bonus, most of the supporting code becomes simpler. Signed-off-by: Vadim Lobanov Cc: Christoph Hellwig Cc: Al Viro Cc: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/fork.c | 24 ++++++------------------ 2 files changed, 7 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 03e64fe4a14a..5f77e76b4f97 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -425,7 +425,7 @@ static void close_files(struct files_struct * files) for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= fdt->max_fdset || i >= fdt->max_fds) + if (i >= fdt->max_fds) break; set = fdt->open_fds->fds_bits[j++]; while (set) { diff --git a/kernel/fork.c b/kernel/fork.c index 30eab4f063cd..aba595424f78 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -614,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) static int count_open_files(struct fdtable *fdt) { - int size = fdt->max_fdset; + int size = fdt->max_fds; int i; /* Find the last open fd */ @@ -641,7 +641,6 @@ static struct files_struct *alloc_files(void) newf->next_fd = 0; fdt = &newf->fdtab; fdt->max_fds = NR_OPEN_DEFAULT; - fdt->max_fdset = EMBEDDED_FD_SET_SIZE; fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; fdt->open_fds = (fd_set *)&newf->open_fds_init; fdt->fd = &newf->fd_array[0]; @@ -662,7 +661,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, size, i, expand; + int open_files, size, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -673,25 +672,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); new_fdt = files_fdtable(newf); - size = old_fdt->max_fdset; open_files = count_open_files(old_fdt); - expand = 0; /* - * Check whether we need to allocate a larger fd array or fd set. - * Note: we're not a clone task, so the open count won't change. + * Check whether we need to allocate a larger fd array and fd set. + * Note: we're not a clone task, so the open count won't change. */ - if (open_files > new_fdt->max_fdset) { - new_fdt->max_fdset = 0; - expand = 1; - } if (open_files > new_fdt->max_fds) { new_fdt->max_fds = 0; - expand = 1; - } - - /* if the old fdset gets grown now, we'll only copy up to "size" fds */ - if (expand) { spin_unlock(&oldf->file_lock); spin_lock(&newf->file_lock); *errorp = expand_files(newf, open_files-1); @@ -739,8 +727,8 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (new_fdt->max_fdset > open_files) { - int left = (new_fdt->max_fdset-open_files)/8; + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); memset(&new_fdt->open_fds->fds_bits[start], 0, left); -- cgit v1.2.3 From 4fd45812cbe875a620c86a096a5d46c742694b7e Mon Sep 17 00:00:00 2001 From: Vadim Lobanov Date: Sun, 10 Dec 2006 02:21:17 -0800 Subject: [PATCH] fdtable: Remove the free_files field An fdtable can either be embedded inside a files_struct or standalone (after being expanded). When an fdtable is being discarded after all RCU references to it have expired, we must either free it directly, in the standalone case, or free the files_struct it is contained within, in the embedded case. Currently the free_files field controls this behavior, but we can get rid of it entirely, as all the necessary information is already recorded. We can distinguish embedded and standalone fdtables using max_fds, and if it is embedded we can divine the relevant files_struct using container_of(). Signed-off-by: Vadim Lobanov Cc: Christoph Hellwig Cc: Al Viro Cc: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++---- kernel/fork.c | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 5f77e76b4f97..122fadb972fc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -466,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files) * you can free files immediately. */ fdt = files_fdtable(files); - if (fdt == &files->fdtab) - fdt->free_files = files; - else + if (fdt != &files->fdtab) kmem_cache_free(files_cachep, files); - free_fdtable(fdt); + call_rcu(&fdt->rcu, free_fdtable_rcu); } } diff --git a/kernel/fork.c b/kernel/fork.c index aba595424f78..d16c566eb645 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -645,7 +645,6 @@ static struct files_struct *alloc_files(void) fdt->open_fds = (fd_set *)&newf->open_fds_init; fdt->fd = &newf->fd_array[0]; INIT_RCU_HEAD(&fdt->rcu); - fdt->free_files = NULL; fdt->next = NULL; rcu_assign_pointer(newf->fdt, fdt); out: -- cgit v1.2.3 From 4c36a5dec25fb344ad76b11860da3a8b50bd1248 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 10 Dec 2006 02:21:24 -0800 Subject: [PATCH] round_jiffies infrastructure Introduce a round_jiffies() function as well as a round_jiffies_relative() function. These functions round a jiffies value to the next whole second. The primary purpose of this rounding is to cause all "we don't care exactly when" timers to happen at the same jiffy. This avoids multiple timers firing within the second for no real reason; with dynamic ticks these extra timers cause wakeups from deep sleep CPU sleep states and thus waste power. The exact wakeup moment is skewed by the cpu number, to avoid all cpus from waking up at the exact same time (and hitting the same lock/cachelines there) [akpm@osdl.org: fix variable type] Signed-off-by: Arjan van de Ven Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index c1c7fbcffec1..b1f40f256eb0 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + */ + if (rem < HZ/4) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + if (j <= jiffies) /* rounding ate our timeout entirely; */ + return original; + return j; +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + /* + * In theory the following code can skip a jiffy in case jiffies + * increments right between the addition and the later subtraction. + * However since the entire point of this function is to use approximate + * timeouts, it's entirely ok to not handle that. + */ + return __round_jiffies(j + jiffies, cpu) - jiffies; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return __round_jiffies(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + + static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { -- cgit v1.2.3 From 2b0137001de68153203dd3bc20e6d27eb7c9719c Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Sun, 10 Dec 2006 02:21:30 -0800 Subject: [PATCH] clocksource: add usage of CONFIG_SYSFS Simply adds some ifdefs to remove clocksoure sysfs code when CONFIG_SYSFS isn't turn on. Signed-off-by: Daniel Walker Acked-by: John Stultz Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 74eca5939bd9..3651b34cc355 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -186,6 +186,7 @@ void clocksource_reselect(void) } EXPORT_SYMBOL(clocksource_reselect); +#ifdef CONFIG_SYSFS /** * sysfs_show_current_clocksources - sysfs interface for current clocksource * @dev: unused @@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void) } device_initcall(init_clocksource_sysfs); +#endif /* CONFIG_SYSFS */ /** * boot_override_clocksource - boot clock override -- cgit v1.2.3 From f5f1a24a2caa299bb7d294aee92d7dd3410d9ed7 Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Sun, 10 Dec 2006 02:21:33 -0800 Subject: [PATCH] clocksource: small cleanup Mostly changing alignment. Just some general cleanup. [akpm@osdl.org: build fix] Signed-off-by: Daniel Walker Acked-by: John Stultz Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 6 +++--- kernel/timer.c | 16 +++++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 3651b34cc355..22504afc0d34 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c) /* check if clocksource is already registered */ if (is_registered_source(c)) { printk("register_clocksource: Cannot register %s. " - "Already registered!", c->name); + "Already registered!", c->name); ret = -EBUSY; } else { /* register it */ @@ -276,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) * Sysfs setup bits: */ static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, - sysfs_override_clocksource); + sysfs_override_clocksource); static SYSDEV_ATTR(available_clocksource, 0600, - sysfs_show_available_clocksources, NULL); + sysfs_show_available_clocksources, NULL); static struct sysdev_class clocksource_sysclass = { set_kset_name("clocksource"), diff --git a/kernel/timer.c b/kernel/timer.c index b1f40f256eb0..0256ab443d8a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -846,7 +846,7 @@ static int change_clocksource(void) clock = new; clock->cycle_last = now; printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); + clock->name); return 1; } else if (clock->update_callback) { return clock->update_callback(); @@ -854,7 +854,10 @@ static int change_clocksource(void) return 0; } #else -#define change_clocksource() (0) +static inline int change_clocksource(void) +{ + return 0; +} #endif /** @@ -952,7 +955,8 @@ device_initcall(timekeeping_init_device); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, + s64 *offset) { s64 tick_error, i; u32 look_ahead, adj; @@ -976,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 * * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error = current_tick_length() >> + (TICK_LENGTH_SHIFT - clock->shift + 1); tick_error -= clock->xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; @@ -1028,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) clock->mult += adj; clock->xtime_interval += interval; clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); + clock->error -= (interval - offset) << + (TICK_LENGTH_SHIFT - clock->shift); } /** -- cgit v1.2.3