From 8d2587970b8bdf7c8d9208e3f4bb93182aef1a0f Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Tue, 22 Mar 2011 16:30:13 -0700 Subject: cgroups: if you list_empty() a head then don't list_del() it list_del() leaves poison in the prev and next pointers. The next list_empty() will compare those poisons, and say the list isn't empty. Any list operations that assume the node is on a list because of such a check will be fooled into dereferencing poison. One needs to INIT the node after the del, and fortunately there's already a wrapper for that - list_del_init(). Some of the dels are followed by deallocations, so can be ignored, and one can be merged with an add to make a move. Apart from that, I erred on the side of caution in making nodes list_empty()-queriable. Signed-off-by: Phil Carmody Reviewed-by: Paul Menage Cc: Li Zefan Acked-by: Kirill A. Shutemov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95362d15128c..e31b220a743d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - list_del(&tsk->cg_list); - list_add(&tsk->cg_list, &newcg->tasks); - } + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); write_unlock(&css_set_lock); for_each_subsys(root, ss) { @@ -3655,12 +3653,12 @@ again: spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) - list_del(&cgrp->release_list); + list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ - list_del(&cgrp->sibling); + list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); d = dget(cgrp->dentry); @@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ - list_del(&ss->sibling); + list_del_init(&ss->sibling); /* * disentangle the css from all css_sets attached to the dummytop. as @@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) if (!list_empty(&tsk->cg_list)) { write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) - list_del(&tsk->cg_list); + list_del_init(&tsk->cg_list); write_unlock(&css_set_lock); } -- cgit v1.2.3 From 504f52b5439aaf26d3e2c1d45ec10fce38c8dd27 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:41 -0700 Subject: mm: NUMA aware alloc_task_struct_node() All kthreads being created from a single helper task, they all use memory from a single node for their kernel stack and task struct. This patch suite creates kthread_create_on_cpu(), adding a 'cpu' parameter to parameters already used by kthread_create(). This parameter serves in allocating memory for the new kthread on its memory node if available. Users of this new function are : ksoftirqd, kworker, migration, pktgend... This patch: Add a node parameter to alloc_task_struct(), and change its name to alloc_task_struct_node() This change is needed to allow NUMA aware kthread_create_on_cpu() Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 05b92c457010..cffbe8a4e1fc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -109,8 +109,10 @@ int nr_processes(void) } #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) -# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +# define alloc_task_struct_node(node) \ + kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) +# define free_task_struct(tsk) \ + kmem_cache_free(task_struct_cachep, (tsk)) static struct kmem_cache *task_struct_cachep; #endif @@ -249,12 +251,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - + int node = numa_node_id(); int err; prepare_to_copy(orig); - tsk = alloc_task_struct(); + tsk = alloc_task_struct_node(node); if (!tsk) return NULL; -- cgit v1.2.3 From b6a84016bd2598e35ead635147fa53619982648d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:42 -0700 Subject: mm: NUMA aware alloc_thread_info_node() Add a node parameter to alloc_thread_info(), and change its name to alloc_thread_info_node() This change is needed to allow NUMA aware kthread_create_on_cpu() Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index cffbe8a4e1fc..cbc6adc6e891 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -117,14 +117,17 @@ static struct kmem_cache *task_struct_cachep; #endif #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR -static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, + int node) { #ifdef CONFIG_DEBUG_STACK_USAGE gfp_t mask = GFP_KERNEL | __GFP_ZERO; #else gfp_t mask = GFP_KERNEL; #endif - return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) @@ -260,7 +263,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) if (!tsk) return NULL; - ti = alloc_thread_info(tsk); + ti = alloc_thread_info_node(tsk, node); if (!ti) { free_task_struct(tsk); return NULL; -- cgit v1.2.3 From 207205a2ba2655652fe46a60b49838af6c16a919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:44 -0700 Subject: kthread: NUMA aware kthread_create_on_node() All kthreads being created from a single helper task, they all use memory from a single node for their kernel stack and task struct. This patch suite creates kthread_create_on_node(), adding a 'cpu' parameter to parameters already used by kthread_create(). This parameter serves in allocating memory for the new kthread on its memory node if possible. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- kernel/kthread.c | 31 +++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index cbc6adc6e891..a8f64f8ec7e1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -254,7 +255,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - int node = numa_node_id(); + int node = tsk_fork_get_node(orig); int err; prepare_to_copy(orig); diff --git a/kernel/kthread.c b/kernel/kthread.c index c55afba990a3..684ab3f7dd72 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ struct kthread_create_info /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; + int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; @@ -98,10 +99,23 @@ static int kthread(void *_create) do_exit(ret); } +/* called from do_fork() to get node information for about to be created task */ +int tsk_fork_get_node(struct task_struct *tsk) +{ +#ifdef CONFIG_NUMA + if (tsk == kthreadd_task) + return tsk->pref_node_fork; +#endif + return numa_node_id(); +} + static void create_kthread(struct kthread_create_info *create) { int pid; +#ifdef CONFIG_NUMA + current->pref_node_fork = create->node; +#endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { @@ -111,15 +125,18 @@ static void create_kthread(struct kthread_create_info *create) } /** - * kthread_create - create a kthread. + * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. + * @node: memory node number. * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give -1. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a * standalone thread for which noone will call kthread_stop(), or @@ -129,15 +146,17 @@ static void create_kthread(struct kthread_create_info *create) * * Returns a task_struct or ERR_PTR(-ENOMEM). */ -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], - ...) +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, + int node, + const char namefmt[], + ...) { struct kthread_create_info create; create.threadfn = threadfn; create.data = data; + create.node = node; init_completion(&create.done); spin_lock(&kthread_create_lock); @@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } return create.result; } -EXPORT_SYMBOL(kthread_create); +EXPORT_SYMBOL(kthread_create_on_node); /** * kthread_bind - bind a just-created kthread to a cpu. -- cgit v1.2.3 From 94dcf29a11b3d20a28790598d701f98484a969da Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:45 -0700 Subject: kthread: use kthread_create_on_node() ksoftirqd, kworker, migration, and pktgend kthreads can be created with kthread_create_on_node(), to get proper NUMA affinities for their stack and task_struct. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Acked-by: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 5 ++++- kernel/stop_machine.c | 6 ++++-- kernel/workqueue.c | 6 ++++-- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 56e5dec837f0..735d87095172 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_create_on_node(run_ksoftirqd, + hcpu, + cpu_to_node(hotcpu), + "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return notifier_from_errno(PTR_ERR(p)); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2df820b03beb..e3516b29076c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: BUG_ON(stopper->thread || stopper->enabled || !list_empty(&stopper->works)); - p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", - cpu); + p = kthread_create_on_node(cpu_stopper_thread, + stopper, + cpu_to_node(cpu), + "migration/%d", cpu); if (IS_ERR(p)) return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ca7ce9ce754..04ef830690ec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1366,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) worker->id = id; if (!on_unbound_cpu) - worker->task = kthread_create(worker_thread, worker, - "kworker/%u:%d", gcwq->cpu, id); + worker->task = kthread_create_on_node(worker_thread, + worker, + cpu_to_node(gcwq->cpu), + "kworker/%u:%d", gcwq->cpu, id); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d", id); -- cgit v1.2.3 From d404ab0a1133e95557bb7deab2a49b348dfeba85 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 22 Mar 2011 16:34:04 -0700 Subject: move x86 specific oops=panic to generic code The oops=panic cmdline option is not x86 specific, move it to generic code. Update documentation. Signed-off-by: Olaf Hering Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87a1704..69231670eb95 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); -- cgit v1.2.3 From 34db18a054c600b6f81787165669dc572fe4de25 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 22 Mar 2011 16:34:06 -0700 Subject: smp: move smp setup functions to kernel/smp.c Move setup_nr_cpu_ids(), smp_init() and some other SMP boot parameter setup functions from init/main.c to kenrel/smp.c, saves some #ifdef CONFIG_SMP. Signed-off-by: WANG Cong Cc: Rakib Mullick Cc: David Howells Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Tejun Heo Cc: Arnd Bergmann Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 7cbd0f293df4..73a195193558 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -604,6 +604,87 @@ void ipi_call_unlock_irq(void) } #endif /* USE_GENERIC_SMP_HELPERS */ +/* Setup configured maximum number of CPUs to activate */ +unsigned int setup_max_cpus = NR_CPUS; +EXPORT_SYMBOL(setup_max_cpus); + + +/* + * Setup routine for controlling SMP activation + * + * Command-line option of "nosmp" or "maxcpus=0" will disable SMP + * activation entirely (the MPS table probe still happens, though). + * + * Command-line option of "maxcpus=", where is an integer + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to . + */ + +void __weak arch_disable_smp_support(void) { } + +static int __init nosmp(char *str) +{ + setup_max_cpus = 0; + arch_disable_smp_support(); + + return 0; +} + +early_param("nosmp", nosmp); + +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + +static int __init maxcpus(char *str) +{ + get_option(&str, &setup_max_cpus); + if (setup_max_cpus == 0) + arch_disable_smp_support(); + + return 0; +} + +early_param("maxcpus", maxcpus); + +/* Setup number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + +/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ +void __init setup_nr_cpu_ids(void) +{ + nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; +} + +/* Called by boot processor to activate the rest. */ +void __init smp_init(void) +{ + unsigned int cpu; + + /* FIXME: This should be done in userspace --RR */ + for_each_present_cpu(cpu) { + if (num_online_cpus() >= setup_max_cpus) + break; + if (!cpu_online(cpu)) + cpu_up(cpu); + } + + /* Any cleanup work */ + printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_cpus_done(setup_max_cpus); +} + /* * Call a function on all processors. May be used during early boot while * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead -- cgit v1.2.3 From 4d51985e484dd11d9047dfcd1278ec9ccfb435d5 Mon Sep 17 00:00:00 2001 From: Michael Rodriguez Date: Tue, 22 Mar 2011 16:34:07 -0700 Subject: kernel/cpu.c: fix many errors related to style. Change the printk() calls to have the KERN_INFO/KERN_ERROR stuff, and fixes other coding style errors. Not _all_ of them are gone, though. [akpm@linux-foundation.org: revert the bits I disagree with] Signed-off-by: Michael Rodriguez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 156cc5556140..c95fc4df0faa 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v) { BUG_ON(cpu_notify(val, v)); } - EXPORT_SYMBOL(register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) @@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); - return 0; } @@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { nr_calls--; @@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk("%s: attempt to bring up CPU %u failed\n", + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", __func__, cpu); goto out_notify; } @@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk("Enabling non-boot CPUs ...\n"); + printk(KERN_INFO "Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { - printk("CPU%d is up\n", cpu); + printk(KERN_INFO "CPU%d is up\n", cpu); continue; } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); @@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) */ /* cpu_bit_bitmap[0] is empty - so we can back into it */ -#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) +#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) -- cgit v1.2.3 From 9bfb23fc4a481650e60d22dbe84c0fd5a9d49bba Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Mar 2011 16:34:09 -0700 Subject: sys_unshare: remove the dead CLONE_THREAD/SIGHAND/VM code Cleanup: kill the dead code which does nothing but complicates the code and confuses the reader. sys_unshare(CLONE_THREAD/SIGHAND/VM) is not really implemented, and I doubt very much it will ever work. At least, nobody even tried since the original 99d1419d96d7df9cfa56 ("unshare system call -v5: system call handler function") was applied more than 4 years ago. And the code is not consistent. unshare_thread() always fails unconditionally, while unshare_sighand() and unshare_vm() pretend to work if there is nothing to unshare. Remove unshare_thread(), unshare_sighand(), unshare_vm() helpers and related variables and add a simple CLONE_THREAD | CLONE_SIGHAND| CLONE_VM check into check_unshare_flags(). Also, move the "CLONE_NEWNS needs CLONE_FS" check from check_unshare_flags() to sys_unshare(). This looks more consistent and matches the similar do_sysvsem check in sys_unshare(). Note: with or without this patch "atomic_read(mm->mm_users) > 1" can give a false positive due to get_task_mm(). Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Janak Desai Cc: Daniel Lezcano Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 123 ++++++++++++---------------------------------------------- 1 file changed, 25 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a8f64f8ec7e1..f2b494d7c557 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1519,38 +1519,24 @@ void __init proc_caches_init(void) } /* - * Check constraints on flags passed to the unshare system call and - * force unsharing of additional process context as appropriate. + * Check constraints on flags passed to the unshare system call. */ -static void check_unshare_flags(unsigned long *flags_ptr) +static int check_unshare_flags(unsigned long unshare_flags) { + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + return -EINVAL; /* - * If unsharing a thread from a thread group, must also - * unshare vm. - */ - if (*flags_ptr & CLONE_THREAD) - *flags_ptr |= CLONE_VM; - - /* - * If unsharing vm, must also unshare signal handlers. - */ - if (*flags_ptr & CLONE_VM) - *flags_ptr |= CLONE_SIGHAND; - - /* - * If unsharing namespace, must also unshare filesystem information. + * Not implemented, but pretend it works if there is nothing to + * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND + * needs to unshare vm. */ - if (*flags_ptr & CLONE_NEWNS) - *flags_ptr |= CLONE_FS; -} - -/* - * Unsharing of tasks created with CLONE_THREAD is not supported yet - */ -static int unshare_thread(unsigned long unshare_flags) -{ - if (unshare_flags & CLONE_THREAD) - return -EINVAL; + if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { + /* FIXME: get_task_mm() increments ->mm_users */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + } return 0; } @@ -1576,34 +1562,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) return 0; } -/* - * Unsharing of sighand is not supported yet - */ -static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) -{ - struct sighand_struct *sigh = current->sighand; - - if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) - return -EINVAL; - else - return 0; -} - -/* - * Unshare vm if it is being shared - */ -static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) -{ - struct mm_struct *mm = current->mm; - - if ((unshare_flags & CLONE_VM) && - (mm && atomic_read(&mm->mm_users) > 1)) { - return -EINVAL; - } - - return 0; -} - /* * Unshare file descriptor table if it is being shared */ @@ -1632,23 +1590,21 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp */ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { - int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct sighand_struct *new_sigh = NULL; - struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; + int err; - check_unshare_flags(&unshare_flags); - - /* Return -EINVAL for all unsupported flags */ - err = -EINVAL; - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + err = check_unshare_flags(unshare_flags); + if (err) goto bad_unshare_out; + /* + * If unsharing namespace, must also unshare filesystem information. + */ + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1656,21 +1612,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_thread(unshare_flags))) - goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) - goto bad_unshare_cleanup_thread; - if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_fs; - if ((err = unshare_vm(unshare_flags, &new_mm))) - goto bad_unshare_cleanup_sigh; + goto bad_unshare_out; if ((err = unshare_fd(unshare_flags, &new_fd))) - goto bad_unshare_cleanup_vm; + goto bad_unshare_cleanup_fs; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1696,19 +1646,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) spin_unlock(&fs->lock); } - if (new_mm) { - mm = current->mm; - active_mm = current->active_mm; - current->mm = new_mm; - current->active_mm = new_mm; - if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&mm->oom_disable_count); - atomic_inc(&new_mm->oom_disable_count); - } - activate_mm(active_mm, new_mm); - new_mm = mm; - } - if (new_fd) { fd = current->files; current->files = new_fd; @@ -1725,20 +1662,10 @@ bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); -bad_unshare_cleanup_vm: - if (new_mm) - mmput(new_mm); - -bad_unshare_cleanup_sigh: - if (new_sigh) - if (atomic_dec_and_test(&new_sigh->count)) - kmem_cache_free(sighand_cachep, new_sigh); - bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); -bad_unshare_cleanup_thread: bad_unshare_out: return err; } -- cgit v1.2.3 From fef2c9bc1b54c0261324a96e948c0b849796e896 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 22 Mar 2011 16:34:16 -0700 Subject: kernel/watchdog.c: allow hardlockup to panic by default When a cpu is considered stuck, instead of limping along and just printing a warning, it is sometimes preferred to just panic, let kdump capture the vmcore and reboot. This gets the machine back into a stable state quickly while saving the info that got it into a stuck state to begin with. Add a Kconfig option to allow users to set the hardlockup to panic by default. Also add in a 'nmi_watchdog=nopanic' to override this. [akpm@linux-foundation.org: fix strncmp length] Signed-off-by: Don Zickus Acked-by: Peter Zijlstra Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18bb15776c57..054a67cca9da 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic; +static int hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_enabled = 0; return 1; -- cgit v1.2.3 From f99a99330f85a84c346ddeb4adc72dbfad9b9e3e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 22 Mar 2011 16:34:17 -0700 Subject: kernel/watchdog.c: always return NOTIFY_OK during cpu up/down events This patch addresses a couple of problems. One was the case when the hardlockup failed to start, it also failed to start the softlockup. There were valid cases when the hardlockup shouldn't start and that shouldn't block the softlockup (no lapic, bios controls perf counters). The second problem was when the hardlockup failed to start on boxes (from a no lapic or bios controlled perf counter case), it reported failure to the cpu notifier chain. This blocked the notifier from continuing to start other more critical pieces of cpu bring-up (in our case based on a 2.6.32 fork, it was the mce). As a result, during soft cpu online/offline testing, the system would panic when a cpu was offlined because the cpu notifier would succeed in processing a watchdog disable cpu event and would panic in the mce case as a result of un-initialized variables from a never executed cpu up event. I realized the hardlockup/softlockup cases are really just debugging aids and should never impede the progress of a cpu up/down event. Therefore I modified the code to always return NOTIFY_OK and instead rely on printks to inform the user of problems. Signed-off-by: Don Zickus Acked-by: Peter Zijlstra Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 054a67cca9da..140dce750450 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -418,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu) static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err; + int err = 0; /* enable the perf event */ err = watchdog_nmi_enable(cpu); - if (err) - return err; + + /* Regardless of err above, fall through and start softlockup */ /* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - return PTR_ERR(p); + if (!err) + /* if hardlockup hasn't already set this */ + err = PTR_ERR(p); + goto out; } kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; @@ -438,7 +441,8 @@ static int watchdog_enable(int cpu) wake_up_process(p); } - return 0; +out: + return err; } static void watchdog_disable(int cpu) @@ -550,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif /* CONFIG_HOTPLUG_CPU */ } - return notifier_from_errno(err); + + /* + * hardlockup and softlockup are not important enough + * to block cpu bring up. Just always succeed and + * rely on printk output to flag problems. + */ + return NOTIFY_OK; } static struct notifier_block __cpuinitdata cpu_nfb = { -- cgit v1.2.3 From 7bf693951a8e5f7e600a45b74d91d962a453146e Mon Sep 17 00:00:00 2001 From: "Fabio M. Di Nitto" Date: Tue, 22 Mar 2011 16:34:20 -0700 Subject: console: allow to retain boot console via boot option keep_bootcon On some architectures, the boot process involves de-registering the boot console (early boot), initialize drivers and then re-register the console. This mechanism introduces a window in which no printk can happen on the console and messages are buffered and then printed once the new console is available. If a kernel crashes during this window, all it's left on the boot console is "console [foo] enabled, bootconsole disabled" making debug of the crash rather 'interesting'. By adding "keep_bootcon" option, do not unregister the boot console, that will allow to printk everything that is happening up to the crash. The option is clearly meant only for debugging purposes as it introduces lots of duplicated info printed on console, but will make bug report from users easier as it doesn't require a kernel build just to figure out where we crash. Signed-off-by: Fabio M. Di Nitto Acked-by: David S. Miller Cc: Alan Cox Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 33284adb2189..2b591f252e55 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1316,6 +1316,18 @@ void console_start(struct console *console) } EXPORT_SYMBOL(console_start); +static int __read_mostly keep_bootcon; + +static int __init keep_bootcon_setup(char *str) +{ + keep_bootcon = 1; + printk(KERN_INFO "debug: skip boot console de-registration.\n"); + + return 0; +} + +early_param("keep_bootcon", keep_bootcon_setup); + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -1463,7 +1475,9 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + if (bcon && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && + !keep_bootcon) { /* we need to iterate through twice, to make sure we print * everything out, before we unregister the console(s) */ -- cgit v1.2.3 From fe3d8ad31cf51b062bbb8a9609eeb1d0c41a7f30 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 22 Mar 2011 16:34:21 -0700 Subject: console: prevent registered consoles from dumping old kernel message over again For a platform with many consoles like: "console=tty1 console=ttyMFD2 console=ttyS0 earlyprintk=mrst" Each time when the non "selected_console" (tty1 and ttyMFD2 here) get registered, the existing kernel message will be printed out on registered consoles again, the "mrst" early console will get some same message for 3 times, and "tty1" will get some for twice. As suggested by Andrew Morton, every time a new console is registered, it will be set as the "exclusive" console which will dump the already existing kernel messages. Signed-off-by: Feng Tang Cc: Greg KH Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2b591f252e55..a53607eea6d0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -112,6 +112,11 @@ static unsigned log_start; /* Index into log_buf: next char to be read by syslog static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ +/* + * If exclusive_console is non-NULL then only this console is to be printed to. + */ +static struct console *exclusive_console; + /* * Array of consoles built from command line options (console=) */ @@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end) struct console *con; for_each_console(con) { + if (exclusive_console && con != exclusive_console) + continue; if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) @@ -1230,6 +1237,11 @@ void console_unlock(void) local_irq_restore(flags); } console_locked = 0; + + /* Release the exclusive_console once it is used */ + if (unlikely(exclusive_console)) + exclusive_console = NULL; + up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) @@ -1464,6 +1476,12 @@ void register_console(struct console *newcon) spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; spin_unlock_irqrestore(&logbuf_lock, flags); + /* + * We're about to replay the log buffer. Only do this to the + * just-registered console to avoid excessive message spam to + * the already-registered consoles. + */ + exclusive_console = newcon; } console_unlock(); console_sysfs_notify(); -- cgit v1.2.3 From 9f36e2c448007b54851e7e4fa48da97d1477a175 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Mar 2011 16:34:22 -0700 Subject: printk: use %pK for /proc/kallsyms and /proc/modules In an effort to reduce kernel address leaks that might be used to help target kernel privilege escalation exploits, this patch uses %pK when displaying addresses in /proc/kallsyms, /proc/modules, and /sys/module/*/sections/*. Note that this changes %x to %p, so some legitimately 0 values in /proc/kallsyms would have changed from 00000000 to "(null)". To avoid this, "(null)" is not used when using the "K" format. Anything that was already successfully parsing "(null)" in addition to full hex digits should have no problem with this change. (Thanks to Joe Perches for the suggestion.) Due to the %x to %p, "void *" casts are needed since these addresses are already "unsigned long" everywhere internally, due to their starting life as ELF section offsets. Signed-off-by: Kees Cook Cc: Eugene Teo Cc: Dan Rosenberg Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 10 ++++------ kernel/module.c | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..75dcca37d61a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -477,13 +477,11 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); - seq_printf(m, "%0*lx %c %s\t[%s]\n", - (int)(2 * sizeof(void *)), - iter->value, type, iter->name, iter->module_name); + seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, + type, iter->name, iter->module_name); } else - seq_printf(m, "%0*lx %c %s\n", - (int)(2 * sizeof(void *)), - iter->value, iter->type, iter->name); + seq_printf(m, "%pK %c %s\n", (void *)iter->value, + iter->type, iter->name); return 0; } diff --git a/kernel/module.c b/kernel/module.c index efa290ea94bf..1f9f7bc56ca1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%lx\n", sattr->address); + return sprintf(buf, "0x%pK\n", (void *)sattr->address); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading": "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%p", mod->module_core); + seq_printf(m, " 0x%pK", mod->module_core); /* Taints info */ if (mod->taints) -- cgit v1.2.3 From 5af5bcb8d37f99ba415a1adc6da71051b84f93a5 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 22 Mar 2011 16:34:23 -0700 Subject: printk: allow setting DEFAULT_MESSAGE_LEVEL via Kconfig We've been burned by regressions/bugs which we later realized could have been triaged quicker if only we'd paid closer attention to dmesg. To make it easier to audit dmesg, we'd like to make DEFAULT_MESSAGE_LEVEL Kconfig-settable. That way we can set it to KERN_NOTICE and audit any messages <= KERN_WARNING. Signed-off-by: Mandeep Singh Baines Cc: Ingo Molnar Cc: Joe Perches Cc: Olof Johansson Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a53607eea6d0..da8ca817eae3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) /* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL /* We show everything that is MORE important than this.. */ #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -- cgit v1.2.3