From 5f12a761937373d2f9b557d7519e6f1cf738b8f0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 11 Aug 2011 12:31:20 +0100 Subject: perf: provide PMU when initing events Currently, an event's 'pmu' field is set after pmu::event_init() is called. This means that pmu::event_init() must figure out which struct pmu the event was initialised from. This makes it difficult to consolidate common event initialisation code for similar PMUs, and very difficult to implement drivers for PMUs which can have multiple instances (e.g. a USB controller PMU, a GPU PMU, etc). This patch sets the 'pmu' field before initialising the event, allowing event init code to identify the struct pmu instance easily. In the event of failure to initialise an event, the event is destroyed via kfree() without calling perf_event::destroy(), so this shouldn't result in bad behaviour even if the destroy field was set before failure to initialise was noted. Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1313062280-19123-1-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b8785e26ee1c..68c8017de969 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5715,6 +5715,7 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { + event->pmu = pmu; ret = pmu->event_init(event); if (ret) pmu = ERR_PTR(ret); @@ -5722,6 +5723,7 @@ struct pmu *perf_init_event(struct perf_event *event) } list_for_each_entry_rcu(pmu, &pmus, entry) { + event->pmu = pmu; ret = pmu->event_init(event); if (!ret) goto unlock; @@ -5848,8 +5850,6 @@ done: return ERR_PTR(err); } - event->pmu = pmu; - if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) jump_label_inc(&perf_sched_events); -- cgit v1.2.3 From 590e4d857153c5d4cf86052cdfd42cf9b0779841 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 24 Jul 2011 16:33:13 +0000 Subject: sched: Allow SD_NODES_PER_DOMAIN to be overridden We want to override the default value of SD_NODES_PER_DOMAIN on ppc64, so move it into linux/topology.h. Signed-off-by: Anton Blanchard Acked-by: Peter Zijlstra Signed-off-by: Benjamin Herrenschmidt --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ec5f472bc5b9..d258b2d9a66d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6947,8 +6947,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -#define SD_NODES_PER_DOMAIN 16 - #ifdef CONFIG_NUMA /** -- cgit v1.2.3 From ab10023e0088d5075354afc7cb9e72304757dddd Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Thu, 10 Feb 2011 02:04:45 -0800 Subject: cpu_pm: Add cpu power management notifiers During some CPU power modes entered during idle, hotplug and suspend, peripherals located in the CPU power domain, such as the GIC, localtimers, and VFP, may be powered down. Add a notifier chain that allows drivers for those peripherals to be notified before and after they may be reset. Notified drivers can include VFP co-processor, interrupt controller and it's PM extensions, local CPU timers context save/restore which shouldn't be interrupted. Hence CPU PM event APIs must be called with interrupts disabled. Signed-off-by: Colin Cross Signed-off-by: Santosh Shilimkar Reviewed-by: Kevin Hilman Tested-and-Acked-by: Shawn Guo Tested-by: Kevin Hilman Tested-by: Vishwanath BS --- kernel/Makefile | 1 + kernel/cpu_pm.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/power/Kconfig | 4 ++ 3 files changed, 205 insertions(+) create mode 100644 kernel/cpu_pm.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index eca595e2fd52..988cb3da7031 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -101,6 +101,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o +obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c new file mode 100644 index 000000000000..4d1ff4acd04b --- /dev/null +++ b/kernel/cpu_pm.c @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2011 Google, Inc. + * + * Author: + * Colin Cross + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include + +static DEFINE_RWLOCK(cpu_pm_notifier_lock); +static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain); + +static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) +{ + int ret; + + ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, + nr_to_call, nr_calls); + + return notifier_to_errno(ret); +} + +/** + * cpu_pm_register_notifier - register a driver with cpu_pm + * @nb: notifier block to register + * + * Add a driver to a list of drivers that are notified about + * CPU and CPU cluster low power entry and exit. + * + * This function may sleep, and has the same return conditions as + * raw_notifier_chain_register. + */ +int cpu_pm_register_notifier(struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + write_lock_irqsave(&cpu_pm_notifier_lock, flags); + ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb); + write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); + +/** + * cpu_pm_unregister_notifier - unregister a driver with cpu_pm + * @nb: notifier block to be unregistered + * + * Remove a driver from the CPU PM notifier list. + * + * This function may sleep, and has the same return conditions as + * raw_notifier_chain_unregister. + */ +int cpu_pm_unregister_notifier(struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + write_lock_irqsave(&cpu_pm_notifier_lock, flags); + ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); + write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); + +/** + * cpm_pm_enter - CPU low power entry notifier + * + * Notifies listeners that a single CPU is entering a low power state that may + * cause some blocks in the same power domain as the cpu to reset. + * + * Must be called on the affected CPU with interrupts disabled. Platform is + * responsible for ensuring that cpu_pm_enter is not called twice on the same + * CPU before cpu_pm_exit is called. Notified drivers can include VFP + * co-processor, interrupt controller and it's PM extensions, local CPU + * timers context save/restore which shouldn't be interrupted. Hence it + * must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_pm_enter(void) +{ + int nr_calls; + int ret = 0; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); + if (ret) + /* + * Inform listeners (nr_calls - 1) about failure of CPU PM + * PM entry who are notified earlier to prepare for it. + */ + cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_enter); + +/** + * cpm_pm_exit - CPU low power exit notifier + * + * Notifies listeners that a single CPU is exiting a low power state that may + * have caused some blocks in the same power domain as the cpu to reset. + * + * Notified drivers can include VFP co-processor, interrupt controller + * and it's PM extensions, local CPU timers context save/restore which + * shouldn't be interrupted. Hence it must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_pm_exit(void) +{ + int ret; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_exit); + +/** + * cpm_cluster_pm_enter - CPU cluster low power entry notifier + * + * Notifies listeners that all cpus in a power domain are entering a low power + * state that may cause some blocks in the same power domain to reset. + * + * Must be called after cpu_pm_enter has been called on all cpus in the power + * domain, and before cpu_pm_exit has been called on any cpu in the power + * domain. Notified drivers can include VFP co-processor, interrupt controller + * and it's PM extensions, local CPU timers context save/restore which + * shouldn't be interrupted. Hence it must be called with interrupts disabled. + * + * Must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_cluster_pm_enter(void) +{ + int nr_calls; + int ret = 0; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); + if (ret) + /* + * Inform listeners (nr_calls - 1) about failure of CPU cluster + * PM entry who are notified earlier to prepare for it. + */ + cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); + +/** + * cpm_cluster_pm_exit - CPU cluster low power exit notifier + * + * Notifies listeners that all cpus in a power domain are exiting form a + * low power state that may have caused some blocks in the same power domain + * to reset. + * + * Must be called after cpu_pm_exit has been called on all cpus in the power + * domain, and before cpu_pm_exit has been called on any cpu in the power + * domain. Notified drivers can include VFP co-processor, interrupt controller + * and it's PM extensions, local CPU timers context save/restore which + * shouldn't be interrupted. Hence it must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_cluster_pm_exit(void) +{ + int ret; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 3744c594b19b..80a85971cf64 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -235,3 +235,7 @@ config PM_GENERIC_DOMAINS config PM_GENERIC_DOMAINS_RUNTIME def_bool y depends on PM_RUNTIME && PM_GENERIC_DOMAINS + +config CPU_PM + bool + depends on SUSPEND || CPU_IDLE -- cgit v1.2.3 From 6f3eaec87b6b17bfa49cb3b5b8d07fa84be18512 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Fri, 22 Jul 2011 14:57:09 -0700 Subject: cpu_pm: call notifiers during suspend Implements syscore_ops in cpu_pm to call the cpu and cpu cluster notifiers during suspend and resume, allowing drivers receiving the notifications to avoid implementing syscore_ops. Signed-off-by: Colin Cross Signed-off-by: Santosh Shilimkar Reviewed-by: Kevin Hilman Tested-and-Acked-by: Shawn Guo Tested-by: Vishwanath BS --- kernel/cpu_pm.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 4d1ff4acd04b..249152e15308 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -20,6 +20,7 @@ #include #include #include +#include static DEFINE_RWLOCK(cpu_pm_notifier_lock); static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain); @@ -198,3 +199,35 @@ int cpu_cluster_pm_exit(void) return ret; } EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); + +#ifdef CONFIG_PM +static int cpu_pm_suspend(void) +{ + int ret; + + ret = cpu_pm_enter(); + if (ret) + return ret; + + ret = cpu_cluster_pm_enter(); + return ret; +} + +static void cpu_pm_resume(void) +{ + cpu_cluster_pm_exit(); + cpu_pm_exit(); +} + +static struct syscore_ops cpu_pm_syscore_ops = { + .suspend = cpu_pm_suspend, + .resume = cpu_pm_resume, +}; + +static int cpu_pm_init(void) +{ + register_syscore_ops(&cpu_pm_syscore_ops); + return 0; +} +core_initcall(cpu_pm_init); +#endif -- cgit v1.2.3 From 9d823e8f6b1b7b39f952d7d1795f29162143a433 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 11 Jun 2011 18:10:12 -0600 Subject: writeback: per task dirty rate limit Add two fields to task_struct. 1) account dirtied pages in the individual tasks, for accuracy 2) per-task balance_dirty_pages() call intervals, for flexibility The balance_dirty_pages() call interval (ie. nr_dirtied_pause) will scale near-sqrt to the safety gap between dirty pages and threshold. The main problem of per-task nr_dirtied is, if 1k+ tasks start dirtying pages at exactly the same time, each task will be assigned a large initial nr_dirtied_pause, so that the dirty threshold will be exceeded long before each task reached its nr_dirtied_pause and hence call balance_dirty_pages(). The solution is to watch for the number of pages dirtied on each CPU in between the calls into balance_dirty_pages(). If it exceeds ratelimit_pages (3% dirty threshold), force call balance_dirty_pages() for a chance to set bdi->dirty_exceeded. In normal situations, this safeguarding condition is not expected to trigger at all. On the sqrt in dirty_poll_interval(): It will serve as an initial guess when dirty pages are still in the freerun area. When dirty pages are floating inside the dirty control scope [freerun, limit], a followup patch will use some refined dirty poll interval to get the desired pause time. thresh-dirty (MB) sqrt 1 16 2 22 4 32 8 45 16 64 32 90 64 128 128 181 256 256 512 362 1024 512 The above table means, given 1MB (or 1GB) gap and the dd tasks polling balance_dirty_pages() on every 16 (or 512) pages, the dirty limit won't be exceeded as long as there are less than 16 (or 512) concurrent dd's. So sqrt naturally leads to less overheads and more safe concurrent tasks for large memory servers, which have large (thresh-freerun) gaps. peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case CC: Peter Zijlstra Reviewed-by: Andrea Righi Signed-off-by: Wu Fengguang --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8e6b6f4fb272..cc0815df99f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1302,6 +1302,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->pdeath_signal = 0; p->exit_state = 0; + p->nr_dirtied = 0; + p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); + /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. -- cgit v1.2.3 From 825de2e9007439977aed63771db570fc2235e8cd Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Mon, 17 Oct 2011 11:08:46 +0900 Subject: irq: Add EXPORT_SYMBOL_GPL to function of irq generic-chip Some functions of irq generic-chip is undefined, because EXPORT_SYMBOL_GPL is not set to these. ERROR: "irq_setup_generic_chip" [drivers/gpio/gpio-pch.ko] undefined! ERROR: "irq_alloc_generic_chip" [drivers/gpio/gpio-pch.ko] undefined! ERROR: "irq_setup_generic_chip" [drivers/gpio/gpio-ml-ioh.ko] undefined! ERROR: "irq_alloc_generic_chip" [drivers/gpio/gpio-ml-ioh.ko] undefined! This is revised that EXPORT_SYMBOL_GPL can be added and referred to in functions. Signed-off-by: Nobuhiro Iwamatsu Acked-by: Thomas Gleixner Signed-off-by: Grant Likely --- kernel/irq/generic-chip.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index e38544dddb18..6cb7613e4bf4 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -211,6 +211,7 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, } return gc; } +EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); /* * Separate lockdep class for interrupt chip which can nest irq_desc @@ -258,6 +259,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, } gc->irq_cnt = i - gc->irq_base; } +EXPORT_SYMBOL_GPL(irq_setup_generic_chip); /** * irq_setup_alt_chip - Switch to alternative chip @@ -281,6 +283,7 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type) } return -EINVAL; } +EXPORT_SYMBOL_GPL(irq_setup_alt_chip); /** * irq_remove_generic_chip - Remove a chip @@ -311,6 +314,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, irq_modify_status(i, clr, set); } } +EXPORT_SYMBOL_GPL(irq_remove_generic_chip); #ifdef CONFIG_PM static int irq_gc_suspend(void) -- cgit v1.2.3 From 189c3fd68c7016e37c1ffd7a00009e2c944a9d06 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 30 Sep 2011 09:10:54 -0700 Subject: stop_machine: make stop_machine safe and efficient to call early Make stop_machine() safe to call early in boot, before stop_machine() has been set up, by simply calling the callback function directly if there's only one CPU online. [ Fixes from AKPM: - add comment - local_irq_flags, not save_flags - also call hard_irq_disable() for systems which need it Tejun suggested using an explicit flag rather than just looking at the online cpu count. ] Signed-off-by: Jeremy Fitzhardinge Acked-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Rusty Russell Cc: Peter Zijlstra Cc: Andrew Morton Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Steven Rostedt --- kernel/stop_machine.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ba5070ce5765..d3f960a6ca56 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -41,6 +41,7 @@ struct cpu_stopper { }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); +static bool stop_machine_initialized = false; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { @@ -386,6 +387,8 @@ static int __init cpu_stop_init(void) cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); register_cpu_notifier(&cpu_stop_cpu_notifier); + stop_machine_initialized = true; + return 0; } early_initcall(cpu_stop_init); @@ -485,6 +488,25 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) .num_threads = num_online_cpus(), .active_cpus = cpus }; + if (!stop_machine_initialized) { + /* + * Handle the case where stop_machine() is called + * early in boot before stop_machine() has been + * initialized. + */ + unsigned long flags; + int ret; + + WARN_ON_ONCE(smdata.num_threads != 1); + + local_irq_save(flags); + hard_irq_disable(); + ret = (*fn)(data); + local_irq_restore(flags); + + return ret; + } + /* Set the initial state and stop all online cpus. */ set_state(&smdata, STOPMACHINE_PREPARE); return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); -- cgit v1.2.3 From 37348804e0289087d21ae8bff4c0732030a3c6ac Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 29 Sep 2011 11:10:05 -0700 Subject: jump_label: if a key has already been initialized, don't nop it out If a key has been enabled before jump_label_init() is called, don't nop it out. This removes arch_jump_label_text_poke_early() (which can only nop out a site) and uses arch_jump_label_transform() instead. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jason Baron Acked-by: Peter Zijlstra --- kernel/jump_label.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a8ce45097f3d..059202d5b77a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -121,13 +121,6 @@ static void __jump_label_update(struct jump_label_key *key, } } -/* - * Not all archs need this. - */ -void __weak arch_jump_label_text_poke_early(jump_label_t addr) -{ -} - static __init int jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; @@ -139,12 +132,15 @@ static __init int jump_label_init(void) jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - arch_jump_label_text_poke_early(iter->code); - if (iter->key == (jump_label_t)(unsigned long)key) + struct jump_label_key *iterk; + + iterk = (struct jump_label_key *)(unsigned long)iter->key; + arch_jump_label_transform(iter, jump_label_enabled(iterk) ? + JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + if (iterk == key) continue; - key = (struct jump_label_key *)(unsigned long)iter->key; - atomic_set(&key->enabled, 0); + key = iterk; key->entries = iter; #ifdef CONFIG_MODULES key->next = NULL; @@ -212,7 +208,7 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) - arch_jump_label_text_poke_early(iter->code); + arch_jump_label_transform(iter, JUMP_LABEL_DISABLE); } static int jump_label_add_module(struct module *mod) -- cgit v1.2.3 From 20284aa77c0f6227da4783a920b72dc61d4bcc09 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 3 Oct 2011 11:01:46 -0700 Subject: jump_label: add arch_jump_label_transform_static() to optimise non-live code updates When updating a newly loaded module, the code is definitely not yet executing on any processor, so it can be updated with no need for any heavyweight synchronization. This patch adds arch_jump_label_static() which is implemented as arch_jump_label_transform() by default, but architectures can override it if it avoids, say, a call to stop_machine(). Signed-off-by: Jeremy Fitzhardinge Acked-by: Jason Baron Acked-by: Peter Zijlstra --- kernel/jump_label.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 059202d5b77a..ff2028f35aa8 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -104,6 +104,18 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, return 0; } +/* + * Update code which is definitely not currently executing. + * Architectures which need heavyweight synchronization to modify + * running code can override this to make the non-live update case + * cheaper. + */ +void __weak arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) +{ + arch_jump_label_transform(entry, type); +} + static void __jump_label_update(struct jump_label_key *key, struct jump_entry *entry, struct jump_entry *stop, int enable) @@ -135,8 +147,8 @@ static __init int jump_label_init(void) struct jump_label_key *iterk; iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? + JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); if (iterk == key) continue; @@ -208,7 +220,7 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) - arch_jump_label_transform(iter, JUMP_LABEL_DISABLE); + arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); } static int jump_label_add_module(struct module *mod) -- cgit v1.2.3 From 97ce2c88f9ad42e3c60a9beb9fca87abf3639faa Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 12 Oct 2011 16:17:54 -0700 Subject: jump-label: initialize jump-label subsystem much earlier Initialize jump_labels much, much earlier, so they're available for use during system setup. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- kernel/jump_label.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ff2028f35aa8..bbdfe2a462a0 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -133,7 +133,7 @@ static void __jump_label_update(struct jump_label_key *key, } } -static __init int jump_label_init(void) +void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; @@ -159,10 +159,7 @@ static __init int jump_label_init(void) #endif } jump_label_unlock(); - - return 0; } -early_initcall(jump_label_init); #ifdef CONFIG_MODULES -- cgit v1.2.3 From 3d214faea6e4f9b6018bf8589f4b245126349c0a Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Sun, 30 Oct 2011 15:16:36 +0100 Subject: [S390] kdump: Add KEXEC_CRASH_CONTROL_MEMORY_LIMIT On s390 there is a different KEXEC_CONTROL_MEMORY_LIMIT for the normal and the kdump kexec case. Therefore this patch introduces a new macro KEXEC_CRASH_CONTROL_MEMORY_LIMIT. This is set to KEXEC_CONTROL_MEMORY_LIMIT for all architectures that do not define KEXEC_CRASH_CONTROL_MEMORY_LIMIT. Acked-by: Vivek Goyal Acked-by: Andrew Morton Signed-off-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 296fbc84d659..7204fb982ed5 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -498,7 +498,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, while (hole_end <= crashk_res.end) { unsigned long i; - if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) + if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; if (hole_end > crashk_res.end) break; -- cgit v1.2.3 From d3bf37955d46718ee1a7f1fc69f953d2328ba7c2 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Sun, 30 Oct 2011 15:16:37 +0100 Subject: [S390] kdump: Add size to elfcorehdr kernel parameter Currently only the address of the pre-allocated ELF header is passed with the elfcorehdr= kernel parameter. In order to reserve memory for the header in the 2nd kernel also the size is required. Current kdump architecture backends use different methods to do that, e.g. x86 uses the memmap= kernel parameter. On s390 there is no easy way to transfer this information. Therefore the elfcorehdr kernel parameter is extended to also pass the size. This now can also be used as standard mechanism by all future kdump architecture backends. The syntax of the kernel parameter is extended as follows: elfcorehdr=[size[KMG]@]offset[KMG] This change is backward compatible because elfcorehdr=size is still allowed. Acked-by: Vivek Goyal Acked-by: Andrew Morton Signed-off-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- kernel/crash_dump.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 5f85690285d4..69ebf3380bac 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -19,9 +19,16 @@ unsigned long saved_max_pfn; */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +/* + * stores the size of elf header of crash image + */ +unsigned long long elfcorehdr_size; + /* * elfcorehdr= specifies the location of elf core header stored by the crashed * kernel. This option will be passed by kexec loader to the capture kernel. + * + * Syntax: elfcorehdr=[size[KMG]@]offset[KMG] */ static int __init setup_elfcorehdr(char *arg) { @@ -29,6 +36,10 @@ static int __init setup_elfcorehdr(char *arg) if (!arg) return -EINVAL; elfcorehdr_addr = memparse(arg, &end); + if (*end == '@') { + elfcorehdr_size = elfcorehdr_addr; + elfcorehdr_addr = memparse(end + 1, &end); + } return end > arg ? 0 : -EINVAL; } early_param("elfcorehdr", setup_elfcorehdr); -- cgit v1.2.3 From fa8ff292bb4844cc0b66b7b24d611eb7177f7ded Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Sun, 30 Oct 2011 15:16:41 +0100 Subject: [S390] kdump: Initialize vmcoreinfo note at startup Currently the vmcoreinfo note is only initialized in case of kdump. On s390 it is possible to create kernel dumps with other dump mechanisms than kdump (e.g. via hypervisor dump or stand-alone dump tools). For those dumps it would also be desirable to include the vmcoreinfo data. To accomplish this, with this patch the vmcoreinfo ELF note is always initialized, not only in case of a (kdump) crash. On s390 we will add an ABI defined pointer at a well known address to vmcoreinfo so that dump analysis tools are able to find this information. In particular on s390 we have a tool named zgetdump. With this tool it is possible to convert dump formats on the fly using fuse. E.g. you can mount a s390 stand-alone dump as ELF dump. When this is done, the tool finds the vmcoreinfo in the stand-alone dump via the well known ABI defined address and it creates the respective VMCOREINFO ELF note in the output ELF dump. This then can be used e.g. by makedumpfile for dump filtering. No more need for a vmlinux file with debug information. So this will look like the following: $ zgetdump --mount standalone.dump -f elf /mnt $ ls /mnt dump.elf $ readelf -n /mnt/dump.elf $ ... VMCOREINFO 0x00000474 Unknown note type: (0x00000000) $ makedumpfile -c -d 31 /mnt/dump.elf dump.kdump Cc: Andrew Morton Acked-by: Vivek Goyal Signed-off-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- kernel/kexec.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 7204fb982ed5..d3b8a4ceb90b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1380,24 +1380,23 @@ int __init parse_crashkernel(char *cmdline, } - -void crash_save_vmcoreinfo(void) +static void update_vmcoreinfo_note(void) { - u32 *buf; + u32 *buf = vmcoreinfo_note; if (!vmcoreinfo_size) return; - - vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); - - buf = (u32 *)vmcoreinfo_note; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, vmcoreinfo_size); - final_note(buf); } +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); + update_vmcoreinfo_note(); +} + void vmcoreinfo_append_str(const char *fmt, ...) { va_list args; @@ -1483,6 +1482,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_swapcache); arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); return 0; } -- cgit v1.2.3 From 558df7209e7997275f6b8ad37737494cf2da1512 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Sun, 30 Oct 2011 15:16:43 +0100 Subject: [S390] kdump: Add infrastructure for unmapping crashkernel memory This patch introduces a mechanism that allows architecture backends to remove page tables for the crashkernel memory. This can protect the loaded kdump kernel from being overwritten by broken kernel code. Two new functions crash_map_reserved_pages() and crash_unmap_reserved_pages() are added that can be implemented by architecture code. The crash_map_reserved_pages() function is called before and crash_unmap_reserved_pages() after the crashkernel segments are loaded. The functions are also called in crash_shrink_memory() to create/remove page tables when the crashkernel memory size is reduced. To support architectures that have large pages this patch also introduces a new define KEXEC_CRASH_MEM_ALIGN. The crashkernel start and size must always be aligned with KEXEC_CRASH_MEM_ALIGN. Cc: Andrew Morton Acked-by: Vivek Goyal Signed-off-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- kernel/kexec.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index d3b8a4ceb90b..dc7bc0829286 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -999,6 +999,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, kimage_free(xchg(&kexec_crash_image, NULL)); result = kimage_crash_alloc(&image, entry, nr_segments, segments); + crash_map_reserved_pages(); } if (result) goto out; @@ -1015,6 +1016,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, goto out; } kimage_terminate(image); + if (flags & KEXEC_ON_CRASH) + crash_unmap_reserved_pages(); } /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); @@ -1026,6 +1029,18 @@ out: return result; } +/* + * Add and remove page tables for crashkernel memory + * + * Provide an empty default implementation here -- architecture + * code may override this + */ +void __weak crash_map_reserved_pages(void) +{} + +void __weak crash_unmap_reserved_pages(void) +{} + #ifdef CONFIG_COMPAT asmlinkage long compat_sys_kexec_load(unsigned long entry, unsigned long nr_segments, @@ -1134,14 +1149,16 @@ int crash_shrink_memory(unsigned long new_size) goto unlock; } - start = roundup(start, PAGE_SIZE); - end = roundup(start + new_size, PAGE_SIZE); + start = roundup(start, KEXEC_CRASH_MEM_ALIGN); + end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); + crash_map_reserved_pages(); crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); crashk_res.end = end - 1; + crash_unmap_reserved_pages(); unlock: mutex_unlock(&kexec_mutex); -- cgit v1.2.3 From 638ad34a8811119b32247b7722288ef8b90907d1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Sun, 30 Oct 2011 15:17:13 +0100 Subject: [S390] sparse: fix sparse warnings about missing prototypes Add prototypes and includes for functions used in different modules. Signed-off-by: Martin Schwidefsky --- kernel/sysctl.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d2ecdcc8cdb..2fe2bc2a57ea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -151,14 +151,6 @@ extern int pwrsw_enabled; extern int unaligned_enabled; #endif -#ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU -extern int sysctl_ieee_emulation_warnings; -#endif -extern int sysctl_userprocess_debug; -extern int spin_retry; -#endif - #ifdef CONFIG_IA64 extern int no_unaligned_warning; extern int unaligned_dump_stack; -- cgit v1.2.3 From 6d274309d0e64bdbdb6c50945ca2964596e8fa5a Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 30 Sep 2011 10:48:38 -0500 Subject: irq: support domains with non-zero hwirq base Interrupt controllers can have non-zero starting value for h/w irq numbers. Adding support in irq_domain allows the domain hwirq numbering to match the interrupt controllers' numbering. As this makes looping over irqs for a domain more complicated, add loop iterators to iterate over all hwirqs and irqs for a domain. Signed-off-by: Rob Herring Reviewed-by: Jamie Iles Tested-by: Thomas Abraham Acked-by: Grant Likely Acked-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b57a3776de44..200ce832c585 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -20,15 +20,15 @@ static DEFINE_MUTEX(irq_domain_mutex); void irq_domain_add(struct irq_domain *domain) { struct irq_data *d; - int hwirq; + int hwirq, irq; /* * This assumes that the irq_domain owner has already allocated * the irq_descs. This block will be removed when support for dynamic * allocation of irq_descs is added to irq_domain. */ - for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { - d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); + irq_domain_for_each_irq(domain, hwirq, irq) { + d = irq_get_irq_data(irq); if (!d) { WARN(1, "error: assigning domain to non existant irq_desc"); return; @@ -54,15 +54,15 @@ void irq_domain_add(struct irq_domain *domain) void irq_domain_del(struct irq_domain *domain) { struct irq_data *d; - int hwirq; + int hwirq, irq; mutex_lock(&irq_domain_mutex); list_del(&domain->list); mutex_unlock(&irq_domain_mutex); /* Clear the irq_domain assignments */ - for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { - d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); + irq_domain_for_each_irq(domain, hwirq, irq) { + d = irq_get_irq_data(irq); d->domain = NULL; } } -- cgit v1.2.3 From 9a418455134f5dc23f124d2818b2e8e1cea997a1 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 25 May 2011 09:52:21 -0400 Subject: range: fix bogus misuse of module.h to get printk() This file isn't doing anything with modules and so it should not be including just to get basic stuff like printk() and min/max. Revector it to . Signed-off-by: Paul Gortmaker --- kernel/range.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index 37fa9b99ad58..9b8ae2d6ed68 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -1,7 +1,7 @@ /* * Range add and subtract */ -#include +#include #include #include -- cgit v1.2.3 From 9984de1a5a8a96275fcab818f7419af5a3c86e71 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 23 May 2011 14:51:41 -0400 Subject: kernel: Map most files to use export.h instead of module.h The changed files were only including linux/module.h for the EXPORT_SYMBOL infrastructure, and nothing else. Revector them onto the isolated export header for faster compile times. Nothing to see here but a whole lot of instances of: -#include +#include This commit is only changing the kernel dir; next targets will probably be mm, fs, the arch dirs, etc. Signed-off-by: Paul Gortmaker --- kernel/async.c | 2 +- kernel/audit.c | 2 +- kernel/auditsc.c | 2 +- kernel/capability.c | 2 +- kernel/cgroup_freezer.c | 2 +- kernel/cpu.c | 2 +- kernel/cpuset.c | 2 +- kernel/crash_dump.c | 2 +- kernel/cred.c | 2 +- kernel/dma.c | 2 +- kernel/freezer.c | 2 +- kernel/futex.c | 2 +- kernel/groups.c | 2 +- kernel/hrtimer.c | 2 +- kernel/hung_task.c | 2 +- kernel/irq_work.c | 2 +- kernel/kfifo.c | 2 +- kernel/kprobes.c | 2 +- kernel/ksysfs.c | 2 +- kernel/kthread.c | 2 +- kernel/latencytop.c | 2 +- kernel/lockdep_proc.c | 2 +- kernel/module.c | 2 +- kernel/mutex-debug.c | 2 +- kernel/mutex.c | 2 +- kernel/notifier.c | 2 +- kernel/nsproxy.c | 2 +- kernel/padata.c | 2 +- kernel/pid.c | 2 +- kernel/posix-timers.c | 2 +- kernel/profile.c | 2 +- kernel/ptrace.c | 2 +- kernel/rcupdate.c | 2 +- kernel/rcutiny.c | 2 +- kernel/rcutree.c | 2 +- kernel/relay.c | 2 +- kernel/resource.c | 2 +- kernel/rtmutex-debug.c | 2 +- kernel/rtmutex-tester.c | 2 +- kernel/rtmutex.c | 2 +- kernel/rwsem.c | 2 +- kernel/sched_clock.c | 2 +- kernel/semaphore.c | 2 +- kernel/signal.c | 2 +- kernel/smp.c | 2 +- kernel/softirq.c | 2 +- kernel/spinlock.c | 2 +- kernel/srcu.c | 2 +- kernel/stacktrace.c | 2 +- kernel/stop_machine.c | 2 +- kernel/sys.c | 2 +- kernel/time.c | 2 +- kernel/timer.c | 2 +- kernel/up.c | 2 +- kernel/user-return-notifier.c | 2 +- kernel/user.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- kernel/utsname_sysctl.c | 2 +- kernel/wait.c | 2 +- kernel/workqueue.c | 2 +- 61 files changed, 61 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 4c2843c0043e..80b74b88fefe 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -51,7 +51,7 @@ asynchronous and synchronous parts of the kernel. #include #include #include -#include +#include #include #include #include diff --git a/kernel/audit.c b/kernel/audit.c index 0a1355ca3d79..09fae2677a45 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ce4b054acee5..47b7fc1ea893 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/capability.c b/kernel/capability.c index 283c529f8b1c..b463871a4e69 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e691818d7e45..5e828a2ca8e6 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -14,7 +14,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ -#include +#include #include #include #include diff --git a/kernel/cpu.c b/kernel/cpu.c index 12b7458f23b1..6a81ca906a06 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10131fdaff70..d970fb508e34 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 5f85690285d4..20e699265cef 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include /* * If we have booted due to a crash, max_pfn will be a very low value. We need diff --git a/kernel/cred.c b/kernel/cred.c index bb55d052d858..5791612a4045 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ -#include +#include #include #include #include diff --git a/kernel/dma.c b/kernel/dma.c index f903189c5304..68a2306522c8 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -9,7 +9,7 @@ * [It also happened to remove the sizeof(char *) == sizeof(int) * assumption introduced because of those /proc/dma patches. -- Hennus] */ -#include +#include #include #include #include diff --git a/kernel/freezer.c b/kernel/freezer.c index 66a594e8ad2f..f24aa0005530 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include diff --git a/kernel/futex.c b/kernel/futex.c index 1511dff0cfd6..ea87f4d2f455 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/groups.c b/kernel/groups.c index 1cc476d52dd3..99b53d1eb7ea 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -2,7 +2,7 @@ * Supplementary group IDs */ #include -#include +#include #include #include #include diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index a9205e32a059..422e567eecf6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -32,7 +32,7 @@ */ #include -#include +#include #include #include #include diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ea640120ab86..8b1748d0172c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include /* diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 0e2cde4f380b..3e460ea44955 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include #include diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 01a0700e873f..c744b88c44e2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -20,7 +20,7 @@ */ #include -#include +#include #include #include #include diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2f193d0ba7f2..e5d84644823b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3b053c04dd86..6771de3a655d 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ba7cccb4994..b6d216a92639 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 4ac8ebfcab59..a462b317f9a0 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 71edd2f60c02..91c32a0b612c 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -11,7 +11,7 @@ * Code for /proc/lockdep and /proc/lockdep_stats: * */ -#include +#include #include #include #include diff --git a/kernel/module.c b/kernel/module.c index 93342d992f34..84205ae1607a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -16,7 +16,7 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include +#include #include #include #include diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 73da83aff418..7e3443fe1f48 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -14,7 +14,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/kernel/mutex.c b/kernel/mutex.c index d607ed5dd441..89096dd8786f 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -19,7 +19,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/kernel/notifier.c b/kernel/notifier.c index 8d7b435806c9..2d5cc4ccff7f 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 9aeab4b98c64..b576f7f14bc6 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -14,7 +14,7 @@ */ #include -#include +#include #include #include #include diff --git a/kernel/padata.c b/kernel/padata.c index b91941df5e63..b45259931512 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -18,7 +18,7 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ -#include +#include #include #include #include diff --git a/kernel/pid.c b/kernel/pid.c index 8cafe7e72ad2..fa5f72227e5f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -27,7 +27,7 @@ */ #include -#include +#include #include #include #include diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4556182527f3..69185ae6b701 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include /* * Management arrays for POSIX timers. Timers are kept in slab memory diff --git a/kernel/profile.c b/kernel/profile.c index 961b389fe52f..76b8e77773ee 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -13,7 +13,7 @@ * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 */ -#include +#include #include #include #include diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a70d2a5d8c7b..24d04477b257 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -8,7 +8,7 @@ */ #include -#include +#include #include #include #include diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ca0d23b6b3e8..c5b98e565aee 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #define CREATE_TRACE_POINTS diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index da775c87f27f..b5e525d67fe3 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e234eb92a177..6b76d812740c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/relay.c b/kernel/relay.c index 859ea5a9605f..226fade4d727 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/resource.c b/kernel/resource.c index c8dc249da5ce..7640b3a947d0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -7,7 +7,7 @@ * Arbitrary resource management. */ -#include +#include #include #include #include diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index a2e7e7210f3e..8eafd1bd273e 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -18,7 +18,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 5c9ccd380966..3d9f31cd79e7 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -7,7 +7,7 @@ * */ #include -#include +#include #include #include #include diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 5e8d9cce7470..f9d8482dd487 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -11,7 +11,7 @@ * See Documentation/rt-mutex-design.txt for details. */ #include -#include +#include #include #include diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 9f48f3d82e9b..b152f74f02de 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 9d8af0b3fb64..c685e31492df 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -62,7 +62,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/kernel/semaphore.c b/kernel/semaphore.c index d831841e55a7..60636a4e25c3 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index d252be2d3de5..b3f78d09a105 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -11,7 +11,7 @@ */ #include -#include +#include #include #include #include diff --git a/kernel/smp.c b/kernel/smp.c index fb67dfa8394e..db197d60489b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/softirq.c b/kernel/softirq.c index fca82c32042b..2c71d91efff0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -10,7 +10,7 @@ * Remote softirq infrastructure is by Jens Axboe. */ -#include +#include #include #include #include diff --git a/kernel/spinlock.c b/kernel/spinlock.c index be6517fb9c14..84c7d96918bf 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include /* * If lockdep is enabled then we use the non-preemption spin-ops diff --git a/kernel/srcu.c b/kernel/srcu.c index 73ce23feaea9..0febf61e1aa3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -24,7 +24,7 @@ * */ -#include +#include #include #include #include diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index d20c6983aad9..00fe55cc5a82 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -7,7 +7,7 @@ */ #include #include -#include +#include #include #include diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ba5070ce5765..e78db365fa83 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 58459509b14c..4a0286241829 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -4,7 +4,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#include +#include #include #include #include diff --git a/kernel/time.c b/kernel/time.c index d77606214529..73e416db0a1e 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -27,7 +27,7 @@ * with nanosecond accuracy */ -#include +#include #include #include #include diff --git a/kernel/timer.c b/kernel/timer.c index 8cff36119e4d..dbaa62422b13 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -20,7 +20,7 @@ */ #include -#include +#include #include #include #include diff --git a/kernel/up.c b/kernel/up.c index 1ff27a28bb7d..c54c75e9faf7 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -4,7 +4,7 @@ #include #include -#include +#include #include int smp_call_function_single(int cpu, void (*func) (void *info), void *info, diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 92cb706c7fc8..1744bb80f1fb 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); diff --git a/kernel/user.c b/kernel/user.c index 9e03e9c1df8d..71dd2363ab0f 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include /* diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9da289c34f22..3b906e98b1db 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -5,7 +5,7 @@ * License. */ -#include +#include #include #include #include diff --git a/kernel/utsname.c b/kernel/utsname.c index bff131b9510a..405caf91aad5 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -9,7 +9,7 @@ * License. */ -#include +#include #include #include #include diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index a2cd77e70d4d..5a709452ec19 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -9,7 +9,7 @@ * License. */ -#include +#include #include #include #include diff --git a/kernel/wait.c b/kernel/wait.c index f45ea8d2a1ce..26fa7797f90f 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -4,7 +4,7 @@ * (C) 2004 William Irwin, Oracle */ #include -#include +#include #include #include #include diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1783aabc6128..42fa9ad0a810 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -23,7 +23,7 @@ * Please read Documentation/workqueue.txt for details. */ -#include +#include #include #include #include -- cgit v1.2.3 From 56d82e000cdfb51aa92241d4682302f78c35cd92 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 26 May 2011 17:53:52 -0400 Subject: kernel: Add to files using it implicitly These files are doing things like module_put and try_module_get so they need to call out the module.h for explicit inclusion, rather than getting it via which we ideally want to remove the module.h inclusion from. Signed-off-by: Paul Gortmaker --- kernel/trace/ftrace.c | 1 + kernel/trace/trace_syscalls.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 077d85387908..900b409543db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ee7b5a0bb9f8..cb654542c1a1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -2,6 +2,7 @@ #include #include #include +#include /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ #include #include #include -- cgit v1.2.3 From 74da1ff71350f3638c51613085f89c0865d7fe08 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 26 May 2011 12:48:41 -0400 Subject: kernel: fix several implicit usasges of kmod.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These files were implicitly relying on coming in via module.h, as without it we get things like: kernel/power/suspend.c:100: error: implicit declaration of function ‘usermodehelper_disable’ kernel/power/suspend.c:109: error: implicit declaration of function ‘usermodehelper_enable’ kernel/power/user.c:254: error: implicit declaration of function ‘usermodehelper_disable’ kernel/power/user.c:261: error: implicit declaration of function ‘usermodehelper_enable’ kernel/sys.c:317: error: implicit declaration of function ‘usermodehelper_disable’ kernel/sys.c:1816: error: implicit declaration of function ‘call_usermodehelper_setup’ kernel/sys.c:1822: error: implicit declaration of function ‘call_usermodehelper_setfns’ kernel/sys.c:1824: error: implicit declaration of function ‘call_usermodehelper_exec’ Signed-off-by: Paul Gortmaker --- kernel/power/suspend.c | 1 + kernel/power/user.c | 1 + kernel/sys.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index fdd4263b995d..31aae3219f89 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/power/user.c b/kernel/power/user.c index 42ddbc6f0de6..6d8f535c2b88 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 4a0286241829..d2cb1cda823a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 967d1f90625ed9c1ab205d3f738fedf9d852e1fd Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 18 Jul 2011 13:03:04 -0400 Subject: kernel: fix two implicit header assumptions in irq_work.c Up until now, this file was getting percpu.h because nearly every file was implicitly getting module.h (and all its sub-includes). But we want to clean that up, so call out percpu.h explicitly. Otherwise we'll get things like this on an ARM build: kernel/irq_work.c:48: error: expected declaration specifiers or '...' before 'irq_work_list' kernel/irq_work.c:48: warning: type defaults to 'int' in declaration of 'DEFINE_PER_CPU' The same thing was happening for builds on ARM for asm/processor.h kernel/irq_work.c: In function 'irq_work_sync': kernel/irq_work.c:166: error: implicit declaration of function 'cpu_relax' Signed-off-by: Paul Gortmaker --- kernel/irq_work.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 3e460ea44955..c3c46c72046e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include /* * An entry can be in one of four states: -- cgit v1.2.3 From 1596425fd7545abccb7b5059376e4d56c3d6be4d Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 28 Jul 2011 14:22:29 -0400 Subject: kernel: ksysfs.c is implicitly using stat.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the module.h usage cleanup, we'll get this: kernel/ksysfs.c:161: error: ‘S_IRUGO’ undeclared here (not in a function) make[2]: *** [kernel/ksysfs.o] Error 1 Signed-off-by: Paul Gortmaker --- kernel/ksysfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6771de3a655d..4e316e1acf58 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 72a59aaada499d9bbf19f2fb68daa37502e4a9bb Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 26 May 2011 21:07:10 -0400 Subject: kernel: params.c needs module.h not moduleparam.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Through various other implicit include paths, some files were getting the full module.h file, and hence living the illusion that they really only needed moduleparam.h -- but the reality is that once you remove the module.h presence, these show up: kernel/params.c:583: warning: ‘struct module_kobject’ declared inside parameter list Such files really require module.h so simply make it so. As the file module.h grabs moduleparam.h on the fly, all will be well. Signed-off-by: Paul Gortmaker --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 821788947e40..65aae11eb93f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -15,7 +15,7 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include +#include #include #include #include -- cgit v1.2.3 From bdfa97bf7263657b83bc5b68567a3a60dde84c5b Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 25 Oct 2011 13:13:57 -0400 Subject: kernel: fix up module header handling in rcutiny files The file rcutiny.c does not need moduleparam.h header, as there are no modparams in this file. However rcutiny_plugin.h does define a module_init() and a module_exit() and it uses the various MODULE_ macros, so it really does need module.h included. Signed-off-by: Paul Gortmaker --- kernel/rcutiny.c | 1 - kernel/rcutiny_plugin.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index b5e525d67fe3..636af6d9c6e5 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -22,7 +22,6 @@ * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU */ -#include #include #include #include diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 02aa7139861c..2b0484a5dc28 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -23,6 +23,7 @@ */ #include +#include #include #include -- cgit v1.2.3 From 6e5fdeedca610df600aabc393c4b1f44b128fe49 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 26 May 2011 16:00:52 -0400 Subject: kernel: Fix files explicitly needing EXPORT_SYMBOL infrastructure These files were getting via an implicit non-obvious path, but we want to crush those out of existence since they cost time during compiles of processing thousands of lines of headers for no reason. Give them the lightweight header that just contains the EXPORT_SYMBOL infrastructure. Signed-off-by: Paul Gortmaker --- kernel/compat.c | 1 + kernel/debug/kdb/kdb_debugger.c | 1 + kernel/events/core.c | 1 + kernel/irq/generic-chip.c | 1 + kernel/power/hibernate.c | 1 + kernel/power/main.c | 1 + kernel/power/qos.c | 1 + kernel/power/suspend.c | 1 + kernel/time/posix-clock.c | 1 + kernel/trace/blktrace.c | 1 + 10 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index e2435ee9993a..f346cedfe24d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index d9ca9aa481ec..8b68ce78ff17 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "kdb_private.h" #include "../debug_core.h" diff --git a/kernel/events/core.c b/kernel/events/core.c index d1a1bee35228..92b8811f2234 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 6cb7613e4bf4..c89295a8f668 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1c53f7fad5f7..b4511b6d3ef9 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -9,6 +9,7 @@ * This file is released under the GPLv2. */ +#include #include #include #include diff --git a/kernel/power/main.c b/kernel/power/main.c index a52e88425a31..71f49fe4377e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -8,6 +8,7 @@ * */ +#include #include #include #include diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 1c1797dd1d1d..2c0a65e27626 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -43,6 +43,7 @@ #include #include +#include /* * locking rule: all changes to constraints or notifiers lists diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 31aae3219f89..4953dc054c53 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index c340ca658f37..ce033c7aa2e8 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -18,6 +18,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include +#include #include #include #include diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7c910a5593a6..16fc34a0806f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From ec53cf23c0ddb0c29950b9a4ac46964c4c6c6c2f Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 19 Sep 2011 20:33:19 -0400 Subject: irq: don't put module.h into irq.h for tracking irqgen modules. Recent commit "irq: Track the owner of irq descriptor" in commit ID b6873807a7143b7 placed module.h into linux/irq.h but we are trying to limit module.h inclusion to just C files that really need it, due to its size and number of children includes. This targets just reversing that include. Add in the basic "struct module" since that is all we really need to ensure things compile. In theory, b687380 should have added the module.h include to the irqdesc.h header as well, but the implicit module.h everywhere presence masked this from showing up. So give it the "struct module" as well. As for the C files, irqdesc.c is only using THIS_MODULE, so it does not need module.h - give it export.h instead. The C file irq/manage.c is now (as of b687380) using try_module_get and module_put and so it needs module.h (which it already has). Also convert the irq_alloc_descs variants to macros, since all they really do is is call the __irq_alloc_descs primitive. This avoids including export.h and no debug info is lost. Signed-off-by: Paul Gortmaker --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1550e8447a16..d86e254b95eb 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -9,7 +9,7 @@ */ #include #include -#include +#include #include #include #include -- cgit v1.2.3 From fcf634098c00dd9cd247447368495f0b79be12d1 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Mon, 31 Oct 2011 17:06:39 -0700 Subject: Cross Memory Attach The basic idea behind cross memory attach is to allow MPI programs doing intra-node communication to do a single copy of the message rather than a double copy of the message via shared memory. The following patch attempts to achieve this by allowing a destination process, given an address and size from a source process, to copy memory directly from the source process into its own address space via a system call. There is also a symmetrical ability to copy from the current process's address space into a destination process's address space. - Use of /proc/pid/mem has been considered, but there are issues with using it: - Does not allow for specifying iovecs for both src and dest, assuming preadv or pwritev was implemented either the area read from or written to would need to be contiguous. - Currently mem_read allows only processes who are currently ptrace'ing the target and are still able to ptrace the target to read from the target. This check could possibly be moved to the open call, but its not clear exactly what race this restriction is stopping (reason appears to have been lost) - Having to send the fd of /proc/self/mem via SCM_RIGHTS on unix domain socket is a bit ugly from a userspace point of view, especially when you may have hundreds if not (eventually) thousands of processes that all need to do this with each other - Doesn't allow for some future use of the interface we would like to consider adding in the future (see below) - Interestingly reading from /proc/pid/mem currently actually involves two copies! (But this could be fixed pretty easily) As mentioned previously use of vmsplice instead was considered, but has problems. Since you need the reader and writer working co-operatively if the pipe is not drained then you block. Which requires some wrapping to do non blocking on the send side or polling on the receive. In all to all communication it requires ordering otherwise you can deadlock. And in the example of many MPI tasks writing to one MPI task vmsplice serialises the copying. There are some cases of MPI collectives where even a single copy interface does not get us the performance gain we could. For example in an MPI_Reduce rather than copy the data from the source we would like to instead use it directly in a mathops (say the reduce is doing a sum) as this would save us doing a copy. We don't need to keep a copy of the data from the source. I haven't implemented this, but I think this interface could in the future do all this through the use of the flags - eg could specify the math operation and type and the kernel rather than just copying the data would apply the specified operation between the source and destination and store it in the destination. Although we don't have a "second user" of the interface (though I've had some nibbles from people who may be interested in using it for intra process messaging which is not MPI). This interface is something which hardware vendors are already doing for their custom drivers to implement fast local communication. And so in addition to this being useful for OpenMPI it would mean the driver maintainers don't have to fix things up when the mm changes. There was some discussion about how much faster a true zero copy would go. Here's a link back to the email with some testing I did on that: http://marc.info/?l=linux-mm&m=130105930902915&w=2 There is a basic man page for the proposed interface here: http://ozlabs.org/~cyeoh/cma/process_vm_readv.txt This has been implemented for x86 and powerpc, other architecture should mainly (I think) just need to add syscall numbers for the process_vm_readv and process_vm_writev. There are 32 bit compatibility versions for 64-bit kernels. For arch maintainers there are some simple tests to be able to quickly verify that the syscalls are working correctly here: http://ozlabs.org/~cyeoh/cma/cma-test-20110718.tgz Signed-off-by: Chris Yeoh Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: David Howells Cc: James Morris Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a9a5de07c4f1..47bfa16430d7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -145,6 +145,10 @@ cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); cond_syscall(sys_syslog); +cond_syscall(sys_process_vm_readv); +cond_syscall(sys_process_vm_writev); +cond_syscall(compat_sys_process_vm_readv); +cond_syscall(compat_sys_process_vm_writev); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.3 From c9f01245b6a7d77d17deaa71af10f6aca14fa24e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 31 Oct 2011 17:07:15 -0700 Subject: oom: remove oom_disable_count This removes mm->oom_disable_count entirely since it's unnecessary and currently buggy. The counter was intended to be per-process but it's currently decremented in the exit path for each thread that exits, causing it to underflow. The count was originally intended to prevent oom killing threads that share memory with threads that cannot be killed since it doesn't lead to future memory freeing. The counter could be fixed to represent all threads sharing the same mm, but it's better to remove the count since: - it is possible that the OOM_DISABLE thread sharing memory with the victim is waiting on that thread to exit and will actually cause future memory freeing, and - there is no guarantee that a thread is disabled from oom killing just because another thread sharing its mm is oom disabled. Signed-off-by: David Rientjes Reported-by: Oleg Nesterov Reviewed-by: Oleg Nesterov Cc: Ying Han Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 -- kernel/fork.c | 10 +--------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2913b3509d42..d0b7d988f873 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -681,8 +681,6 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); - if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index 8e6b6f4fb272..70d76191afb9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -501,7 +501,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); - atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -816,8 +815,6 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; - if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1391,13 +1388,8 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) { - task_lock(p); - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&p->mm->oom_disable_count); - task_unlock(p); + if (p->mm) mmput(p->mm); - } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); -- cgit v1.2.3 From bc3e53f682d93df677dbd5006a404722b3adfe18 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 31 Oct 2011 17:07:30 -0700 Subject: mm: distinguish between mlocked and pinned pages Some kernel components pin user space memory (infiniband and perf) (by increasing the page count) and account that memory as "mlocked". The difference between mlocking and pinning is: A. mlocked pages are marked with PG_mlocked and are exempt from swapping. Page migration may move them around though. They are kept on a special LRU list. B. Pinned pages cannot be moved because something needs to directly access physical memory. They may not be on any LRU list. I recently saw an mlockalled process where mm->locked_vm became bigger than the virtual size of the process (!) because some memory was accounted for twice: Once when the page was mlocked and once when the Infiniband layer increased the refcount because it needt to pin the RDMA memory. This patch introduces a separate counter for pinned pages and accounts them seperately. Signed-off-by: Christoph Lameter Cc: Mike Marciniszyn Cc: Roland Dreier Cc: Sean Hefty Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d1a1bee35228..12a0287e0358 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3544,7 +3544,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct ring_buffer *rb = event->rb; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->mmap_locked; + vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); mutex_unlock(&event->mmap_mutex); @@ -3625,7 +3625,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; + locked = vma->vm_mm->pinned_vm + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -3651,7 +3651,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; event->mmap_user = get_current_user(); - vma->vm_mm->locked_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += event->mmap_locked; unlock: if (!ret) -- cgit v1.2.3 From f445027e4e692bd885118460b292d08027fd5501 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 31 Oct 2011 17:11:15 -0700 Subject: stop_machine: make stop_machine safe and efficient to call early Make stop_machine() safe to call early in boot, before SMP has been set up, by simply calling the callback function directly if there's only one CPU online. [ Fixes from AKPM: - add comment - local_irq_flags, not save_flags - also call hard_irq_disable() for systems which need it Tejun suggested using an explicit flag rather than just looking at the online cpu count. ] Cc: Tejun Heo Acked-by: Rusty Russell Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Steven Rostedt Acked-by: Tejun Heo Cc: Konrad Rzeszutek Wilk Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ba5070ce5765..5b0951aa0496 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -41,6 +41,7 @@ struct cpu_stopper { }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); +static bool stop_machine_initialized = false; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { @@ -386,6 +387,8 @@ static int __init cpu_stop_init(void) cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); register_cpu_notifier(&cpu_stop_cpu_notifier); + stop_machine_initialized = true; + return 0; } early_initcall(cpu_stop_init); @@ -485,6 +488,25 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) .num_threads = num_online_cpus(), .active_cpus = cpus }; + if (!stop_machine_initialized) { + /* + * Handle the case where stop_machine() is called + * early in boot before stop_machine() has been + * initialized. + */ + unsigned long flags; + int ret; + + WARN_ON_ONCE(smdata.num_threads != 1); + + local_irq_save(flags); + hard_irq_disable(); + ret = (*fn)(data); + local_irq_restore(flags); + + return ret; + } + /* Set the initial state and stop all online cpus. */ set_state(&smdata, STOPMACHINE_PREPARE); return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); -- cgit v1.2.3 From 4ff819515b203f937cc6c8a0215a37a68d1ee71f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Mon, 31 Oct 2011 17:11:18 -0700 Subject: watchdog: move watchdog_*_all_cpus under CONFIG_SYSCTL Fix compilation warnings for CONFIG_SYSCTL=n: fixed compilation warnings in case of disabled CONFIG_SYSCTL kernel/watchdog.c:483:13: warning: `watchdog_enable_all_cpus' defined but not used kernel/watchdog.c:500:13: warning: `watchdog_disable_all_cpus' defined but not used these functions are static and are used only in sysctl handler, so move them inside #ifdef CONFIG_SYSCTL too Signed-off-by: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d680381b0e9c..1d7bca7f4f52 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -481,6 +481,8 @@ static void watchdog_disable(int cpu) } } +/* sysctl functions */ +#ifdef CONFIG_SYSCTL static void watchdog_enable_all_cpus(void) { int cpu; @@ -510,8 +512,6 @@ static void watchdog_disable_all_cpus(void) } -/* sysctl functions */ -#ifdef CONFIG_SYSCTL /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ -- cgit v1.2.3 From 73efc0394e148d0e15583e13712637831f926720 Mon Sep 17 00:00:00 2001 From: Dan Ballard Date: Mon, 31 Oct 2011 17:11:20 -0700 Subject: kernel/sysctl.c: add cap_last_cap to /proc/sys/kernel Userspace needs to know the highest valid capability of the running kernel, which right now cannot reliably be retrieved from the header files only. The fact that this value cannot be determined properly right now creates various problems for libraries compiled on newer header files which are run on older kernels. They assume capabilities are available which actually aren't. libcap-ng is one example. And we ran into the same problem with systemd too. Now the capability is exported in /proc/sys/kernel/cap_last_cap. [akpm@linux-foundation.org: make cap_last_cap const, per Ulrich] Signed-off-by: Dan Ballard Cc: Randy Dunlap Cc: Ingo Molnar Cc: Lennart Poettering Cc: Kay Sievers Cc: Ulrich Drepper Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d2ecdcc8cdb..c49d66658ec0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -134,6 +135,7 @@ static int minolduid; static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; +static const int cap_last_cap = CAP_LAST_CAP; #ifdef CONFIG_INOTIFY_USER #include @@ -740,6 +742,13 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, + { + .procname = "cap_last_cap", + .data = (void *)&cap_last_cap, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, #if defined(CONFIG_LOCKUP_DETECTOR) { .procname = "watchdog", -- cgit v1.2.3 From 0eca6b7c78fd997e02bd9850e608102382b7822e Mon Sep 17 00:00:00 2001 From: Yanmin Zhang Date: Mon, 31 Oct 2011 17:11:25 -0700 Subject: printk: add module parameter ignore_loglevel to control ignore_loglevel We are enabling some power features on medfield. To test suspend-2-RAM conveniently, we need turn on/off ignore_loglevel frequently without rebooting. Add a module parameter, so users can change it by: /sys/module/printk/parameters/ignore_loglevel Signed-off-by: Yanmin Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index b7da18391c38..e62f949ec140 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -532,6 +532,9 @@ static int __init ignore_loglevel_setup(char *str) } early_param("ignore_loglevel", ignore_loglevel_setup); +module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" + "print all kernel messages to the console."); /* * Write out chars from start to end - 1 inclusive -- cgit v1.2.3 From 134620f7a865b3bc9e3d56d460603592b70ede21 Mon Sep 17 00:00:00 2001 From: Yanmin Zhang Date: Mon, 31 Oct 2011 17:11:27 -0700 Subject: printk: add console_suspend module parameter We are enabling some power features on medfield. To test suspend-2-RAM conveniently, we need turn on/off console_suspend_enabled frequently. Add a module parameter, so users could change it by: /sys/module/printk/parameters/console_suspend Signed-off-by: Yanmin Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index e62f949ec140..6d9dedd11450 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1111,6 +1111,10 @@ static int __init console_suspend_disable(char *str) return 1; } __setup("no_console_suspend", console_suspend_disable); +module_param_named(console_suspend, console_suspend_enabled, + bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(console_suspend, "suspend console during suspend" + " and hibernate operations"); /** * suspend_console - suspend the console subsystem -- cgit v1.2.3 From 48e41899e4a3592746e5263c14681bf5c1393563 Mon Sep 17 00:00:00 2001 From: William Douglas Date: Mon, 31 Oct 2011 17:11:29 -0700 Subject: printk: fix bounds checking for log_prefix Currently log_prefix is testing that the first character of the log level and facility is less than '0' and greater than '9' (which is always false). It should be testing to see if the character less than '0' or greater than '9' instead. This patch makes that change. The code being changed worked because strtoul bombs out (endp isn't updated) and 0 is returned anyway. Signed-off-by: William Douglas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 6d9dedd11450..286d2c7be52c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -595,7 +595,7 @@ static size_t log_prefix(const char *p, unsigned int *level, char *special) /* multi digit including the level and facility number */ char *endp = NULL; - if (p[1] < '0' && p[1] > '9') + if (p[1] < '0' || p[1] > '9') return 0; lev = (simple_strtoul(&p[1], &endp, 10) & 7); -- cgit v1.2.3 From ae29bc92da01a2e9d278a9a58c3b307d41cc0254 Mon Sep 17 00:00:00 2001 From: William Douglas Date: Mon, 31 Oct 2011 17:11:31 -0700 Subject: printk: remove bounds checking for log_prefix Currently log_prefix is testing that the first character of the log level and facility is less than '0' and greater than '9' (which is always false). Since the code being updated works because strtoul bombs out (endp isn't updated) and 0 is returned anyway just remove the check and don't change the behavior of the function. Signed-off-by: William Douglas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 286d2c7be52c..1455a0d4eedd 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -595,9 +595,6 @@ static size_t log_prefix(const char *p, unsigned int *level, char *special) /* multi digit including the level and facility number */ char *endp = NULL; - if (p[1] < '0' || p[1] > '9') - return 0; - lev = (simple_strtoul(&p[1], &endp, 10) & 7); if (endp == NULL || endp[0] != '>') return 0; -- cgit v1.2.3 From 50e1499f468fd74c6db95deb2e1e6bfee578ae70 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 31 Oct 2011 17:12:51 -0700 Subject: kgdb: follow rename pack_hex_byte() to hex_byte_pack() There is no functional change. Signed-off-by: Andy Shevchenko Acked-by: Jesper Nilsson Cc: David Howells Cc: Koichi Yasutake Cc: Jason Wessel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/debug/gdbstub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 34872482315e..c22d8c28ad84 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -217,7 +217,7 @@ void gdbstub_msg_write(const char *s, int len) /* Pack in hex chars */ for (i = 0; i < wcount; i++) - bufptr = pack_hex_byte(bufptr, s[i]); + bufptr = hex_byte_pack(bufptr, s[i]); *bufptr = '\0'; /* Move up */ @@ -249,7 +249,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count) if (err) return NULL; while (count > 0) { - buf = pack_hex_byte(buf, *tmp); + buf = hex_byte_pack(buf, *tmp); tmp++; count--; } @@ -411,14 +411,14 @@ static char *pack_threadid(char *pkt, unsigned char *id) limit = id + (BUF_THREAD_ID_SIZE / 2); while (id < limit) { if (!lzero || *id != 0) { - pkt = pack_hex_byte(pkt, *id); + pkt = hex_byte_pack(pkt, *id); lzero = 0; } id++; } if (lzero) - pkt = pack_hex_byte(pkt, 0); + pkt = hex_byte_pack(pkt, 0); return pkt; } @@ -486,7 +486,7 @@ static void gdb_cmd_status(struct kgdb_state *ks) dbg_remove_all_break(); remcom_out_buffer[0] = 'S'; - pack_hex_byte(&remcom_out_buffer[1], ks->signo); + hex_byte_pack(&remcom_out_buffer[1], ks->signo); } static void gdb_get_regs_helper(struct kgdb_state *ks) @@ -954,7 +954,7 @@ int gdb_serial_stub(struct kgdb_state *ks) /* Reply to host that an exception has occurred */ ptr = remcom_out_buffer; *ptr++ = 'T'; - ptr = pack_hex_byte(ptr, ks->signo); + ptr = hex_byte_pack(ptr, ks->signo); ptr += strlen(strcpy(ptr, "thread:")); int_to_threadref(thref, shadow_pid(current->pid)); ptr = pack_threadid(ptr, thref); -- cgit v1.2.3 From 33ef6b6984403a688189317ef46bb3caab3b70e0 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 2 Nov 2011 13:38:05 -0700 Subject: cgroups: more safe tasklist locking in cgroup_attach_proc Fix unstable tasklist locking in cgroup_attach_proc. According to this thread - https://lkml.org/lkml/2011/7/27/243 - RCU is not sufficient to guarantee the tasklist is stable w.r.t. de_thread and exit. Taking tasklist_lock for reading, instead of rcu_read_lock, ensures proper exclusion. Signed-off-by: Ben Blum Acked-by: Paul Menage Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: "Paul E. McKenney" Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 453100a4159d..64b0e73402df 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2027,7 +2027,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) goto out_free_group_list; /* prevent changes to the threadgroup list while we take a snapshot. */ - rcu_read_lock(); + read_lock(&tasklist_lock); if (!thread_group_leader(leader)) { /* * a race with de_thread from another thread's exec() may strip @@ -2036,7 +2036,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * throw this task away and try again (from cgroup_procs_write); * this is "double-double-toil-and-trouble-check locking". */ - rcu_read_unlock(); + read_unlock(&tasklist_lock); retval = -EAGAIN; goto out_free_group_list; } @@ -2057,7 +2057,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ group_size = i; - rcu_read_unlock(); + read_unlock(&tasklist_lock); /* * step 1: check that we can legitimately attach to the cgroup. -- cgit v1.2.3 From 77ceab8ea590d7dc6c8f055ce43dfebd74428107 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 2 Nov 2011 13:38:07 -0700 Subject: cgroups: don't attach task to subsystem if migration failed If a task has exited to the point it has called cgroup_exit() already, then we can't migrate it to another cgroup anymore. This can happen when we are attaching a task to a new cgroup between the call to ->can_attach_task() on subsystems and the migration that is eventually tried in cgroup_task_migrate(). In this case cgroup_task_migrate() returns -ESRCH and we don't want to attach the task to the subsystems because the attachment to the new cgroup itself failed. Fix this by only calling ->attach_task() on the subsystems if the cgroup migration succeeded. Reported-by: Oleg Nesterov Signed-off-by: Ben Blum Acked-by: Paul Menage Cc: Li Zefan Cc: Tejun Heo Signed-off-by: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 64b0e73402df..8386b21224ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2135,14 +2135,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) continue; - /* attach each task to each subsystem */ - for_each_subsys(root, ss) { - if (ss->attach_task) - ss->attach_task(cgrp, tsk); - } /* if the thread is PF_EXITING, it can just get skipped. */ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); - BUG_ON(retval != 0 && retval != -ESRCH); + if (retval == 0) { + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + } else { + BUG_ON(retval != -ESRCH); + } } /* nothing is sensitive to fork() after this point. */ -- cgit v1.2.3 From 89e8a244b97e48f1f30e898b6f32acca477f2a13 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 Nov 2011 13:38:39 -0700 Subject: cpusets: avoid looping when storing to mems_allowed if one node remains set {get,put}_mems_allowed() exist so that general kernel code may locklessly access a task's set of allowable nodes without having the chance that a concurrent write will cause the nodemask to be empty on configurations where MAX_NUMNODES > BITS_PER_LONG. This could incur a significant delay, however, especially in low memory conditions because the page allocator is blocking and reclaim requires get_mems_allowed() itself. It is not atypical to see writes to cpuset.mems take over 2 seconds to complete, for example. In low memory conditions, this is problematic because it's one of the most imporant times to change cpuset.mems in the first place! The only way a task's set of allowable nodes may change is through cpusets by writing to cpuset.mems and when attaching a task to a generic code is not reading the nodemask with get_mems_allowed() at the same time, and then clearing all the old nodes. This prevents the possibility that a reader will see an empty nodemask at the same time the writer is storing a new nodemask. If at least one node remains unchanged, though, it's possible to simply set all new nodes and then clear all the old nodes. Changing a task's nodemask is protected by cgroup_mutex so it's guaranteed that two threads are not changing the same task's nodemask at the same time, so the nodemask is guaranteed to be stored before another thread changes it and determines whether a node remains set or not. Signed-off-by: David Rientjes Cc: Miao Xie Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10131fdaff70..ed0ff443f036 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { + bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); + repeat: /* * Allow tasks that have access to memory reserves because they have @@ -963,7 +965,6 @@ repeat: nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* * ensure checking ->mems_allowed_change_disable after setting all new * allowed nodes. @@ -980,9 +981,11 @@ repeat: /* * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. + * for the read-side. No wait is necessary, however, if at least one + * node remains unchanged. */ - while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (masks_disjoint && + ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); -- cgit v1.2.3 From f1ecf06854a66ee663f4d4cf029c78cd62a15e04 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 2 Nov 2011 13:39:22 -0700 Subject: sysctl: add support for poll() Adding support for poll() in sysctl fs allows userspace to receive notifications of changes in sysctl entries. This adds a infrastructure to allow files in sysctl fs to be pollable and implements it for hostname and domainname. [akpm@linux-foundation.org: s/declare/define/ for definitions] Signed-off-by: Lucas De Marchi Cc: Greg KH Cc: Kay Sievers Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 ++ kernel/utsname_sysctl.c | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 58459509b14c..d06c091e0345 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1286,6 +1286,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } + uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1336,6 +1337,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } + uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index a2cd77e70d4d..3b0d48ebf81d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -13,6 +13,7 @@ #include #include #include +#include static void *get_uts(ctl_table *table, int write) { @@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write, uts_table.data = get_uts(table, write); r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); + + if (write) + proc_sys_poll_notify(table->poll); + return r; } #else #define proc_do_uts_string NULL #endif +static DEFINE_CTL_TABLE_POLL(hostname_poll); +static DEFINE_CTL_TABLE_POLL(domainname_poll); + static struct ctl_table uts_kern_table[] = { { .procname = "ostype", @@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &hostname_poll, }, { .procname = "domainname", @@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &domainname_poll, }, {} }; @@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = { {} }; +#ifdef CONFIG_PROC_SYSCTL +/* + * Notify userspace about a change in a certain entry of uts_kern_table, + * identified by the parameter proc. + */ +void uts_proc_notify(enum uts_proc proc) +{ + struct ctl_table *table = &uts_kern_table[proc]; + + proc_sys_poll_notify(table->poll); +} +#endif + static int __init utsname_sysctl_init(void) { register_sysctl_table(uts_root_table); -- cgit v1.2.3 From c1e2ee2dc436574880758b3836fc96935b774c32 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Wed, 2 Nov 2011 13:40:29 -0700 Subject: memcg: replace ss->id_lock with a rwlock While back-porting Johannes Weiner's patch "mm: memcg-aware global reclaim" for an internal effort, we noticed a significant performance regression during page-reclaim heavy workloads due to high contention of the ss->id_lock. This lock protects idr map, and serializes calls to idr_get_next() in css_get_next() (which is used during the memcg hierarchy walk). Since idr_get_next() is just doing a look up, we need only serialize it with respect to idr_remove()/idr_get_new(). By making the ss->id_lock a rwlock, contention is greatly reduced and performance improves. Tested: cat a 256m file from a ramdisk in a 128m container 50 times on each core (one file + container per core) in parallel on a NUMA machine. Result is the time for the test to complete in 1 of the containers. Both kernels included Johannes' memcg-aware global reclaim patches. Before rwlock patch: 1710.778s After rwlock patch: 152.227s Signed-off-by: Andrew Bresticker Cc: Paul Menage Cc: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8386b21224ef..d9d5648f3cdc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4883,9 +4883,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) rcu_assign_pointer(id->css, NULL); rcu_assign_pointer(css->id, NULL); - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4911,10 +4911,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) error = -ENOMEM; goto err_out; } - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ error = idr_get_new_above(&ss->idr, newid, 1, &myid); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); /* Returns error when there are no free spaces for new ID.*/ if (error) { @@ -4929,9 +4929,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) return newid; remove_idr: error = -ENOSPC; - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); idr_remove(&ss->idr, myid); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); err_out: kfree(newid); return ERR_PTR(error); @@ -4943,7 +4943,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, { struct css_id *newid; - spin_lock_init(&ss->id_lock); + rwlock_init(&ss->id_lock); idr_init(&ss->idr); newid = get_new_cssid(ss, 0); @@ -5038,9 +5038,9 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - spin_lock(&ss->id_lock); + read_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - spin_unlock(&ss->id_lock); + read_unlock(&ss->id_lock); if (!tmp) break; -- cgit v1.2.3 From 4536e4d1d21c8172402a2217b0fa1880665ace36 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 3 Nov 2011 07:44:04 -0700 Subject: Revert "perf: Add PM notifiers to fix CPU hotplug races" This reverts commit 144060fee07e9c22e179d00819c83c86fbcbf82c. It causes a resume regression for Andi on his Acer Aspire 1830T post 3.1. The screen just stays black after wakeup. Also, it really looks like the wrong way to suspend and resume perf events: I think they should be done as part of the CPU suspend and resume, rather than as a notifier that does smp_call_function(). Reported-by: Andi Kleen Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- kernel/events/core.c | 97 ++-------------------------------------------------- 1 file changed, 2 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 12a0287e0358..e1253faa34dd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -6853,7 +6852,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0 && !swhash->swevent_hlist) { + if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); @@ -6942,14 +6941,7 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; - /* - * Ignore suspend/resume action, the perf_pm_notifier will - * take care of that. - */ - if (action & CPU_TASKS_FROZEN) - return NOTIFY_OK; - - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: case CPU_DOWN_FAILED: @@ -6968,90 +6960,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } -static void perf_pm_resume_cpu(void *unused) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - struct pmu *pmu; - int idx; - - idx = srcu_read_lock(&pmus_srcu); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - ctx = cpuctx->task_ctx; - - perf_ctx_lock(cpuctx, ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, ctx); - } - srcu_read_unlock(&pmus_srcu, idx); -} - -static void perf_pm_suspend_cpu(void *unused) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - struct pmu *pmu; - int idx; - - idx = srcu_read_lock(&pmus_srcu); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - ctx = cpuctx->task_ctx; - - perf_ctx_lock(cpuctx, ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - perf_event_sched_in(cpuctx, ctx, current); - - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, ctx); - } - srcu_read_unlock(&pmus_srcu, idx); -} - -static int perf_resume(void) -{ - get_online_cpus(); - smp_call_function(perf_pm_resume_cpu, NULL, 1); - put_online_cpus(); - - return NOTIFY_OK; -} - -static int perf_suspend(void) -{ - get_online_cpus(); - smp_call_function(perf_pm_suspend_cpu, NULL, 1); - put_online_cpus(); - - return NOTIFY_OK; -} - -static int perf_pm(struct notifier_block *self, unsigned long action, void *ptr) -{ - switch (action) { - case PM_POST_HIBERNATION: - case PM_POST_SUSPEND: - return perf_resume(); - case PM_HIBERNATION_PREPARE: - case PM_SUSPEND_PREPARE: - return perf_suspend(); - default: - return NOTIFY_DONE; - } -} - -static struct notifier_block perf_pm_notifier = { - .notifier_call = perf_pm, -}; - void __init perf_event_init(void) { int ret; @@ -7066,7 +6974,6 @@ void __init perf_event_init(void) perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); register_reboot_notifier(&perf_reboot_notifier); - register_pm_notifier(&perf_pm_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); -- cgit v1.2.3 From 79cfbdfa87e84992d509e6c1648a18e1d7e68c20 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 3 Nov 2011 00:59:25 +0100 Subject: PM / Sleep: Fix race between CPU hotplug and freezer The CPU hotplug notifications sent out by the _cpu_up() and _cpu_down() functions depend on the value of the 'tasks_frozen' argument passed to them (which indicates whether tasks have been frozen or not). (Examples for such CPU hotplug notifications: CPU_ONLINE, CPU_ONLINE_FROZEN, CPU_DEAD, CPU_DEAD_FROZEN). Thus, it is essential that while the callbacks for those notifications are running, the state of the system with respect to the tasks being frozen or not remains unchanged, *throughout that duration*. Hence there is a need for synchronizing the CPU hotplug code with the freezer subsystem. Since the freezer is involved only in the Suspend/Hibernate call paths, this patch hooks the CPU hotplug code to the suspend/hibernate notifiers PM_[SUSPEND|HIBERNATE]_PREPARE and PM_POST_[SUSPEND|HIBERNATE] to prevent the race between CPU hotplug and freezer, thus ensuring that CPU hotplug notifications will always be run with the state of the system really being what the notifications indicate, _throughout_ their execution time. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/cpu.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 12b7458f23b1..aa39dd7a3846 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -476,6 +477,79 @@ static int alloc_frozen_cpus(void) return 0; } core_initcall(alloc_frozen_cpus); + +/* + * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU + * hotplug when tasks are about to be frozen. Also, don't allow the freezer + * to continue until any currently running CPU hotplug operation gets + * completed. + * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the + * 'cpu_add_remove_lock'. And this same lock is also taken by the regular + * CPU hotplug path and released only after it is complete. Thus, we + * (and hence the freezer) will block here until any currently running CPU + * hotplug operation gets completed. + */ +void cpu_hotplug_disable_before_freeze(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + + +/* + * When tasks have been thawed, re-enable regular CPU hotplug (which had been + * disabled while beginning to freeze tasks). + */ +void cpu_hotplug_enable_after_thaw(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + +/* + * When callbacks for CPU hotplug notifications are being executed, we must + * ensure that the state of the system with respect to the tasks being frozen + * or not, as reported by the notification, remains unchanged *throughout the + * duration* of the execution of the callbacks. + * Hence we need to prevent the freezer from racing with regular CPU hotplug. + * + * This synchronization is implemented by mutually excluding regular CPU + * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/ + * Hibernate notifications. + */ +static int +cpu_hotplug_pm_callback(struct notifier_block *nb, + unsigned long action, void *ptr) +{ + switch (action) { + + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + cpu_hotplug_disable_before_freeze(); + break; + + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + cpu_hotplug_enable_after_thaw(); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + + +int cpu_hotplug_pm_sync_init(void) +{ + pm_notifier(cpu_hotplug_pm_callback, 0); + return 0; +} +core_initcall(cpu_hotplug_pm_sync_init); + #endif /* CONFIG_PM_SLEEP_SMP */ /** -- cgit v1.2.3 From 6513fd6972f725291ee8ce62c7a39fb8a6c7391e Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Thu, 3 Nov 2011 10:12:36 +0100 Subject: PM / QoS: Remove redundant check Remove an "if" check, that repeats an equivalent one 6 lines above. Signed-off-by: Guennadi Liakhovetski Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 1c1797dd1d1d..5167d996cd02 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -386,8 +386,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); filp->private_data = req; - if (filp->private_data) - return 0; + return 0; } return -EPERM; } -- cgit v1.2.3 From d6cc76856d353a3a9c43bead33210b9216dce332 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 4 Nov 2011 01:04:52 +0100 Subject: PM / Freezer: Revert 27920651fe "PM / Freezer: Make fake_signal_wake_up() wake TASK_KILLABLE tasks too" Commit 27920651fe "PM / Freezer: Make fake_signal_wake_up() wake TASK_KILLABLE tasks too" updated fake_signal_wake_up() used by freezer to wake up KILLABLE tasks. Sending unsolicited wakeups to tasks in killable sleep is dangerous as there are code paths which depend on tasks not waking up spuriously from KILLABLE sleep. For example. sys_read() or page can sleep in TASK_KILLABLE assuming that wait/down/whatever _killable can only fail if we can not return to the usermode. TASK_TRACED is another obvious example. The previous patch updated wait_event_freezekillable() such that it doesn't depend on the spurious wakeup. This patch reverts the offending commit. Note that the spurious KILLABLE wakeup had other implicit effects in KILLABLE sleeps in nfs and cifs and those will need further updates to regain freezekillable behavior. Signed-off-by: Tejun Heo Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 66a594e8ad2f..7b01de98bb6a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -67,7 +67,7 @@ static void fake_signal_wake_up(struct task_struct *p) unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 1); + signal_wake_up(p, 0); spin_unlock_irqrestore(&p->sighand->siglock, flags); } -- cgit v1.2.3 From 1cd0d6c3021c8d76641b37203f504634b87fbabc Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 1 Nov 2011 03:59:33 +0000 Subject: module: Enable dynamic debugging regardless of taint Dynamic debugging is currently disabled for tainted modules, except for TAINT_CRAP. This prevents use of dynamic debugging for out-of-tree modules once the next patch is applied. This condition was apparently intended to avoid a crash if a force- loaded module has an incompatible definition of dynamic debug structures. However, a administrator that forces us to load a module is claiming that it *is* compatible even though it fails our version checks. If they are mistaken, there are any number of ways the module could crash the system. As a side-effect, proprietary and other tainted modules can now use dynamic_debug. Signed-off-by: Ben Hutchings Acked-by: Mathieu Desnoyers Signed-off-by: Rusty Russell --- kernel/module.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 93342d992f34..3c5509642847 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2878,8 +2878,7 @@ static struct module *load_module(void __user *umod, } /* This has to be done once we're sure module name is unique. */ - if (!mod->taints || mod->taints == (1U<taints || mod->taints == (1U< Date: Mon, 24 Oct 2011 15:12:28 +0200 Subject: module,bug: Add TAINT_OOT_MODULE flag for modules not built in-tree Use of the GPL or a compatible licence doesn't necessarily make the code any good. We already consider staging modules to be suspect, and this should also be true for out-of-tree modules which may receive very little review. Signed-off-by: Ben Hutchings Reviewed-by: Dave Jones Acked-by: Greg Kroah-Hartman Signed-off-by: Rusty Russell (patched oops-tracing.txt) --- kernel/module.c | 5 +++++ kernel/panic.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3c5509642847..ef8cb70c6996 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2487,6 +2487,9 @@ static int check_modinfo(struct module *mod, struct load_info *info) return -ENOEXEC; } + if (!get_modinfo(info, "intree")) + add_taint_module(mod, TAINT_OOT_MODULE); + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP); printk(KERN_WARNING "%s: module is from the staging directory," @@ -3255,6 +3258,8 @@ static char *module_flags(struct module *mod, char *buf) buf[bx++] = '('; if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) buf[bx++] = 'P'; + else if (mod->taints & (1 << TAINT_OOT_MODULE)) + buf[bx++] = 'O'; if (mod->taints & (1 << TAINT_FORCED_MODULE)) buf[bx++] = 'F'; if (mod->taints & (1 << TAINT_CRAP)) diff --git a/kernel/panic.c b/kernel/panic.c index d7bb6974efb5..b26593604214 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -177,6 +177,7 @@ static const struct tnt tnts[] = { { TAINT_WARN, 'W', ' ' }, { TAINT_CRAP, 'C', ' ' }, { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, + { TAINT_OOT_MODULE, 'O', ' ' }, }; /** @@ -194,6 +195,7 @@ static const struct tnt tnts[] = { * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. * 'I' - Working around severe firmware bug. + * 'O' - Out-of-tree module has been loaded. * * The string is overwritten by the next call to print_tainted(). */ -- cgit v1.2.3 From a6f05b97d1ba87326bd96f3da9fef994830d6994 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 6 Nov 2011 21:54:12 +0100 Subject: PM / QoS: Set cpu_dma_pm_qos->name Since commit 4a31a334, the name of this misc device is not initialized, which leads to a funny device named /dev/(null) being created and /proc/misc containing an entry with just a number but no name. The latter leads to complaints by cryptsetup, which caused me to investigate this matter. Signed-off-by: Dominik Brodowski Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 56db75147186..995e3bd3417b 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -70,6 +70,7 @@ static struct pm_qos_constraints cpu_dma_constraints = { }; static struct pm_qos_object cpu_dma_pm_qos = { .constraints = &cpu_dma_constraints, + .name = "cpu_dma_latency", }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); -- cgit v1.2.3