From 0782e63bc6fe7e2d3408d250df11d388b7799c6b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 May 2015 19:49:49 +0200 Subject: sched: Handle priority boosted tasks proper in setscheduler() Ronny reported that the following scenario is not handled correctly: T1 (prio = 10) lock(rtmutex); T2 (prio = 20) lock(rtmutex) boost T1 T1 (prio = 20) sys_set_scheduler(prio = 30) T1 prio = 30 .... sys_set_scheduler(prio = 10) T1 prio = 30 The last step is wrong as T1 should now be back at prio 20. Commit c365c292d059 ("sched: Consider pi boosting in setscheduler()") only handles the case where a boosted tasks tries to lower its priority. Fix it by taking the new effective priority into account for the decision whether a change of the priority is required. Reported-by: Ronny Meeus Tested-by: Steven Rostedt Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt Cc: Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Mike Galbraith Fixes: c365c292d059 ("sched: Consider pi boosting in setscheduler()") Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1505051806060.4225@nanos Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 12 +++++++----- kernel/sched/core.c | 26 ++++++++++++++------------ 2 files changed, 21 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b73279367087..b025295f4966 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -265,15 +265,17 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) } /* - * Called by sched_setscheduler() to check whether the priority change - * is overruled by a possible priority boosting. + * Called by sched_setscheduler() to get the priority which will be + * effective after the change. */ -int rt_mutex_check_prio(struct task_struct *task, int newprio) +int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) { if (!task_has_pi_waiters(task)) - return 0; + return newprio; - return task_top_pi_waiter(task)->task->prio <= newprio; + if (task_top_pi_waiter(task)->task->prio <= newprio) + return task_top_pi_waiter(task)->task->prio; + return newprio; } /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe22f7510bce..34db9bf892a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3300,15 +3300,18 @@ static void __setscheduler_params(struct task_struct *p, /* Actually do priority change: must hold pi & rq lock. */ static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr) + const struct sched_attr *attr, bool keep_boost) { __setscheduler_params(p, attr); /* - * If we get here, there was no pi waiters boosting the - * task. It is safe to use the normal prio. + * Keep a potential priority boosting if called from + * sched_setscheduler(). */ - p->prio = normal_prio(p); + if (keep_boost) + p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); + else + p->prio = normal_prio(p); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -3408,7 +3411,7 @@ static int __sched_setscheduler(struct task_struct *p, int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, queued, running; - int policy = attr->sched_policy; + int new_effective_prio, policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; struct rq *rq; @@ -3590,15 +3593,14 @@ change: oldprio = p->prio; /* - * Special case for priority boosted tasks. - * - * If the new priority is lower or equal (user space view) - * than the current (boosted) priority, we just store the new + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new * normal parameters and do not touch the scheduler class and * the runqueue. This will be done when the task deboost * itself. */ - if (rt_mutex_check_prio(p, newprio)) { + new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + if (new_effective_prio == oldprio) { __setscheduler_params(p, attr); task_rq_unlock(rq, p, &flags); return 0; @@ -3612,7 +3614,7 @@ change: put_prev_task(rq, p); prev_class = p->sched_class; - __setscheduler(rq, p, attr); + __setscheduler(rq, p, attr, true); if (running) p->sched_class->set_curr_task(rq); @@ -7346,7 +7348,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) queued = task_on_rq_queued(p); if (queued) dequeue_task(rq, p, 0); - __setscheduler(rq, p, &attr); + __setscheduler(rq, p, &attr, false); if (queued) { enqueue_task(rq, p, 0); resched_curr(rq); -- cgit v1.2.3 From 533445c6e53368569e50ab3fb712230c03d523f3 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 4 May 2015 03:09:36 -0700 Subject: sched/core: Fix regression in cpuset_cpu_inactive() for suspend Commit 3c18d447b3b3 ("sched/core: Check for available DL bandwidth in cpuset_cpu_inactive()"), a SCHED_DEADLINE bugfix, had a logic error that caused a regression in setting a CPU inactive during suspend. I ran into this when a program was failing pthread_setaffinity_np() with EINVAL after a suspend+wake up. A simple reproducer: $ ./a.out sched_setaffinity: Success $ systemctl suspend $ ./a.out sched_setaffinity: Invalid argument ... where ./a.out is: #define _GNU_SOURCE #include #include #include #include #include #include int main(void) { long num_cores; cpu_set_t cpu_set; int ret; num_cores = sysconf(_SC_NPROCESSORS_ONLN); CPU_ZERO(&cpu_set); CPU_SET(num_cores - 1, &cpu_set); errno = 0; ret = sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set); perror("sched_setaffinity"); return ret ? EXIT_FAILURE : EXIT_SUCCESS; } The mistake is that suspend is handled in the action == CPU_DOWN_PREPARE_FROZEN case of the switch statement in cpuset_cpu_inactive(). However, the commit in question masked out CPU_TASKS_FROZEN from the action, making this case dead. The fix is straightforward. Signed-off-by: Omar Sandoval Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Juri Lelli Cc: Thomas Gleixner Fixes: 3c18d447b3b3 ("sched/core: Check for available DL bandwidth in cpuset_cpu_inactive()") Link: http://lkml.kernel.org/r/1cb5ecb3d6543c38cce5790387f336f54ec8e2bc.1430733960.git.osandov@osandov.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 34db9bf892a3..57bd333bc4ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6999,27 +6999,23 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, unsigned long flags; long cpu = (long)hcpu; struct dl_bw *dl_b; + bool overflow; + int cpus; - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: - /* explicitly allow suspend */ - if (!(action & CPU_TASKS_FROZEN)) { - bool overflow; - int cpus; - - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); - raw_spin_unlock_irqrestore(&dl_b->lock, flags); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); + rcu_read_unlock_sched(); - if (overflow) - return notifier_from_errno(-EBUSY); - } + if (overflow) + return notifier_from_errno(-EBUSY); cpuset_update_active_cpus(false); break; case CPU_DOWN_PREPARE_FROZEN: -- cgit v1.2.3 From 8b10c5e2b59ef2a80a07ab594a3b4987a4676211 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2015 16:08:46 +0200 Subject: perf: Annotate inherited event ctx->mutex recursion While fuzzing Sasha tripped over another ctx->mutex recursion lockdep splat. Annotate this. Reported-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 81aa3a4ece9f..1a3bf48743ce 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -913,10 +913,30 @@ static void put_ctx(struct perf_event_context *ctx) * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * - * Lock ordering is by mutex address. There is one other site where - * perf_event_context::mutex nests and that is put_event(). But remember that - * that is a parent<->child context relation, and migration does not affect - * children, therefore these two orderings should not interact. + * Lock ordering is by mutex address. There are two other sites where + * perf_event_context::mutex nests and those are: + * + * - perf_event_exit_task_context() [ child , 0 ] + * __perf_event_exit_task() + * sync_child_event() + * put_event() [ parent, 1 ] + * + * - perf_event_init_context() [ parent, 0 ] + * inherit_task_group() + * inherit_group() + * inherit_event() + * perf_event_alloc() + * perf_init_event() + * perf_try_init_event() [ child , 1 ] + * + * While it appears there is an obvious deadlock here -- the parent and child + * nesting levels are inverted between the two. This is in fact safe because + * life-time rules separate them. That is an exiting task cannot fork, and a + * spawning task cannot (yet) exit. + * + * But remember that that these are parent<->child context relations, and + * migration does not affect children, therefore these two orderings should not + * interact. * * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break @@ -3657,9 +3677,6 @@ static void perf_remove_from_owner(struct perf_event *event) } } -/* - * Called when the last reference to the file is gone. - */ static void put_event(struct perf_event *event) { struct perf_event_context *ctx; @@ -3697,6 +3714,9 @@ int perf_event_release_kernel(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_release_kernel); +/* + * Called when the last reference to the file is gone. + */ static int perf_release(struct inode *inode, struct file *file) { put_event(file->private_data); @@ -7364,7 +7384,12 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) return -ENODEV; if (event->group_leader != event) { - ctx = perf_event_ctx_lock(event->group_leader); + /* + * This ctx->mutex can nest when we're called through + * inheritance. See the perf_event_ctx_lock_nested() comment. + */ + ctx = perf_event_ctx_lock_nested(event->group_leader, + SINGLE_DEPTH_NESTING); BUG_ON(!ctx); } -- cgit v1.2.3 From 37815bf866ab6722a47550f8d25ad3f1a16a680c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 9 May 2015 03:06:23 +0930 Subject: module: Call module notifier on failure after complete_formation() The module notifier call chain for MODULE_STATE_COMING was moved up before the parsing of args, into the complete_formation() call. But if the module failed to load after that, the notifier call chain for MODULE_STATE_GOING was never called and that prevented the users of those call chains from cleaning up anything that was allocated. Link: http://lkml.kernel.org/r/554C52B9.9060700@gmail.com Reported-by: Pontus Fuchs Fixes: 4982223e51e8 "module: set nx before marking module MODULE_STATE_COMING" Cc: stable@vger.kernel.org # 3.16+ Signed-off-by: Steven Rostedt Signed-off-by: Rusty Russell --- kernel/module.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 42a1d2afb217..cfc9e843a924 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3370,6 +3370,9 @@ static int load_module(struct load_info *info, const char __user *uargs, module_bug_cleanup(mod); mutex_unlock(&module_mutex); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + /* we can't deallocate the module until we clear memory protection */ unset_module_init_ro_nx(mod); unset_module_core_ro_nx(mod); -- cgit v1.2.3 From f7bcb70ebae0dcdb5a2d859b09e4465784d99029 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 8 May 2015 13:47:23 -0700 Subject: ktime: Fix ktime_divns to do signed division It was noted that the 32bit implementation of ktime_divns() was doing unsigned division and didn't properly handle negative values. And when a ktime helper was changed to utilize ktime_divns, it caused a regression on some IR blasters. See the following bugzilla for details: https://bugzilla.redhat.com/show_bug.cgi?id=1200353 This patch fixes the problem in ktime_divns by checking and preserving the sign bit, and then reapplying it if appropriate after the division, it also changes the return type to a s64 to make it more obvious this is expected. Nicolas also pointed out that negative dividers would cause infinite loops on 32bit systems, negative dividers is unlikely for users of this function, but out of caution this patch adds checks for negative dividers for both 32-bit (BUG_ON) and 64-bit(WARN_ON) versions to make sure no such use cases creep in. [ tglx: Hand an u64 to do_div() to avoid the compiler warning ] Fixes: 166afb64511e 'ktime: Sanitize ktime_to_us/ms conversion' Reported-and-tested-by: Trevor Cordes Signed-off-by: John Stultz Acked-by: Nicolas Pitre Cc: Ingo Molnar Cc: Josh Boyer Cc: One Thousand Gnomes Cc: Link: http://lkml.kernel.org/r/1431118043-23452-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 76d4bd962b19..93ef7190bdea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -266,21 +266,23 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /* * Divide a ktime value by a nanosecond value */ -u64 __ktime_divns(const ktime_t kt, s64 div) +s64 __ktime_divns(const ktime_t kt, s64 div) { - u64 dclc; int sft = 0; + s64 dclc; + u64 tmp; dclc = ktime_to_ns(kt); + tmp = dclc < 0 ? -dclc : dclc; + /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } - dclc >>= sft; - do_div(dclc, (unsigned long) div); - - return dclc; + tmp >>= sft; + do_div(tmp, (unsigned long) div); + return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ -- cgit v1.2.3 From ab992dc38f9ae40b3ab996d68449692d464c98cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 May 2015 11:31:50 +0200 Subject: watchdog: Fix merge 'conflict' Two watchdog changes that came through different trees had a non conflicting conflict, that is, one changed the semantics of a variable but no actual code conflict happened. So the merge appeared fine, but the resulting code did not behave as expected. Commit 195daf665a62 ("watchdog: enable the new user interface of the watchdog mechanism") changes the semantics of watchdog_user_enabled, which thereafter is only used by the functions introduced by b3738d293233 ("watchdog: Add watchdog enable/disable all functions"). There further appears to be a distinct lack of serialization between setting and using watchdog_enabled, so perhaps we should wrap the {en,dis}able_all() things in watchdog_proc_mutex. This patch fixes a s2r failure reported by Michal; which I cannot readily explain. But this does make the code internally consistent again. Reported-and-tested-by: Michal Hocko Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 2316f50b07a4..506edcc500c4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -41,6 +41,8 @@ #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) +static DEFINE_MUTEX(watchdog_proc_mutex); + #ifdef CONFIG_HARDLOCKUP_DETECTOR static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; #else @@ -608,26 +610,36 @@ void watchdog_nmi_enable_all(void) { int cpu; - if (!watchdog_user_enabled) - return; + mutex_lock(&watchdog_proc_mutex); + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto unlock; get_online_cpus(); for_each_online_cpu(cpu) watchdog_nmi_enable(cpu); put_online_cpus(); + +unlock: + mutex_lock(&watchdog_proc_mutex); } void watchdog_nmi_disable_all(void) { int cpu; + mutex_lock(&watchdog_proc_mutex); + if (!watchdog_running) - return; + goto unlock; get_online_cpus(); for_each_online_cpu(cpu) watchdog_nmi_disable(cpu); put_online_cpus(); + +unlock: + mutex_unlock(&watchdog_proc_mutex); } #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } @@ -744,8 +756,6 @@ static int proc_watchdog_update(void) } -static DEFINE_MUTEX(watchdog_proc_mutex); - /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * -- cgit v1.2.3 From 10d784eae2b41e25d8fc6a88096cd27286093c84 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:29 -0700 Subject: sched: always use blk_schedule_flush_plug in io_schedule_out block plug callback could sleep, so we introduce a parameter 'from_schedule' and corresponding drivers can use it to destinguish a schedule plug flush or a plug finish. Unfortunately io_schedule_out still uses blk_flush_plug(). This causes below output (Note, I added a might_sleep() in raid1_unplug to make it trigger faster, but the whole thing doesn't matter if I add might_sleep). In raid1/10, this can cause deadlock. This patch makes io_schedule_out always uses blk_schedule_flush_plug. This should only impact drivers (as far as I know, raid 1/10) which are sensitive to the 'from_schedule' parameter. [ 370.817949] ------------[ cut here ]------------ [ 370.817960] WARNING: CPU: 7 PID: 145 at ../kernel/sched/core.c:7306 __might_sleep+0x7f/0x90() [ 370.817969] do not call blocking ops when !TASK_RUNNING; state=2 set at [] prepare_to_wait+0x2f/0x90 [ 370.817971] Modules linked in: raid1 [ 370.817976] CPU: 7 PID: 145 Comm: kworker/u16:9 Tainted: G W 4.0.0+ #361 [ 370.817977] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153802- 04/01/2014 [ 370.817983] Workqueue: writeback bdi_writeback_workfn (flush-9:1) [ 370.817985] ffffffff81cd83be ffff8800ba8cb298 ffffffff819dd7af 0000000000000001 [ 370.817988] ffff8800ba8cb2e8 ffff8800ba8cb2d8 ffffffff81051afc ffff8800ba8cb2c8 [ 370.817990] ffffffffa00061a8 000000000000041e 0000000000000000 ffff8800ba8cba28 [ 370.817993] Call Trace: [ 370.817999] [] dump_stack+0x4f/0x7b [ 370.818002] [] warn_slowpath_common+0x8c/0xd0 [ 370.818004] [] warn_slowpath_fmt+0x46/0x50 [ 370.818006] [] ? prepare_to_wait+0x2f/0x90 [ 370.818008] [] ? prepare_to_wait+0x2f/0x90 [ 370.818010] [] __might_sleep+0x7f/0x90 [ 370.818014] [] raid1_unplug+0xd3/0x170 [raid1] [ 370.818024] [] blk_flush_plug_list+0x8a/0x1e0 [ 370.818028] [] ? bit_wait+0x50/0x50 [ 370.818031] [] io_schedule_timeout+0x130/0x140 [ 370.818033] [] bit_wait_io+0x36/0x50 [ 370.818034] [] __wait_on_bit+0x65/0x90 [ 370.818041] [] ? ext4_read_block_bitmap_nowait+0xbc/0x630 [ 370.818043] [] ? bit_wait+0x50/0x50 [ 370.818045] [] out_of_line_wait_on_bit+0x72/0x80 [ 370.818047] [] ? autoremove_wake_function+0x40/0x40 [ 370.818050] [] __wait_on_buffer+0x44/0x50 [ 370.818053] [] ext4_wait_block_bitmap+0xe0/0xf0 [ 370.818058] [] ext4_mb_init_cache+0x206/0x790 [ 370.818062] [] ? lru_cache_add+0x1c/0x50 [ 370.818064] [] ext4_mb_init_group+0x11e/0x200 [ 370.818066] [] ext4_mb_load_buddy+0x341/0x360 [ 370.818068] [] ext4_mb_find_by_goal+0x93/0x2f0 [ 370.818070] [] ? ext4_mb_normalize_request+0x1e4/0x5b0 [ 370.818072] [] ext4_mb_regular_allocator+0x67/0x460 [ 370.818074] [] ? ext4_mb_normalize_request+0x1e4/0x5b0 [ 370.818076] [] ext4_mb_new_blocks+0x4cb/0x620 [ 370.818079] [] ext4_ext_map_blocks+0x4c6/0x14d0 [ 370.818081] [] ? ext4_es_lookup_extent+0x4e/0x290 [ 370.818085] [] ext4_map_blocks+0x14d/0x4f0 [ 370.818088] [] ext4_writepages+0x76d/0xe50 [ 370.818094] [] do_writepages+0x21/0x50 [ 370.818097] [] __writeback_single_inode+0x60/0x490 [ 370.818099] [] writeback_sb_inodes+0x2da/0x590 [ 370.818103] [] ? trylock_super+0x1b/0x50 [ 370.818105] [] ? trylock_super+0x1b/0x50 [ 370.818107] [] __writeback_inodes_wb+0x9f/0xd0 [ 370.818109] [] wb_writeback+0x34b/0x3c0 [ 370.818111] [] bdi_writeback_workfn+0x23f/0x550 [ 370.818116] [] process_one_work+0x1c8/0x570 [ 370.818117] [] ? process_one_work+0x14b/0x570 [ 370.818119] [] worker_thread+0x11b/0x470 [ 370.818121] [] ? process_one_work+0x570/0x570 [ 370.818124] [] kthread+0xf8/0x110 [ 370.818126] [] ? kthread_create_on_node+0x210/0x210 [ 370.818129] [] ret_from_fork+0x42/0x70 [ 370.818131] [] ? kthread_create_on_node+0x210/0x210 [ 370.818132] ---[ end trace 7b4deb71e68b6605 ]--- V2: don't change ->in_iowait Cc: NeilBrown Signed-off-by: Shaohua Li Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/sched/core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe22f7510bce..cfeebb499e79 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4387,10 +4387,7 @@ long __sched io_schedule_timeout(long timeout) long ret; current->in_iowait = 1; - if (old_iowait) - blk_schedule_flush_plug(current); - else - blk_flush_plug(current); + blk_schedule_flush_plug(current); delayacct_blkio_start(); rq = raw_rq(); -- cgit v1.2.3 From 1173ff09b9c57be8248427b7be161f7599dccd6b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 19 May 2015 09:07:27 +0200 Subject: watchdog: fix double lock in watchdog_nmi_enable_all Commit ab992dc38f9a ("watchdog: Fix merge 'conflict'") has introduced an obvious deadlock because of a typo. watchdog_proc_mutex should be unlocked on exit. Thanks to Miroslav Benes who was staring at the code with me and noticed this. Signed-off-by: Michal Hocko Duh-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 506edcc500c4..581a68a04c64 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -621,7 +621,7 @@ void watchdog_nmi_enable_all(void) put_online_cpus(); unlock: - mutex_lock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_proc_mutex); } void watchdog_nmi_disable_all(void) -- cgit v1.2.3 From dead9f29ddcc69551f35529a252d2704047870d3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 15 May 2015 12:15:21 -0700 Subject: perf: Fix race in BPF program unregister there is a race between perf_event_free_bpf_prog() and free_trace_kprobe(): __free_event() event->destroy(event) tp_perf_event_destroy() perf_trace_destroy() perf_trace_event_unreg() which is dropping event->tp_event->perf_refcount and allows to proceed in: unregister_trace_kprobe() unregister_kprobe_event() trace_remove_event_call() probe_remove_event_call() free_trace_kprobe() while __free_event does: call_rcu(&event->rcu_head, free_event_rcu); free_event_rcu() perf_event_free_bpf_prog() To fix the race simply move perf_event_free_bpf_prog() before event->destroy(), since event->tp_event is still valid at that point. Note, perf_trace_destroy() is not racing with trace_remove_event_call() since they both grab event_mutex. Reported-by: Wang Nan Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: lizefan@huawei.com Cc: pi3orama@163.com Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Link: http://lkml.kernel.org/r/1431717321-28772-1-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1a3bf48743ce..eddf1ed4155e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3442,7 +3442,6 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); - perf_event_free_bpf_prog(event); kfree(event); } @@ -3573,6 +3572,8 @@ static void __free_event(struct perf_event *event) put_callchain_buffers(); } + perf_event_free_bpf_prog(event); + if (event->destroy) event->destroy(event); -- cgit v1.2.3 From aa319bcd366349c6f72fcd331da89d3d06090651 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 22 May 2015 18:30:20 +0300 Subject: perf: Disallow sparse AUX allocations for non-SG PMUs in overwrite mode PMUs that don't support hardware scatter tables require big contiguous chunks of memory and a PMI to switch between them. However, in overwrite using a PMI for this purpose adds extra overhead that the users would like to avoid. Thus, in overwrite mode for such PMUs we can only allow one contiguous chunk for the entire requested buffer. This patch changes the behavior accordingly, so that if the buddy allocator fails to come up with a single high-order chunk for the entire requested buffer, the allocation will fail. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1432308626-18845-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 232f00f273cb..725c416085e3 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -493,6 +493,20 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, rb->aux_pages[rb->aux_nr_pages] = page_address(page++); } + /* + * In overwrite mode, PMUs that don't support SG may not handle more + * than one contiguous allocation, since they rely on PMI to do double + * buffering. In this case, the entire buffer has to be one contiguous + * chunk. + */ + if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) && + overwrite) { + struct page *page = virt_to_page(rb->aux_pages[0]); + + if (page_private(page) != max_order) + goto out; + } + rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) -- cgit v1.2.3 From 9b7b819ca1e508195feed5ece558dca66adeef05 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 4 Jun 2015 23:57:18 +0200 Subject: compat: cleanup coding in compat_get_bitmap() and compat_put_bitmap() In the functions compat_get_bitmap() and compat_put_bitmap() the variable nr_compat_longs stores how many compat_ulong_t words should be copied in a loop. The copy loop itself is this: if (nr_compat_longs-- > 0) { if (__get_user(um, umask)) return -EFAULT; } else { um = 0; } Since nr_compat_longs gets unconditionally decremented in each loop and since it's type is unsigned this could theoretically lead to out of bounds accesses to userspace if nr_compat_longs wraps around to (unsigned)(-1). Although the callers currently do not trigger out-of-bounds accesses, we should better implement the loop in a safe way to completely avoid such warp-arounds. Signed-off-by: Helge Deller Cc: Linus Torvalds Cc: Al Viro --- kernel/compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 24f00610c575..333d364be29d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -912,7 +912,8 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, * bitmap. We must however ensure the end of the * kernel bitmap is zeroed. */ - if (nr_compat_longs-- > 0) { + if (nr_compat_longs) { + nr_compat_longs--; if (__get_user(um, umask)) return -EFAULT; } else { @@ -954,7 +955,8 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, * We dont want to write past the end of the userspace * bitmap. */ - if (nr_compat_longs-- > 0) { + if (nr_compat_longs) { + nr_compat_longs--; if (__put_user(um, umask)) return -EFAULT; } -- cgit v1.2.3