From 77f300b198f93328c26191b52655ce1b62e202cf Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Wed, 16 Apr 2014 14:32:29 +0900 Subject: workqueue: fix bugs in wq_update_unbound_numa() failure path wq_update_unbound_numa() failure path has the following two bugs. - alloc_unbound_pwq() is called without holding wq->mutex; however, if the allocation fails, it jumps to out_unlock which tries to unlock wq->mutex. - The function should switch to dfl_pwq on failure but didn't do so after alloc_unbound_pwq() failure. Fix it by regrabbing wq->mutex and jumping to use_dfl_pwq on alloc_unbound_pwq() failure. Signed-off-by: Daeseok Youn Acked-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Fixes: 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues") --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0ee63af30bd1..3150b217c936 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4100,7 +4100,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, if (!pwq) { pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", wq->name); - goto out_unlock; + mutex_lock(&wq->mutex); + goto use_dfl_pwq; } /* -- cgit v1.2.3 From e37a06f10994c2ba86f54d8f96734f2483a869b8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 17 Apr 2014 13:53:08 +0800 Subject: cgroup: fix the retry path of cgroup_mount() If we hit the retry path, we'll call parse_cgroupfs_options() again, but the string we pass to it has been modified by the previous call to this function. This bug can be observed by: # mount -t cgroup -o name=foo,cpuset xxx /mnt && umount /mnt && \ mount -t cgroup -o name=foo,cpuset xxx /mnt mount: wrong fs type, bad option, bad superblock on xxx, missing codepage or helper program, or other error ... The second mount passed "name=foo,cpuset" to the parser, and then it hit the retry path and call the parser again, but this time the string passed to the parser is "name=foo". To fix this, we avoid calling parse_cgroupfs_options() again in this case. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9fcdaa705b6c..11a03d67635a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1495,7 +1495,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); -retry: + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -1503,7 +1503,7 @@ retry: ret = parse_cgroupfs_options(data, &opts); if (ret) goto out_unlock; - +retry: /* look for a matching existing root */ if (!opts.subsys_mask && !opts.none && !opts.name) { cgrp_dfl_root_visible = true; @@ -1562,9 +1562,9 @@ retry: if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - kfree(opts.release_agent); - kfree(opts.name); msleep(10); + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); goto retry; } -- cgit v1.2.3 From 4d595b866d2c653dc90a492b9973a834eabfa354 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Apr 2014 11:04:16 -0400 Subject: workqueue: make rescuer_thread() empty wq->maydays list before exiting After a @pwq is scheduled for emergency execution, other workers may consume the affectd work items before the rescuer gets to them. This means that a workqueue many have pwqs queued on @wq->maydays list while not having any work item pending or in-flight. If destroy_workqueue() executes in such condition, the rescuer may exit without emptying @wq->maydays. This currently doesn't cause any actual harm. destroy_workqueue() can safely destroy all the involved data structures whether @wq->maydays is populated or not as nobody access the list once the rescuer exits. However, this is nasty and makes future development difficult. Let's update rescuer_thread() so that it empties @wq->maydays after seeing should_stop to guarantee that the list is empty on rescuer exit. tj: Updated comment and patch description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v3.10+ --- kernel/workqueue.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3150b217c936..6ba0c6054224 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2398,6 +2398,7 @@ static int rescuer_thread(void *__rescuer) struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; struct list_head *scheduled = &rescuer->scheduled; + bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2409,11 +2410,15 @@ static int rescuer_thread(void *__rescuer) repeat: set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - rescuer->task->flags &= ~PF_WQ_WORKER; - return 0; - } + /* + * By the time the rescuer is requested to stop, the workqueue + * shouldn't have any work pending, but @wq->maydays may still have + * pwq(s) queued. This can happen by non-rescuer workers consuming + * all the work items before the rescuer got to them. Go through + * @wq->maydays processing before acting on should_stop so that the + * list is always empty on exit. + */ + should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ spin_lock_irq(&wq_mayday_lock); @@ -2459,6 +2464,12 @@ repeat: spin_unlock_irq(&wq_mayday_lock); + if (should_stop) { + __set_current_state(TASK_RUNNING); + rescuer->task->flags &= ~PF_WQ_WORKER; + return 0; + } + /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); -- cgit v1.2.3 From 77668c8b559e4fe2acf2a0749c7c83cde49a5025 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Apr 2014 11:04:16 -0400 Subject: workqueue: fix a possible race condition between rescuer and pwq-release There is a race condition between rescuer_thread() and pwq_unbound_release_workfn(). Even after a pwq is scheduled for rescue, the associated work items may be consumed by any worker. If all of them are consumed before the rescuer gets to them and the pwq's base ref was put due to attribute change, the pwq may be released while still being linked on @wq->maydays list making the rescuer dereference already freed pwq later. Make send_mayday() pin the target pwq until the rescuer is done with it. tj: Updated comment and patch description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v3.10+ --- kernel/workqueue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6ba0c6054224..8edc87185427 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1916,6 +1916,12 @@ static void send_mayday(struct work_struct *work) /* mayday mayday mayday */ if (list_empty(&pwq->mayday_node)) { + /* + * If @pwq is for an unbound wq, its base ref may be put at + * any time due to an attribute change. Pin @pwq until the + * rescuer is done with it. + */ + get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); } @@ -2449,6 +2455,12 @@ repeat: process_scheduled_works(rescuer); + /* + * Put the reference grabbed by send_mayday(). @pool won't + * go away while we're holding its lock. + */ + put_pwq(pwq); + /* * Leave this pool. If keep_working() is %true, notify a * regular worker; otherwise, we end up with 0 concurrency -- cgit v1.2.3 From 90f62cf30a78721641e08737bda787552428061e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Apr 2014 14:29:27 -0700 Subject: net: Use netlink_ns_capable to verify the permisions of netlink messages It is possible by passing a netlink socket to a more privileged executable and then to fool that executable into writing to the socket data that happens to be valid netlink message to do something that privileged executable did not intend to do. To keep this from happening replace bare capable and ns_capable calls with netlink_capable, netlink_net_calls and netlink_ns_capable calls. Which act the same as the previous calls except they verify that the opener of the socket had the desired permissions as well. Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7c2893602d06..47845c57eb19 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -643,13 +643,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) if ((task_active_pid_ns(current) != &init_pid_ns)) return -EPERM; - if (!capable(CAP_AUDIT_CONTROL)) + if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: - if (!capable(CAP_AUDIT_WRITE)) + if (!netlink_capable(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ -- cgit v1.2.3 From 722a9f9299ca720a3f14660e7c0dce7b76a9cb42 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 2 May 2014 00:44:38 +0200 Subject: asmlinkage: Add explicit __visible to drivers/*, lib/*, kernel/* As requested by Linus add explicit __visible to the asmlinkage users. This marks functions visible to assembler. Tree sweep for rest of tree. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1398984278-29319-4-git-send-email-andi@firstfloor.org Signed-off-by: H. Peter Anvin --- kernel/context_tracking.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/printk/printk.c | 4 ++-- kernel/sched/core.c | 10 +++++----- kernel/softirq.c | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6cb20d2e7ee0..019d45008448 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -120,7 +120,7 @@ void context_tracking_user_enter(void) * instead of preempt_schedule() to exit user context if needed before * calling the scheduler. */ -asmlinkage void __sched notrace preempt_schedule_context(void) +asmlinkage __visible void __sched notrace preempt_schedule_context(void) { enum ctx_state prev_ctx; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b0e9467922e1..d24e4339b46d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4188,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task) } EXPORT_SYMBOL_GPL(debug_show_held_locks); -asmlinkage void lockdep_sys_exit(void) +asmlinkage __visible void lockdep_sys_exit(void) { struct task_struct *curr = current; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 18fb7a2fb14b..1ea328aafdc9 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1586,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, return -ENOMEM; } -asmlinkage int swsusp_save(void) +asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a45b50962295..7228258b85ec 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1674,7 +1674,7 @@ EXPORT_SYMBOL(printk_emit); * * See the vsnprintf() documentation for format string extensions over C99. */ -asmlinkage int printk(const char *fmt, ...) +asmlinkage __visible int printk(const char *fmt, ...) { va_list args; int r; @@ -1737,7 +1737,7 @@ void early_vprintk(const char *fmt, va_list ap) } } -asmlinkage void early_printk(const char *fmt, ...) +asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..d9d8ece46a15 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2192,7 +2192,7 @@ static inline void post_schedule(struct rq *rq) * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ -asmlinkage void schedule_tail(struct task_struct *prev) +asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); @@ -2741,7 +2741,7 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } -asmlinkage void __sched schedule(void) +asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; @@ -2751,7 +2751,7 @@ asmlinkage void __sched schedule(void) EXPORT_SYMBOL(schedule); #ifdef CONFIG_CONTEXT_TRACKING -asmlinkage void __sched schedule_user(void) +asmlinkage __visible void __sched schedule_user(void) { /* * If we come here after a random call to set_need_resched(), @@ -2783,7 +2783,7 @@ void __sched schedule_preempt_disabled(void) * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void __sched notrace preempt_schedule(void) +asmlinkage __visible void __sched notrace preempt_schedule(void) { /* * If there is a non-zero preempt_count or interrupts are disabled, @@ -2813,7 +2813,7 @@ EXPORT_SYMBOL(preempt_schedule); * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. */ -asmlinkage void __sched preempt_schedule_irq(void) +asmlinkage __visible void __sched preempt_schedule_irq(void) { enum ctx_state prev_state; diff --git a/kernel/softirq.c b/kernel/softirq.c index 33e4648ae0e7..92f24f5e8d52 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -223,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif -asmlinkage void __do_softirq(void) +asmlinkage __visible void __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; @@ -299,7 +299,7 @@ restart: tsk_restore_flags(current, old_flags, PF_MEMALLOC); } -asmlinkage void do_softirq(void) +asmlinkage __visible void do_softirq(void) { __u32 pending; unsigned long flags; -- cgit v1.2.3 From 8058bd0faad860e75547cc5cb5d4ade016247a79 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 8 May 2014 07:47:49 -0400 Subject: tracepoint: Fix use of tracepoint funcs after rcu free Commit de7b2973903c "tracepoint: Use struct pointer instead of name hash for reg/unreg tracepoints" introduces a use after free by calling release_probes on the old struct tracepoint array before the newly allocated array is published with rcu_assign_pointer. There is a race window where tracepoints (RCU readers) can perform a "use-after-grace-period-after-free", which shows up as a GPF in stress-tests. Link: http://lkml.kernel.org/r/53698021.5020108@oracle.com Link: http://lkml.kernel.org/p/1399549669-25465-1-git-send-email-mathieu.desnoyers@efficios.com Reported-by: Sasha Levin CC: Oleg Nesterov CC: Dave Jones Fixes: de7b2973903c "tracepoint: Use struct pointer instead of name hash for reg/unreg tracepoints" Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ac5b23cf7212..6620e5837ce2 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -188,7 +188,6 @@ static int tracepoint_add_func(struct tracepoint *tp, WARN_ON_ONCE(1); return PTR_ERR(old); } - release_probes(old); /* * rcu_assign_pointer has a smp_wmb() which makes sure that the new @@ -200,6 +199,7 @@ static int tracepoint_add_func(struct tracepoint *tp, rcu_assign_pointer(tp->funcs, tp_funcs); if (!static_key_enabled(&tp->key)) static_key_slow_inc(&tp->key); + release_probes(old); return 0; } @@ -221,7 +221,6 @@ static int tracepoint_remove_func(struct tracepoint *tp, WARN_ON_ONCE(1); return PTR_ERR(old); } - release_probes(old); if (!tp_funcs) { /* Removed last function */ @@ -232,6 +231,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, static_key_slow_dec(&tp->key); } rcu_assign_pointer(tp->funcs, tp_funcs); + release_probes(old); return 0; } -- cgit v1.2.3 From 84ea7fe37908254c3bd90910921f6e1045c1747a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 12 May 2014 13:42:29 +0530 Subject: hrtimer: Set expiry time before switch_hrtimer_base() switch_hrtimer_base() calls hrtimer_check_target() which ensures that we do not migrate a timer to a remote cpu if the timer expires before the current programmed expiry time on that remote cpu. But __hrtimer_start_range_ns() calls switch_hrtimer_base() before the new expiry time is set. So the sanity check in hrtimer_check_target() is operating on stale or even uninitialized data. Update expiry time before calling switch_hrtimer_base(). [ tglx: Rewrote changelog once again ] Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: linaro-networking@linaro.org Cc: fweisbec@gmail.com Cc: arvind.chauhan@arm.com Link: http://lkml.kernel.org/r/81999e148745fc51bbcd0615823fbab9b2e87e23.1399882253.git.viresh.kumar@linaro.org Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6b715c0af1b1..e0501fe7140d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -990,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Remove an active timer from the queue: */ ret = remove_hrtimer(timer, base); - /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - if (mode & HRTIMER_MODE_REL) { - tim = ktime_add_safe(tim, new_base->get_time()); + tim = ktime_add_safe(tim, base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures * to signal that they simply return xtime in @@ -1009,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, hrtimer_set_expires_range_ns(timer, tim, delta_ns); + /* Switch the timer base, if necessary: */ + new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + timer_stats_hrtimer_set_start_info(timer); leftmost = enqueue_hrtimer(timer, new_base); -- cgit v1.2.3