From c0944355a74bc9c2b5b3cc5b627efe0c73e30bd9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Mar 2016 16:22:45 +0100 Subject: sched/cgroup: Fix/cleanup cgroup teardown/init commit 2f5177f0fd7e531b26d54633be62d1d4cb94621c upstream. The CPU controller hasn't kept up with the various changes in the whole cgroup initialization / destruction sequence, and commit: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups") caused it to explode. The reason for this is that zombies do not inhibit css_offline() from being called, but do stall css_released(). Now we tear down the cfs_rq structures on css_offline() but zombies can run after that, leading to use-after-free issues. The solution is to move the tear-down to css_released(), which guarantees nobody (including no zombies) is still using our cgroup. Furthermore, a few simple cleanups are possible too. There doesn't appear to be any point to us using css_online() (anymore?) so fold that in css_alloc(). And since cgroup code guarantees an RCU grace period between css_released() and css_free() we can forgo using call_rcu() and free the stuff immediately. Suggested-by: Tejun Heo Reported-by: Kazuki Yamaguchi Reported-by: Niklas Cassel Tested-by: Niklas Cassel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups") Link: http://lkml.kernel.org/r/20160316152245.GY6344@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 70e5e09341f1..55bebf924946 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7693,7 +7693,7 @@ void set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); -static void free_sched_group(struct task_group *tg) +static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); free_rt_sched_group(tg); @@ -7719,7 +7719,7 @@ struct task_group *sched_create_group(struct task_group *parent) return tg; err: - free_sched_group(tg); + sched_free_group(tg); return ERR_PTR(-ENOMEM); } @@ -7739,17 +7739,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) +static void sched_free_group_rcu(struct rcu_head *rhp) { /* now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, struct task_group, rcu)); + sched_free_group(container_of(rhp, struct task_group, rcu)); } -/* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); + call_rcu(&tg->rcu, sched_free_group_rcu); } void sched_offline_group(struct task_group *tg) @@ -8210,31 +8209,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + sched_online_group(tg, parent); + return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); - if (parent) - sched_online_group(tg, parent); - return 0; + sched_offline_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_destroy_group(tg); -} - -static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - sched_offline_group(tg); + /* + * Relies on the RCU grace period between css_released() and this. + */ + sched_free_group(tg); } static void cpu_cgroup_fork(struct task_struct *task, void *private) @@ -8594,9 +8588,8 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, - .css_online = cpu_cgroup_css_online, - .css_offline = cpu_cgroup_css_offline, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, -- cgit v1.2.3 From 61fc0ae42c498f8eb782733065d93da6817d28b4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 15 Apr 2016 14:35:39 +0200 Subject: futex: Handle unlock_pi race gracefully commit 89e9e66ba1b3bde9d8ea90566c2aee20697ad681 upstream. If userspace calls UNLOCK_PI unconditionally without trying the TID -> 0 transition in user space first then the user space value might not have the waiters bit set. This opens the following race: CPU0 CPU1 uval = get_user(futex) lock(hb) lock(hb) futex |= FUTEX_WAITERS .... unlock(hb) cmpxchg(futex, uval, newval) So the cmpxchg fails and returns -EINVAL to user space, which is wrong because the futex value is valid. To handle this (yes, yet another) corner case gracefully, check for a flag change and retry. [ tglx: Massaged changelog and slightly reworked implementation ] Fixes: ccf9e6a80d9e ("futex: Make unlock_pi more robust") Signed-off-by: Sebastian Andrzej Siewior Cc: Davidlohr Bueso Cc: Darren Hart Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460723739-5195-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 461c72b2dac2..eaa3a8dfd345 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1244,10 +1244,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (unlikely(should_fail_futex(true))) ret = -EFAULT; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; + } else if (curval != uval) { + /* + * If a unconditional UNLOCK_PI operation (user space did not + * try the TID->0 transition) raced with a waiter setting the + * FUTEX_WAITERS flag between get_user() and locking the hash + * bucket lock, retry the operation. + */ + if ((FUTEX_TID_MASK & curval) == uval) + ret = -EAGAIN; + else + ret = -EINVAL; + } if (ret) { raw_spin_unlock(&pi_state->pi_mutex.wait_lock); return ret; @@ -2537,6 +2547,15 @@ retry: */ if (ret == -EFAULT) goto pi_faulted; + /* + * A unconditional UNLOCK_PI op raced against a waiter + * setting the FUTEX_WAITERS bit. Try again. + */ + if (ret == -EAGAIN) { + spin_unlock(&hb->lock); + put_futex_key(&key); + goto retry; + } /* * wake_futex_pi has detected invalid state. Tell user * space. -- cgit v1.2.3 From ad4b209d192624e8587f4988171d624346913ddd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 20 Apr 2016 20:09:24 -0700 Subject: futex: Acknowledge a new waiter in counter before plist commit fe1bce9e2107ba3a8faffe572483b6974201a0e6 upstream. Otherwise an incoming waker on the dest hash bucket can miss the waiter adding itself to the plist during the lockless check optimization (small window but still the correct way of doing this); similarly to the decrement counterpart. Suggested-by: Peter Zijlstra Signed-off-by: Davidlohr Bueso Cc: Davidlohr Bueso Cc: bigeasy@linutronix.de Cc: dvhart@infradead.org Link: http://lkml.kernel.org/r/1461208164-29150-1-git-send-email-dave@stgolabs.net Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index eaa3a8dfd345..9d8163afd87c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1484,8 +1484,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); hb_waiters_dec(hb1); - plist_add(&q->list, &hb2->chain); hb_waiters_inc(hb2); + plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; } get_futex_key_refs(key2); -- cgit v1.2.3 From 2da9606aea5a8fd1b710f8c8dd5295da4825e9cd Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Tue, 26 Apr 2016 13:15:35 +0200 Subject: workqueue: fix ghost PENDING flag while doing MQ IO commit 346c09f80459a3ad97df1816d6d606169a51001a upstream. The bug in a workqueue leads to a stalled IO request in MQ ctx->rq_list with the following backtrace: [ 601.347452] INFO: task kworker/u129:5:1636 blocked for more than 120 seconds. [ 601.347574] Tainted: G O 4.4.5-1-storage+ #6 [ 601.347651] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 601.348142] kworker/u129:5 D ffff880803077988 0 1636 2 0x00000000 [ 601.348519] Workqueue: ibnbd_server_fileio_wq ibnbd_dev_file_submit_io_worker [ibnbd_server] [ 601.348999] ffff880803077988 ffff88080466b900 ffff8808033f9c80 ffff880803078000 [ 601.349662] ffff880807c95000 7fffffffffffffff ffffffff815b0920 ffff880803077ad0 [ 601.350333] ffff8808030779a0 ffffffff815b01d5 0000000000000000 ffff880803077a38 [ 601.350965] Call Trace: [ 601.351203] [] ? bit_wait+0x60/0x60 [ 601.351444] [] schedule+0x35/0x80 [ 601.351709] [] schedule_timeout+0x192/0x230 [ 601.351958] [] ? blk_flush_plug_list+0xc7/0x220 [ 601.352208] [] ? ktime_get+0x37/0xa0 [ 601.352446] [] ? bit_wait+0x60/0x60 [ 601.352688] [] io_schedule_timeout+0xa4/0x110 [ 601.352951] [] ? _raw_spin_unlock_irqrestore+0xe/0x10 [ 601.353196] [] bit_wait_io+0x1b/0x70 [ 601.353440] [] __wait_on_bit+0x5d/0x90 [ 601.353689] [] wait_on_page_bit+0xc0/0xd0 [ 601.353958] [] ? autoremove_wake_function+0x40/0x40 [ 601.354200] [] __filemap_fdatawait_range+0xe4/0x140 [ 601.354441] [] filemap_fdatawait_range+0x14/0x30 [ 601.354688] [] filemap_write_and_wait_range+0x3f/0x70 [ 601.354932] [] blkdev_fsync+0x1b/0x50 [ 601.355193] [] vfs_fsync_range+0x49/0xa0 [ 601.355432] [] blkdev_write_iter+0xca/0x100 [ 601.355679] [] __vfs_write+0xaa/0xe0 [ 601.355925] [] vfs_write+0xa9/0x1a0 [ 601.356164] [] kernel_write+0x38/0x50 The underlying device is a null_blk, with default parameters: queue_mode = MQ submit_queues = 1 Verification that nullb0 has something inflight: root@pserver8:~# cat /sys/block/nullb0/inflight 0 1 root@pserver8:~# find /sys/block/nullb0/mq/0/cpu* -name rq_list -print -exec cat {} \; ... /sys/block/nullb0/mq/0/cpu2/rq_list CTX pending: ffff8838038e2400 ... During debug it became clear that stalled request is always inserted in the rq_list from the following path: save_stack_trace_tsk + 34 blk_mq_insert_requests + 231 blk_mq_flush_plug_list + 281 blk_flush_plug_list + 199 wait_on_page_bit + 192 __filemap_fdatawait_range + 228 filemap_fdatawait_range + 20 filemap_write_and_wait_range + 63 blkdev_fsync + 27 vfs_fsync_range + 73 blkdev_write_iter + 202 __vfs_write + 170 vfs_write + 169 kernel_write + 56 So blk_flush_plug_list() was called with from_schedule == true. If from_schedule is true, that means that finally blk_mq_insert_requests() offloads execution of __blk_mq_run_hw_queue() and uses kblockd workqueue, i.e. it calls kblockd_schedule_delayed_work_on(). That means, that we race with another CPU, which is about to execute __blk_mq_run_hw_queue() work. Further debugging shows the following traces from different CPUs: CPU#0 CPU#1 ---------------------------------- ------------------------------- reqeust A inserted STORE hctx->ctx_map[0] bit marked kblockd_schedule...() returns 1 request B inserted STORE hctx->ctx_map[1] bit marked kblockd_schedule...() returns 0 *** WORK PENDING bit is cleared *** flush_busy_ctxs() is executed, but bit 1, set by CPU#1, is not observed As a result request B pended forever. This behaviour can be explained by speculative LOAD of hctx->ctx_map on CPU#0, which is reordered with clear of PENDING bit and executed _before_ actual STORE of bit 1 on CPU#1. The proper fix is an explicit full barrier , which guarantees that clear of PENDING bit is to be executed before all possible speculative LOADS or STORES inside actual work function. Signed-off-by: Roman Pen Cc: Gioh Kim Cc: Michael Wang Cc: Tejun Heo Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 450c21fd0e6e..0ec05948a97b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -649,6 +649,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, */ smp_wmb(); set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); + /* + * The following mb guarantees that previous clear of a PENDING bit + * will not be reordered with any speculative LOADS or STORES from + * work->current_func, which is executed afterwards. This possible + * reordering can lead to a missed execution on attempt to qeueue + * the same @work. E.g. consider this case: + * + * CPU#0 CPU#1 + * ---------------------------- -------------------------------- + * + * 1 STORE event_indicated + * 2 queue_work_on() { + * 3 test_and_set_bit(PENDING) + * 4 } set_..._and_clear_pending() { + * 5 set_work_data() # clear bit + * 6 smp_mb() + * 7 work->current_func() { + * 8 LOAD event_indicated + * } + * + * Without an explicit full barrier speculative LOAD on line 8 can + * be executed before CPU#0 does STORE on line 1. If that happens, + * CPU#0 observes the PENDING bit is still set and new execution of + * a @work is not queued in a hope, that CPU#1 will eventually + * finish the queued @work. Meanwhile CPU#1 does not see + * event_indicated is set, because speculative LOAD was executed + * before actual STORE. + */ + smp_mb(); } static void clear_work_data(struct work_struct *work) -- cgit v1.2.3 From d52097476caeb14f4d7e3417dda08220d2813cc4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2016 19:06:48 -0400 Subject: cgroup, cpuset: replace cpuset_post_attach_flush() with cgroup_subsys->post_attach callback commit 5cf1cacb49aee39c3e02ae87068fc3c6430659b0 upstream. Since e93ad19d0564 ("cpuset: make mm migration asynchronous"), cpuset kicks off asynchronous NUMA node migration if necessary during task migration and flushes it from cpuset_post_attach_flush() which is called at the end of __cgroup_procs_write(). This is to avoid performing migration with cgroup_threadgroup_rwsem write-locked which can lead to deadlock through dependency on kworker creation. memcg has a similar issue with charge moving, so let's convert it to an official callback rather than the current one-off cpuset specific function. This patch adds cgroup_subsys->post_attach callback and makes cpuset register cpuset_post_attach_flush() as its ->post_attach. The conversion is mostly one-to-one except that the new callback is called under cgroup_mutex. This is to guarantee that no other migration operations are started before ->post_attach callbacks are finished. cgroup_mutex is one of the outermost mutex in the system and has never been and shouldn't be a problem. We can add specialized synchronization around __cgroup_procs_write() but I don't think there's any noticeable benefit. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 7 +++++-- kernel/cpuset.c | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dc94f8beb097..b0ea3aebc05a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2721,9 +2721,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; + struct cgroup_subsys *ss; struct cgroup *cgrp; pid_t pid; - int ret; + int ssid, ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; @@ -2771,8 +2772,10 @@ out_unlock_rcu: rcu_read_unlock(); out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); cgroup_kn_unlock(of->kn); - cpuset_post_attach_flush(); return ret ?: nbytes; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2ade632197d5..11eaf14b52c2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -1015,7 +1014,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -void cpuset_post_attach_flush(void) +static void cpuset_post_attach(void) { flush_workqueue(cpuset_migrate_mm_wq); } @@ -2083,6 +2082,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, + .post_attach = cpuset_post_attach, .bind = cpuset_bind, .legacy_cftypes = files, .early_init = 1, -- cgit v1.2.3 From 3c6266d57c4c4fa02588070347acf21b610bbd96 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Jan 2016 15:32:15 -0500 Subject: cgroup: make sure a parent css isn't freed before its children commit 8bb5ef79bc0f4016ecf79e8dce6096a3c63603e4 upstream. There are three subsystem callbacks in css shutdown path - css_offline(), css_released() and css_free(). Except for css_released(), cgroup core didn't guarantee the order of invocation. css_offline() or css_free() could be called on a parent css before its children. This behavior is unexpected and led to bugs in cpu and memory controller. The previous patch updated ordering for css_offline() which fixes the cpu controller issue. While there currently isn't a known bug caused by misordering of css_free() invocations, let's fix it too for consistency. css_free() ordering can be trivially fixed by moving putting of the parent css below css_free() invocation. Signed-off-by: Tejun Heo Cc: Peter Zijlstra Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b0ea3aebc05a..1c9d701f7a72 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4692,14 +4692,15 @@ static void css_free_work_fn(struct work_struct *work) if (ss) { /* css free path */ + struct cgroup_subsys_state *parent = css->parent; int id = css->id; - if (css->parent) - css_put(css->parent); - ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); + + if (parent) + css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); -- cgit v1.2.3 From 23a67ddd4636584816e2dc2c6393511d55944974 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Feb 2016 15:11:28 +0100 Subject: locking/mcs: Fix mcs_spin_lock() ordering commit 920c720aa5aa3900a7f1689228fdfc2580a91e7e upstream. Similar to commit b4b29f94856a ("locking/osq: Fix ordering of node initialisation in osq_lock") the use of xchg_acquire() is fundamentally broken with MCS like constructs. Furthermore, it turns out we rely on the global transitivity of this operation because the unlock path observes the pointer with a READ_ONCE(), not an smp_load_acquire(). This is non-critical because the MCS code isn't actually used and mostly serves as documentation, a stepping stone to the more complex things we've build on top of the idea. Reported-by: Andrea Parri Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Fixes: 3552a07a9c4a ("locking/mcs: Use acquire/release semantics") Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/locking/mcs_spinlock.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 5b9102a47ea5..c835270f0c2f 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg_acquire(lock, node); + /* + * We rely on the full barrier with global transitivity implied by the + * below xchg() to order the initialization stores above against any + * observation of @node. And to provide the ACQUIRE ordering associated + * with a LOCK primitive. + */ + prev = xchg(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. Threads -- cgit v1.2.3