From c0944355a74bc9c2b5b3cc5b627efe0c73e30bd9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Mar 2016 16:22:45 +0100
Subject: sched/cgroup: Fix/cleanup cgroup teardown/init

commit 2f5177f0fd7e531b26d54633be62d1d4cb94621c upstream.

The CPU controller hasn't kept up with the various changes in the whole
cgroup initialization / destruction sequence, and commit:

  2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups")

caused it to explode.

The reason for this is that zombies do not inhibit css_offline() from
being called, but do stall css_released(). Now we tear down the cfs_rq
structures on css_offline() but zombies can run after that, leading to
use-after-free issues.

The solution is to move the tear-down to css_released(), which
guarantees nobody (including no zombies) is still using our cgroup.

Furthermore, a few simple cleanups are possible too. There doesn't
appear to be any point to us using css_online() (anymore?) so fold that
in css_alloc().

And since cgroup code guarantees an RCU grace period between
css_released() and css_free() we can forgo using call_rcu() and free the
stuff immediately.

Suggested-by: Tejun Heo <tj@kernel.org>
Reported-by: Kazuki Yamaguchi <k@rhe.jp>
Reported-by: Niklas Cassel <niklas.cassel@axis.com>
Tested-by: Niklas Cassel <niklas.cassel@axis.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups")
Link: http://lkml.kernel.org/r/20160316152245.GY6344@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/sched/core.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 70e5e09341f1..55bebf924946 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7693,7 +7693,7 @@ void set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
@@ -7719,7 +7719,7 @@ struct task_group *sched_create_group(struct task_group *parent)
 	return tg;
 
 err:
-	free_sched_group(tg);
+	sched_free_group(tg);
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -7739,17 +7739,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
 {
 	/* now it should be safe to free those cfs_rqs */
-	free_sched_group(container_of(rhp, struct task_group, rcu));
+	sched_free_group(container_of(rhp, struct task_group, rcu));
 }
 
-/* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
 	/* wait for possible concurrent references to cfs_rqs complete */
-	call_rcu(&tg->rcu, free_sched_group_rcu);
+	call_rcu(&tg->rcu, sched_free_group_rcu);
 }
 
 void sched_offline_group(struct task_group *tg)
@@ -8210,31 +8209,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
+	sched_online_group(tg, parent);
+
 	return &tg->css;
 }
 
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
-	struct task_group *parent = css_tg(css->parent);
 
-	if (parent)
-		sched_online_group(tg, parent);
-	return 0;
+	sched_offline_group(tg);
 }
 
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 
-	sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
-	struct task_group *tg = css_tg(css);
-
-	sched_offline_group(tg);
+	/*
+	 * Relies on the RCU grace period between css_released() and this.
+	 */
+	sched_free_group(tg);
 }
 
 static void cpu_cgroup_fork(struct task_struct *task, void *private)
@@ -8594,9 +8588,8 @@ static struct cftype cpu_files[] = {
 
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
+	.css_released	= cpu_cgroup_css_released,
 	.css_free	= cpu_cgroup_css_free,
-	.css_online	= cpu_cgroup_css_online,
-	.css_offline	= cpu_cgroup_css_offline,
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-- 
cgit v1.2.3


From 61fc0ae42c498f8eb782733065d93da6817d28b4 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 15 Apr 2016 14:35:39 +0200
Subject: futex: Handle unlock_pi race gracefully

commit 89e9e66ba1b3bde9d8ea90566c2aee20697ad681 upstream.

If userspace calls UNLOCK_PI unconditionally without trying the TID -> 0
transition in user space first then the user space value might not have the
waiters bit set. This opens the following race:

CPU0	    	      	    CPU1
uval = get_user(futex)
			    lock(hb)
lock(hb)
			    futex |= FUTEX_WAITERS
			    ....
			    unlock(hb)

cmpxchg(futex, uval, newval)

So the cmpxchg fails and returns -EINVAL to user space, which is wrong because
the futex value is valid.

To handle this (yes, yet another) corner case gracefully, check for a flag
change and retry.

[ tglx: Massaged changelog and slightly reworked implementation ]

Fixes: ccf9e6a80d9e ("futex: Make unlock_pi more robust")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Darren Hart <dvhart@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1460723739-5195-1-git-send-email-bigeasy@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/futex.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 461c72b2dac2..eaa3a8dfd345 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1244,10 +1244,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
 	if (unlikely(should_fail_futex(true)))
 		ret = -EFAULT;
 
-	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
 		ret = -EFAULT;
-	else if (curval != uval)
-		ret = -EINVAL;
+	} else if (curval != uval) {
+		/*
+		 * If a unconditional UNLOCK_PI operation (user space did not
+		 * try the TID->0 transition) raced with a waiter setting the
+		 * FUTEX_WAITERS flag between get_user() and locking the hash
+		 * bucket lock, retry the operation.
+		 */
+		if ((FUTEX_TID_MASK & curval) == uval)
+			ret = -EAGAIN;
+		else
+			ret = -EINVAL;
+	}
 	if (ret) {
 		raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 		return ret;
@@ -2537,6 +2547,15 @@ retry:
 		 */
 		if (ret == -EFAULT)
 			goto pi_faulted;
+		/*
+		 * A unconditional UNLOCK_PI op raced against a waiter
+		 * setting the FUTEX_WAITERS bit. Try again.
+		 */
+		if (ret == -EAGAIN) {
+			spin_unlock(&hb->lock);
+			put_futex_key(&key);
+			goto retry;
+		}
 		/*
 		 * wake_futex_pi has detected invalid state. Tell user
 		 * space.
-- 
cgit v1.2.3


From ad4b209d192624e8587f4988171d624346913ddd Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 20 Apr 2016 20:09:24 -0700
Subject: futex: Acknowledge a new waiter in counter before plist

commit fe1bce9e2107ba3a8faffe572483b6974201a0e6 upstream.

Otherwise an incoming waker on the dest hash bucket can miss
the waiter adding itself to the plist during the lockless
check optimization (small window but still the correct way
of doing this); similarly to the decrement counterpart.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: bigeasy@linutronix.de
Cc: dvhart@infradead.org
Link: http://lkml.kernel.org/r/1461208164-29150-1-git-send-email-dave@stgolabs.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index eaa3a8dfd345..9d8163afd87c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1484,8 +1484,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	if (likely(&hb1->chain != &hb2->chain)) {
 		plist_del(&q->list, &hb1->chain);
 		hb_waiters_dec(hb1);
-		plist_add(&q->list, &hb2->chain);
 		hb_waiters_inc(hb2);
+		plist_add(&q->list, &hb2->chain);
 		q->lock_ptr = &hb2->lock;
 	}
 	get_futex_key_refs(key2);
-- 
cgit v1.2.3


From 2da9606aea5a8fd1b710f8c8dd5295da4825e9cd Mon Sep 17 00:00:00 2001
From: Roman Pen <roman.penyaev@profitbricks.com>
Date: Tue, 26 Apr 2016 13:15:35 +0200
Subject: workqueue: fix ghost PENDING flag while doing MQ IO

commit 346c09f80459a3ad97df1816d6d606169a51001a upstream.

The bug in a workqueue leads to a stalled IO request in MQ ctx->rq_list
with the following backtrace:

[  601.347452] INFO: task kworker/u129:5:1636 blocked for more than 120 seconds.
[  601.347574]       Tainted: G           O    4.4.5-1-storage+ #6
[  601.347651] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  601.348142] kworker/u129:5  D ffff880803077988     0  1636      2 0x00000000
[  601.348519] Workqueue: ibnbd_server_fileio_wq ibnbd_dev_file_submit_io_worker [ibnbd_server]
[  601.348999]  ffff880803077988 ffff88080466b900 ffff8808033f9c80 ffff880803078000
[  601.349662]  ffff880807c95000 7fffffffffffffff ffffffff815b0920 ffff880803077ad0
[  601.350333]  ffff8808030779a0 ffffffff815b01d5 0000000000000000 ffff880803077a38
[  601.350965] Call Trace:
[  601.351203]  [<ffffffff815b0920>] ? bit_wait+0x60/0x60
[  601.351444]  [<ffffffff815b01d5>] schedule+0x35/0x80
[  601.351709]  [<ffffffff815b2dd2>] schedule_timeout+0x192/0x230
[  601.351958]  [<ffffffff812d43f7>] ? blk_flush_plug_list+0xc7/0x220
[  601.352208]  [<ffffffff810bd737>] ? ktime_get+0x37/0xa0
[  601.352446]  [<ffffffff815b0920>] ? bit_wait+0x60/0x60
[  601.352688]  [<ffffffff815af784>] io_schedule_timeout+0xa4/0x110
[  601.352951]  [<ffffffff815b3a4e>] ? _raw_spin_unlock_irqrestore+0xe/0x10
[  601.353196]  [<ffffffff815b093b>] bit_wait_io+0x1b/0x70
[  601.353440]  [<ffffffff815b056d>] __wait_on_bit+0x5d/0x90
[  601.353689]  [<ffffffff81127bd0>] wait_on_page_bit+0xc0/0xd0
[  601.353958]  [<ffffffff81096db0>] ? autoremove_wake_function+0x40/0x40
[  601.354200]  [<ffffffff81127cc4>] __filemap_fdatawait_range+0xe4/0x140
[  601.354441]  [<ffffffff81127d34>] filemap_fdatawait_range+0x14/0x30
[  601.354688]  [<ffffffff81129a9f>] filemap_write_and_wait_range+0x3f/0x70
[  601.354932]  [<ffffffff811ced3b>] blkdev_fsync+0x1b/0x50
[  601.355193]  [<ffffffff811c82d9>] vfs_fsync_range+0x49/0xa0
[  601.355432]  [<ffffffff811cf45a>] blkdev_write_iter+0xca/0x100
[  601.355679]  [<ffffffff81197b1a>] __vfs_write+0xaa/0xe0
[  601.355925]  [<ffffffff81198379>] vfs_write+0xa9/0x1a0
[  601.356164]  [<ffffffff811c59d8>] kernel_write+0x38/0x50

The underlying device is a null_blk, with default parameters:

  queue_mode    = MQ
  submit_queues = 1

Verification that nullb0 has something inflight:

root@pserver8:~# cat /sys/block/nullb0/inflight
       0        1
root@pserver8:~# find /sys/block/nullb0/mq/0/cpu* -name rq_list -print -exec cat {} \;
...
/sys/block/nullb0/mq/0/cpu2/rq_list
CTX pending:
        ffff8838038e2400
...

During debug it became clear that stalled request is always inserted in
the rq_list from the following path:

   save_stack_trace_tsk + 34
   blk_mq_insert_requests + 231
   blk_mq_flush_plug_list + 281
   blk_flush_plug_list + 199
   wait_on_page_bit + 192
   __filemap_fdatawait_range + 228
   filemap_fdatawait_range + 20
   filemap_write_and_wait_range + 63
   blkdev_fsync + 27
   vfs_fsync_range + 73
   blkdev_write_iter + 202
   __vfs_write + 170
   vfs_write + 169
   kernel_write + 56

So blk_flush_plug_list() was called with from_schedule == true.

If from_schedule is true, that means that finally blk_mq_insert_requests()
offloads execution of __blk_mq_run_hw_queue() and uses kblockd workqueue,
i.e. it calls kblockd_schedule_delayed_work_on().

That means, that we race with another CPU, which is about to execute
__blk_mq_run_hw_queue() work.

Further debugging shows the following traces from different CPUs:

  CPU#0                                  CPU#1
  ----------------------------------     -------------------------------
  reqeust A inserted
  STORE hctx->ctx_map[0] bit marked
  kblockd_schedule...() returns 1
  <schedule to kblockd workqueue>
                                         request B inserted
                                         STORE hctx->ctx_map[1] bit marked
                                         kblockd_schedule...() returns 0
  *** WORK PENDING bit is cleared ***
  flush_busy_ctxs() is executed, but
  bit 1, set by CPU#1, is not observed

As a result request B pended forever.

This behaviour can be explained by speculative LOAD of hctx->ctx_map on
CPU#0, which is reordered with clear of PENDING bit and executed _before_
actual STORE of bit 1 on CPU#1.

The proper fix is an explicit full barrier <mfence>, which guarantees
that clear of PENDING bit is to be executed before all possible
speculative LOADS or STORES inside actual work function.

Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
Cc: Gioh Kim <gi-oh.kim@profitbricks.com>
Cc: Michael Wang <yun.wang@profitbricks.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/workqueue.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 450c21fd0e6e..0ec05948a97b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -649,6 +649,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
 	 */
 	smp_wmb();
 	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+	/*
+	 * The following mb guarantees that previous clear of a PENDING bit
+	 * will not be reordered with any speculative LOADS or STORES from
+	 * work->current_func, which is executed afterwards.  This possible
+	 * reordering can lead to a missed execution on attempt to qeueue
+	 * the same @work.  E.g. consider this case:
+	 *
+	 *   CPU#0                         CPU#1
+	 *   ----------------------------  --------------------------------
+	 *
+	 * 1  STORE event_indicated
+	 * 2  queue_work_on() {
+	 * 3    test_and_set_bit(PENDING)
+	 * 4 }                             set_..._and_clear_pending() {
+	 * 5                                 set_work_data() # clear bit
+	 * 6                                 smp_mb()
+	 * 7                               work->current_func() {
+	 * 8				      LOAD event_indicated
+	 *				   }
+	 *
+	 * Without an explicit full barrier speculative LOAD on line 8 can
+	 * be executed before CPU#0 does STORE on line 1.  If that happens,
+	 * CPU#0 observes the PENDING bit is still set and new execution of
+	 * a @work is not queued in a hope, that CPU#1 will eventually
+	 * finish the queued @work.  Meanwhile CPU#1 does not see
+	 * event_indicated is set, because speculative LOAD was executed
+	 * before actual STORE.
+	 */
+	smp_mb();
 }
 
 static void clear_work_data(struct work_struct *work)
-- 
cgit v1.2.3


From d52097476caeb14f4d7e3417dda08220d2813cc4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 21 Apr 2016 19:06:48 -0400
Subject: cgroup, cpuset: replace cpuset_post_attach_flush() with
 cgroup_subsys->post_attach callback

commit 5cf1cacb49aee39c3e02ae87068fc3c6430659b0 upstream.

Since e93ad19d0564 ("cpuset: make mm migration asynchronous"), cpuset
kicks off asynchronous NUMA node migration if necessary during task
migration and flushes it from cpuset_post_attach_flush() which is
called at the end of __cgroup_procs_write().  This is to avoid
performing migration with cgroup_threadgroup_rwsem write-locked which
can lead to deadlock through dependency on kworker creation.

memcg has a similar issue with charge moving, so let's convert it to
an official callback rather than the current one-off cpuset specific
function.  This patch adds cgroup_subsys->post_attach callback and
makes cpuset register cpuset_post_attach_flush() as its ->post_attach.

The conversion is mostly one-to-one except that the new callback is
called under cgroup_mutex.  This is to guarantee that no other
migration operations are started before ->post_attach callbacks are
finished.  cgroup_mutex is one of the outermost mutex in the system
and has never been and shouldn't be a problem.  We can add specialized
synchronization around __cgroup_procs_write() but I don't think
there's any noticeable benefit.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/cgroup.c | 7 +++++--
 kernel/cpuset.c | 4 ++--
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index dc94f8beb097..b0ea3aebc05a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2721,9 +2721,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 				    size_t nbytes, loff_t off, bool threadgroup)
 {
 	struct task_struct *tsk;
+	struct cgroup_subsys *ss;
 	struct cgroup *cgrp;
 	pid_t pid;
-	int ret;
+	int ssid, ret;
 
 	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 		return -EINVAL;
@@ -2771,8 +2772,10 @@ out_unlock_rcu:
 	rcu_read_unlock();
 out_unlock_threadgroup:
 	percpu_up_write(&cgroup_threadgroup_rwsem);
+	for_each_subsys(ss, ssid)
+		if (ss->post_attach)
+			ss->post_attach();
 	cgroup_kn_unlock(of->kn);
-	cpuset_post_attach_flush();
 	return ret ?: nbytes;
 }
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2ade632197d5..11eaf14b52c2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -57,7 +57,6 @@
 #include <asm/uaccess.h>
 #include <linux/atomic.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
@@ -1015,7 +1014,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	}
 }
 
-void cpuset_post_attach_flush(void)
+static void cpuset_post_attach(void)
 {
 	flush_workqueue(cpuset_migrate_mm_wq);
 }
@@ -2083,6 +2082,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.can_attach	= cpuset_can_attach,
 	.cancel_attach	= cpuset_cancel_attach,
 	.attach		= cpuset_attach,
+	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
 	.legacy_cftypes	= files,
 	.early_init	= 1,
-- 
cgit v1.2.3


From 3c6266d57c4c4fa02588070347acf21b610bbd96 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 21 Jan 2016 15:32:15 -0500
Subject: cgroup: make sure a parent css isn't freed before its children

commit 8bb5ef79bc0f4016ecf79e8dce6096a3c63603e4 upstream.

There are three subsystem callbacks in css shutdown path -
css_offline(), css_released() and css_free().  Except for
css_released(), cgroup core didn't guarantee the order of invocation.
css_offline() or css_free() could be called on a parent css before its
children.  This behavior is unexpected and led to bugs in cpu and
memory controller.

The previous patch updated ordering for css_offline() which fixes the
cpu controller issue.  While there currently isn't a known bug caused
by misordering of css_free() invocations, let's fix it too for
consistency.

css_free() ordering can be trivially fixed by moving putting of the
parent css below css_free() invocation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/cgroup.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b0ea3aebc05a..1c9d701f7a72 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4692,14 +4692,15 @@ static void css_free_work_fn(struct work_struct *work)
 
 	if (ss) {
 		/* css free path */
+		struct cgroup_subsys_state *parent = css->parent;
 		int id = css->id;
 
-		if (css->parent)
-			css_put(css->parent);
-
 		ss->css_free(css);
 		cgroup_idr_remove(&ss->css_idr, id);
 		cgroup_put(cgrp);
+
+		if (parent)
+			css_put(parent);
 	} else {
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
-- 
cgit v1.2.3


From 23a67ddd4636584816e2dc2c6393511d55944974 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 1 Feb 2016 15:11:28 +0100
Subject: locking/mcs: Fix mcs_spin_lock() ordering

commit 920c720aa5aa3900a7f1689228fdfc2580a91e7e upstream.

Similar to commit b4b29f94856a ("locking/osq: Fix ordering of node
initialisation in osq_lock") the use of xchg_acquire() is
fundamentally broken with MCS like constructs.

Furthermore, it turns out we rely on the global transitivity of this
operation because the unlock path observes the pointer with a
READ_ONCE(), not an smp_load_acquire().

This is non-critical because the MCS code isn't actually used and
mostly serves as documentation, a stepping stone to the more complex
things we've build on top of the idea.

Reported-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Fixes: 3552a07a9c4a ("locking/mcs: Use acquire/release semantics")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/locking/mcs_spinlock.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 5b9102a47ea5..c835270f0c2f 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 	node->locked = 0;
 	node->next   = NULL;
 
-	prev = xchg_acquire(lock, node);
+	/*
+	 * We rely on the full barrier with global transitivity implied by the
+	 * below xchg() to order the initialization stores above against any
+	 * observation of @node. And to provide the ACQUIRE ordering associated
+	 * with a LOCK primitive.
+	 */
+	prev = xchg(lock, node);
 	if (likely(prev == NULL)) {
 		/*
 		 * Lock acquired, don't need to set node->locked to 1. Threads
-- 
cgit v1.2.3