From 41050a009640f9f330a5b916563ca7faf853a98c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Dec 2014 12:31:27 -0800 Subject: rcu: Fix rcu_barrier() race that could result in too-short wait The rcu_barrier() no-callbacks check for no-CBs CPUs has race conditions. It checks a given CPU's lists of callbacks, and if all three no-CBs lists are empty, ignores that CPU. However, these three lists could potentially be empty even when callbacks are present if the check executed just as the callbacks were being moved from one list to another. It turns out that recent versions of rcutorture can spot this race. This commit plugs this hole by consolidating the per-list counts of no-CBs callbacks into a single count, which is incremented before the corresponding callback is posted and decremented after it is invoked. Then rcu_barrier() checks this single count to reliably determine whether the corresponding CPU has no-CBs callbacks. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..e5c43b7f63f2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2056,9 +2056,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + unsigned long ret; +#ifdef CONFIG_PROVE_RCU struct rcu_head *rhp; +#endif /* #ifdef CONFIG_PROVE_RCU */ + + /* + * Check count of all no-CBs callbacks awaiting invocation. + * There needs to be a barrier before this function is called, + * but associated with a prior determination that no more + * callbacks would be posted. In the worst case, the first + * barrier in _rcu_barrier() suffices (but the caller cannot + * necessarily rely on this, not a substitute for the caller + * getting the concurrency design right!). There must also be + * a barrier between the following load and posting of a callback + * (if a callback is in fact needed). This is associated with an + * atomic_inc() in the caller. + */ + ret = atomic_long_read(&rdp->nocb_q_count); - /* No-CBs CPUs might have callbacks on any of three lists. */ +#ifdef CONFIG_PROVE_RCU rhp = ACCESS_ONCE(rdp->nocb_head); if (!rhp) rhp = ACCESS_ONCE(rdp->nocb_gp_head); @@ -2072,8 +2089,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) cpu, rhp->func); WARN_ON_ONCE(1); } +#endif /* #ifdef CONFIG_PROVE_RCU */ - return !!rhp; + return !!ret; } /* @@ -2095,9 +2113,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, struct task_struct *t; /* Enqueue the callback on the nocb list and update counts. */ + atomic_long_add(rhcount, &rdp->nocb_q_count); + /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ old_rhpp = xchg(&rdp->nocb_tail, rhtp); ACCESS_ONCE(*old_rhpp) = rhp; - atomic_long_add(rhcount, &rdp->nocb_q_count); atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ @@ -2288,9 +2307,6 @@ wait_again: /* Move callbacks to wait-for-GP list, which is empty.
*/ ACCESS_ONCE(rdp->nocb_head) = NULL; rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); - rdp->nocb_gp_count_lazy = - atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); gotcbs = true; } @@ -2338,9 +2354,6 @@ wait_again: /* Append callbacks to follower's "done" list. */ tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); *tail = rdp->nocb_gp_head; - atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); - atomic_long_add(rdp->nocb_gp_count_lazy, - &rdp->nocb_follower_count_lazy); smp_mb__after_atomic(); /* Store *tail before wakeup. */ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* @@ -2415,13 +2428,11 @@ static int rcu_nocb_kthread(void *arg) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); ACCESS_ONCE(rdp->nocb_follower_head) = NULL; tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); - c = atomic_long_xchg(&rdp->nocb_follower_count, 0); - cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); - rdp->nocb_p_count += c; - rdp->nocb_p_count_lazy += cl; /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); + trace_rcu_batch_start(rdp->rsp->name, + atomic_long_read(&rdp->nocb_q_count_lazy), + atomic_long_read(&rdp->nocb_q_count), -1); c = cl = 0; while (list) { next = list->next; @@ -2443,9 +2454,9 @@ static int rcu_nocb_kthread(void *arg) list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); - ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) = - rdp->nocb_p_count_lazy - cl; + smp_mb__before_atomic(); /* _add after CB invocation. */ + atomic_long_add(-c, &rdp->nocb_q_count); + atomic_long_add(-cl, &rdp->nocb_q_count_lazy); rdp->n_nocbs_invoked += c; } return 0; -- cgit v1.2.3 From b08ea27d95bcaee6d9cf4edd64f373006661424a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Oct 2014 15:39:39 -0700 Subject: rcu: Protect rcu_boost() lockless accesses with ACCESS_ONCE() This commit prevents random compiler optimizations by applying ACCESS_ONCE() to lockless accesses. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..d59913ef8360 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1127,7 +1127,8 @@ static int rcu_boost(struct rcu_node *rnp) struct task_struct *t; struct list_head *tb; - if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + if (ACCESS_ONCE(rnp->exp_tasks) == NULL && + ACCESS_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); -- cgit v1.2.3 From 74e871ac6cb17f67cbefb569d98a8d05de666e07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Oct 2014 21:08:53 -0700 Subject: rcu: Rename "empty" to "empty_norm" in preparation for boost rework This commit undertakes a simple variable renaming to make way for some rework of RCU priority boosting. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d59913ef8360..3d22f0b2dea9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -313,8 +313,8 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, */ void rcu_read_unlock_special(struct task_struct *t) { - int empty; int empty_exp; + int empty_norm; int empty_exp_now; unsigned long flags; struct list_head *np; @@ -367,7 +367,7 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_blocked_readers_cgp(rnp); + empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -393,7 +393,7 @@ void rcu_read_unlock_special(struct task_struct *t) * so we must take a snapshot of the expedited state. */ empty_exp_now = !rcu_preempted_readers_exp(rnp); - if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, -- cgit v1.2.3 From 8af3a5e78cfb63abe8813743946b7bd5a8a3134c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 11:22:37 -0700 Subject: rcu: Abstract rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() This commit abstracts rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() in preparation for the rework of RCU priority boosting. This new function will be invoked from rcu_read_unlock_special() in the reworked scheme, which is why rcu_cleanup_dead_rnp() assumes that the leaf rcu_node structure's ->qsmaskinit field has already been updated. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3d22f0b2dea9..d044b9cbbd97 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -306,6 +306,15 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, return np; } +/* + * Return true if the specified rcu_node structure has tasks that were + * preempted within an RCU read-side critical section. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blkd_tasks); +} + /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU @@ -967,6 +976,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore(&rnp->lock, flags); } +/* + * Because there is no preemptible RCU, there can be no readers blocked. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return false; +} + #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* -- cgit v1.2.3 From b6a932d1d9840727eee619d455bdeeedaa205be9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 12:05:04 -0700 Subject: rcu: Make rcu_read_unlock_special() propagate ->qsmaskinit bit clearing This commit causes rcu_read_unlock_special() to propagate ->qsmaskinit bit clearing up the rcu_node tree once a given rcu_node structure's blkd_tasks list becomes empty. 
This is the final commit in preparation for the rework of RCU priority boosting: It enables preempted tasks to remain queued on their rcu_node structure even after all of that rcu_node structure's CPUs have gone offline. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d044b9cbbd97..8a2b84157d34 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -322,9 +322,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) */ void rcu_read_unlock_special(struct task_struct *t) { - int empty_exp; - int empty_norm; - int empty_exp_now; + bool empty; + bool empty_exp; + bool empty_norm; + bool empty_exp_now; unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST @@ -376,6 +377,7 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } + empty = !rcu_preempt_has_tasks(rnp); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -395,6 +397,14 @@ void rcu_read_unlock_special(struct task_struct *t) drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; #endif /* #ifdef CONFIG_RCU_BOOST */ + /* + * If this was the last task on the list, go see if we + * need to propagate ->qsmaskinit bit clearing up the + * rcu_node tree. + */ + if (!empty && !rcu_preempt_has_tasks(rnp)) + rcu_cleanup_dead_rnp(rnp); + /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. -- cgit v1.2.3 From d19fb8d1f3f66cc342d30aa48f090c70afb753ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 12:56:16 -0700 Subject: rcu: Don't migrate blocked tasks even if all corresponding CPUs offline When the last CPU associated with a given leaf rcu_node structure goes offline, something must be done about the tasks queued on that rcu_node structure. Each of these tasks has been preempted on one of the leaf rcu_node structure's CPUs while in an RCU read-side critical section that it has not yet exited. Handling these tasks is the job of rcu_preempt_offline_tasks(), which migrates them from the leaf rcu_node structure to the root rcu_node structure. Unfortunately, this migration has to be done one task at a time because each task's allegiance must be shifted from the original leaf rcu_node to the root, so that future attempts to deal with these tasks will acquire the root rcu_node structure's ->lock rather than that of the leaf. Worse yet, this migration must be done with interrupts disabled, which is not so good for realtime response, especially given that there is no bound on the number of tasks on a given rcu_node structure's list. (OK, OK, there is a bound, it is just that it is unreasonably large, especially on 64-bit systems.) This was not considered a problem back when rcu_preempt_offline_tasks() was first written because realtime systems were assumed not to do CPU-hotplug operations while real-time applications were running. This assumption has proved of dubious validity given that people are starting to run multiple realtime applications on a single SMP system and that it is common practice to offline then online a CPU before starting its real-time application in order to clear extraneous processing off of that CPU.
So we now need CPU hotplug operations to avoid undue latencies. This commit therefore avoids migrating these tasks, instead letting them be dequeued one by one from the original leaf rcu_node structure by rcu_read_unlock_special(). This means that the clearing of bits from the upper-level rcu_node structures must be deferred until the last such task has been dequeued, because otherwise subsequent grace periods won't wait on them. This commit has the beneficial side effect of simplifying the CPU-hotplug code for TREE_PREEMPT_RCU, especially in CONFIG_RCU_BOOST builds. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 126 +---------------------------------------------- 1 file changed, 2 insertions(+), 124 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8a2b84157d34..d594da48f4b4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -103,6 +103,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state_p = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); /* * Tell them what RCU they are running. @@ -545,92 +547,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU -/* - * Handle tasklist migration for case in which all CPUs covered by the - * specified rcu_node have gone offline. Move them up to the root - * rcu_node. The reason for not just moving them to the immediate - * parent is to remove the need for rcu_read_unlock_special() to - * make more than two attempts to acquire the target rcu_node's lock. - * Returns true if there were tasks blocking the current RCU grace - * period. - * - * Returns 1 if there was previously a task blocking the current grace - * period on the specified rcu_node structure. - * - * The caller must hold rnp->lock with irqs disabled. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - struct list_head *lp; - struct list_head *lp_root; - int retval = 0; - struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *t; - - if (rnp == rnp_root) { - WARN_ONCE(1, "Last CPU thought to be offlined?"); - return 0; /* Shouldn't happen: at least one CPU online. */ - } - - /* If we are on an internal node, complain bitterly. */ - WARN_ON_ONCE(rnp != rdp->mynode); - - /* - * Move tasks up to root rcu_node. Don't try to get fancy for - * this corner-case operation -- just put this node's tasks - * at the head of the root node's list, and update the root node's - * ->gp_tasks and ->exp_tasks pointers to those of this node's, - * if non-NULL. This might result in waiting for more tasks than - * absolutely necessary, but this is a good performance/complexity - * tradeoff. 
- */ - if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) - retval |= RCU_OFL_TASKS_NORM_GP; - if (rcu_preempted_readers_exp(rnp)) - retval |= RCU_OFL_TASKS_EXP_GP; - lp = &rnp->blkd_tasks; - lp_root = &rnp_root->blkd_tasks; - while (!list_empty(lp)) { - t = list_entry(lp->next, typeof(*t), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - list_del(&t->rcu_node_entry); - t->rcu_blocked_node = rnp_root; - list_add(&t->rcu_node_entry, lp_root); - if (&t->rcu_node_entry == rnp->gp_tasks) - rnp_root->gp_tasks = rnp->gp_tasks; - if (&t->rcu_node_entry == rnp->exp_tasks) - rnp_root->exp_tasks = rnp->exp_tasks; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp_root->boost_tasks = rnp->boost_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ - } - - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; -#ifdef CONFIG_RCU_BOOST - rnp->boost_tasks = NULL; - /* - * In case root is being boosted and leaf was not. Make sure - * that we boost the tasks blocking the current grace period - * in this case. - */ - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks && - rnp_root->boost_tasks != rnp_root->exp_tasks) - rnp_root->boost_tasks = rnp_root->gp_tasks; - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - - return retval; -} - #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* @@ -979,13 +895,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU -/* Because preemptible RCU does not exist, no quieting of tasks. */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* * Because there is no preemptible RCU, there can be no readers blocked. */ @@ -1023,23 +932,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections, and - * such non-existent tasks cannot possibly have been blocking the current - * grace period. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - return 0; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1058,20 +950,6 @@ void synchronize_rcu_expedited(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, there is never any need to - * report on tasks preempted in RCU read-side critical sections during - * expedited RCU grace periods. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). -- cgit v1.2.3 From 96e92021d4fad8543d598b5e4926cb34fd40c4c4 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 31 Oct 2014 14:09:23 -0700 Subject: rcu: Make use of rcu_preempt_has_tasks() Given that there is now a rcu_preempt_has_tasks() function that checks to see if the ->blkd_tasks list is non-empty, this commit makes use of it. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d594da48f4b4..bcc6d9134d47 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -540,7 +540,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (!list_empty(&rnp->blkd_tasks)) + if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } @@ -706,7 +706,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (list_empty(&rnp->blkd_tasks)) { + if (!rcu_preempt_has_tasks(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { rnp->exp_tasks = rnp->blkd_tasks.next; @@ -985,7 +985,7 @@ void exit_rcu(void) static void rcu_initiate_boost_trace(struct rcu_node *rnp) { - if (list_empty(&rnp->blkd_tasks)) + if (!rcu_preempt_has_tasks(rnp)) rnp->n_balk_blkd_tasks++; else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) rnp->n_balk_exp_gp_tasks++; -- cgit v1.2.3 From 3e9f5c70d85060ff0db71826e2048daffec336e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Nov 2014 18:15:17 -0800 Subject: rcu: Don't spawn rcub kthreads on root rcu_node structure Now that offlining CPUs no longer moves leaf rcu_node structures' ->blkd_tasks lists to the root, there is no way for the root rcu_node structure's ->blkd_tasks list to be nonempty, unless the root node is also the sole leaf node. This commit therefore refrains from creating an rcub kthread for the root rcu_node structure unless it is also the sole leaf. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bcc6d9134d47..3e64bb197b1a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1352,12 +1352,8 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state_p); - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state_p, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - } + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } static void rcu_prepare_kthreads(int cpu) -- cgit v1.2.3 From 5d0b024973027556b48a09bb36b55dc853ec7c6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Nov 2014 08:07:08 -0800 Subject: rcu: Don't bother affinitying rcub kthreads away from offline CPUs When rcu_boost_kthread_setaffinity() sees that all CPUs for a given rcu_node structure are now offline, it affinities the corresponding RCU-boost ("rcub") kthread away from those CPUs. This is pointless because the kthread cannot run on those offline CPUs in any case. This commit therefore removes this unneeded code. Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree_plugin.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3e64bb197b1a..1fac68220999 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1322,12 +1322,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { + if (cpumask_weight(cm) == 0) cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); - } set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -- cgit v1.2.3 From abaf3f9d275b8d856ae5e47531e40c0bfeac012b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Nov 2014 16:30:01 +0800 Subject: rcu: Revert "Allow post-unlock reference for rt_mutex" to avoid priority-inversion The patch dfeb9765ce3c ("Allow post-unlock reference for rt_mutex") ensured that rcu-boost remained safe even when the rt_mutex has a post-unlock reference. But an rt_mutex allowing a post-unlock reference is definitely a bug, and it was fixed by commit 27e35715df54 ("rtmutex: Plug slow unlock race"). This fix made the previous patch (dfeb9765ce3c) unnecessary. Even worse, the priority inversion introduced by the previous patch still exists. rcu_read_unlock_special() { rt_mutex_unlock(&rnp->boost_mtx); /* Priority inversion: * the current task has just been deboosted, so it can immediately * be preempted as a low-priority task and may wait a long time * before being rescheduled. Meanwhile, the rcu-booster also waits * on this low-priority task and sleeps, so the priority inversion * prevents the rcu-booster from working as expected. */ complete(&rnp->boost_completion); } Just revert the patch to avoid this. Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1fac68220999..625e26040e6b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -429,10 +429,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (drop_boost_mutex) { + if (drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); - complete(&rnp->boost_completion); - } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -1081,15 +1079,11 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - /* Wait for boostee to be done w/boost_mtx before reinitializing. */ - wait_for_completion(&rnp->boost_completion); - return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; } -- cgit v1.2.3 From fc908ed33e7c1428f799abb12399f906da03b397 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 09:57:48 -0800 Subject: rcu: Make RCU_CPU_STALL_INFO include number of fqs attempts One way that an RCU CPU stall warning can happen is if the grace-period kthread is not allowed to execute.
One proxy for this kthread's forward progress is the number of force-quiescent-state (fqs) scans. This commit therefore adds the number of fqs scans to the RCU CPU stall warning printouts when CONFIG_RCU_CPU_STALL_INFO=y. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..769384d77437 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1898,11 +1898,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } -- cgit v1.2.3 From e3663b1024d1f94688e5233440ad67a9bc10b94e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 20:26:55 -0800 Subject: rcu: Handle gpnum/completed wrap while dyntick idle Subtle race conditions can result if a CPU stays in dyntick-idle mode long enough for the ->gpnum and ->completed fields to wrap. For example, consider the following sequence of events: o CPU 1 encounters a quiescent state while waiting for grace period 5 to complete, but then enters dyntick-idle mode. o While CPU 1 is in dyntick-idle mode, the grace-period counters wrap around so that the grace period number is now 4. o Just as CPU 1 exits dyntick-idle mode, grace period 4 completes and grace period 5 begins. o The quiescent state that CPU 1 passed through during the old grace period 5 looks like it applies to the new grace period 5. Therefore, the new grace period 5 completes without CPU 1 having passed through a quiescent state. This could clearly be a fatal surprise to any long-running RCU read-side critical section that happened to be running on CPU 1 at the time. At one time, this was not a problem, given that it takes significant time for the grace-period counters to overflow even on 32-bit systems. However, with the advent of NO_HZ_FULL and SMP embedded systems, arbitrarily long idle periods are now becoming quite feasible. It is therefore time to close this race. This commit therefore avoids this race condition by having the quiescent-state forcing code detect when a CPU is falling too far behind, and setting a new rcu_data field ->gpwrap when this happens. Whenever this new ->gpwrap field is set, the CPU's ->gpnum and ->completed fields are known to be untrustworthy, and can be ignored, along with any associated quiescent states. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 769384d77437..81ff8b9a5a39 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1605,7 +1605,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. 
*/ - if (rdp->completed != rnp->completed && + if ((rdp->completed != rnp->completed || + unlikely(ACCESS_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); -- cgit v1.2.3 From 9733e4f0a973a354034f5dd603b4142a3095c85f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 12:49:13 -0800 Subject: rcu: Make _batches_completed() functions return unsigned long Long ago, the various ->completed fields were of type long, but now are unsigned long due to signed-integer-overflow concerns. However, the various _batches_completed() functions remained of type long, even though their only purpose in life is to return the corresponding ->completed field. This patch cleans this up by changing these functions' return types to unsigned long. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..f69300d4a51f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -117,7 +117,7 @@ static void __init rcu_bootup_announce(void) * Return the number of RCU-preempt batches processed thus far * for debug and statistics. */ -static long rcu_batches_completed_preempt(void) +static unsigned long rcu_batches_completed_preempt(void) { return rcu_preempt_state.completed; } @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_preempt(); } @@ -935,7 +935,7 @@ static void __init rcu_bootup_announce(void) /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_sched(); } -- cgit v1.2.3 From 917963d0b30f9c4153c372c165178501d97b6b55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 17:10:16 -0800 Subject: rcutorture: Check from beginning to end of grace period Currently, rcutorture's Reader Batch checks measure from the end of the previous grace period to the end of the current one. This commit tightens up these checks by measuring from the start and end of the same grace period. This involves adding rcu_batches_started() and friends corresponding to the existing rcu_batches_completed() and friends. We leave SRCU alone for the moment, as it does not yet have a way of tracking both ends of its grace periods. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f69300d4a51f..07e61a04de1d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -113,25 +113,6 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } -/* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -static unsigned long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. 
- */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -932,15 +913,6 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. -- cgit v1.2.3 From a94844b22a2e2b9155bbc0878c507850477221c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Dec 2014 07:37:48 -0800 Subject: rcu: Optionally run grace-period kthreads at real-time priority Recent testing has shown that under heavy load, running RCU's grace-period kthreads at real-time priority can improve performance (according to 0day test robot) and reduce the incidence of RCU CPU stall warnings. However, most systems do just fine with the default non-realtime priorities for these kthreads, and it does not make sense to expose the entire user base to any risk stemming from this change, given that this change is of use only to a few users running extremely heavy workloads. Therefore, this commit allows users to specify realtime priorities for the grace-period kthreads, but leaves them running SCHED_OTHER by default. The realtime priority may be specified at build time via the RCU_KTHREAD_PRIO Kconfig parameter, or at boot time via the rcutree.kthread_prio parameter. Either way, 0 says to continue the default SCHED_OTHER behavior and values from 1-99 specify that priority of SCHED_FIFO behavior. Note that a value of 0 is not permitted when the RCU_BOOST Kconfig parameter is specified. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 81ff8b9a5a39..dcb32638f88b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -34,10 +34,6 @@ #include "../locking/rtmutex_common.h" -/* rcuc/rcub kthread realtime priority */ -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -module_param(kthread_prio, int, 0644); - /* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. -- cgit v1.2.3
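The two lines removed above (kthread_prio and its module_param()) presumably move to kernel/rcu/tree.c, where the grace-period kthreads are spawned; that half of the change is not shown here, since this listing is limited to tree_plugin.h. As a rough, minimal sketch of how such a knob is typically consumed (not the verbatim upstream code; the helper name rcu_apply_kthread_prio() is invented purely for illustration):

#include <linux/moduleparam.h>
#include <linux/sched.h>

/* Sketch only: mirrors the two lines removed from tree_plugin.h above. */
static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
module_param(kthread_prio, int, 0644);

/* Hypothetical helper: apply the requested policy to a freshly spawned GP kthread. */
static void rcu_apply_kthread_prio(struct task_struct *t)
{
	struct sched_param sp;

	if (!kthread_prio)
		return;				/* 0 keeps the default SCHED_OTHER behavior. */
	sp.sched_priority = kthread_prio;	/* 1-99 selects a SCHED_FIFO priority. */
	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}

Booting with rcutree.kthread_prio=50 would then run the grace-period kthreads at SCHED_FIFO priority 50, while the default of 0 preserves the existing SCHED_OTHER behavior.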