From 41050a009640f9f330a5b916563ca7faf853a98c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Dec 2014 12:31:27 -0800 Subject: rcu: Fix rcu_barrier() race that could result in too-short wait The rcu_barrier() no-callbacks check for no-CBs CPUs has race conditions. It checks a given CPU's lists of callbacks, and if all three no-CBs lists are empty, ignores that CPU. However, these three lists could potentially be empty even when callbacks are present if the check executed just as the callbacks were being moved from one list to another. It turns out that recent versions of rcutorture can spot this race. This commit plugs this hole by consolidating the per-list counts of no-CBs callbacks into a single count, which is incremented before the corresponding callback is posted and decremented after it is invoked. Then rcu_barrier() checks this single count to reliably determine whether the corresponding CPU has no-CBs callbacks. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..e5c43b7f63f2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2056,9 +2056,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + unsigned long ret; +#ifdef CONFIG_PROVE_RCU struct rcu_head *rhp; +#endif /* #ifdef CONFIG_PROVE_RCU */ + + /* + * Check count of all no-CBs callbacks awaiting invocation. + * There needs to be a barrier before this function is called, + * but associated with a prior determination that no more + * callbacks would be posted. In the worst case, the first + * barrier in _rcu_barrier() suffices (but the caller cannot + * necessarily rely on this, not a substitute for the caller + * getting the concurrency design right!). There must also be + * a barrier between the following load and posting of a callback + * (if a callback is in fact needed). This is associated with an + * atomic_inc() in the caller. + */ + ret = atomic_long_read(&rdp->nocb_q_count); - /* No-CBs CPUs might have callbacks on any of three lists. */ +#ifdef CONFIG_PROVE_RCU rhp = ACCESS_ONCE(rdp->nocb_head); if (!rhp) rhp = ACCESS_ONCE(rdp->nocb_gp_head); @@ -2072,8 +2089,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) cpu, rhp->func); WARN_ON_ONCE(1); } +#endif /* #ifdef CONFIG_PROVE_RCU */ - return !!rhp; + return !!ret; } /* @@ -2095,9 +2113,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, struct task_struct *t; /* Enqueue the callback on the nocb list and update counts. */ + atomic_long_add(rhcount, &rdp->nocb_q_count); + /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ old_rhpp = xchg(&rdp->nocb_tail, rhtp); ACCESS_ONCE(*old_rhpp) = rhp; - atomic_long_add(rhcount, &rdp->nocb_q_count); atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ @@ -2288,9 +2307,6 @@ wait_again: /* Move callbacks to wait-for-GP list, which is empty.
*/ ACCESS_ONCE(rdp->nocb_head) = NULL; rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); - rdp->nocb_gp_count_lazy = - atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); gotcbs = true; } @@ -2338,9 +2354,6 @@ wait_again: /* Append callbacks to follower's "done" list. */ tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); *tail = rdp->nocb_gp_head; - atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); - atomic_long_add(rdp->nocb_gp_count_lazy, - &rdp->nocb_follower_count_lazy); smp_mb__after_atomic(); /* Store *tail before wakeup. */ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* @@ -2415,13 +2428,11 @@ static int rcu_nocb_kthread(void *arg) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); ACCESS_ONCE(rdp->nocb_follower_head) = NULL; tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); - c = atomic_long_xchg(&rdp->nocb_follower_count, 0); - cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); - rdp->nocb_p_count += c; - rdp->nocb_p_count_lazy += cl; /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); + trace_rcu_batch_start(rdp->rsp->name, + atomic_long_read(&rdp->nocb_q_count_lazy), + atomic_long_read(&rdp->nocb_q_count), -1); c = cl = 0; while (list) { next = list->next; @@ -2443,9 +2454,9 @@ static int rcu_nocb_kthread(void *arg) list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); - ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) = - rdp->nocb_p_count_lazy - cl; + smp_mb__before_atomic(); /* _add after CB invocation. */ + atomic_long_add(-c, &rdp->nocb_q_count); + atomic_long_add(-cl, &rdp->nocb_q_count_lazy); rdp->n_nocbs_invoked += c; } return 0; -- cgit v1.2.3 From b08ea27d95bcaee6d9cf4edd64f373006661424a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Oct 2014 15:39:39 -0700 Subject: rcu: Protect rcu_boost() lockless accesses with ACCESS_ONCE() This commit prevents random compiler optimizations by applying ACCESS_ONCE() to lockless accesses. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..d59913ef8360 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1127,7 +1127,8 @@ static int rcu_boost(struct rcu_node *rnp) struct task_struct *t; struct list_head *tb; - if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + if (ACCESS_ONCE(rnp->exp_tasks) == NULL && + ACCESS_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); -- cgit v1.2.3 From 74e871ac6cb17f67cbefb569d98a8d05de666e07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Oct 2014 21:08:53 -0700 Subject: rcu: Rename "empty" to "empty_norm" in preparation for boost rework This commit undertakes a simple variable renaming to make way for some rework of RCU priority boosting. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d59913ef8360..3d22f0b2dea9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -313,8 +313,8 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, */ void rcu_read_unlock_special(struct task_struct *t) { - int empty; int empty_exp; + int empty_norm; int empty_exp_now; unsigned long flags; struct list_head *np; @@ -367,7 +367,7 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_blocked_readers_cgp(rnp); + empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -393,7 +393,7 @@ void rcu_read_unlock_special(struct task_struct *t) * so we must take a snapshot of the expedited state. */ empty_exp_now = !rcu_preempted_readers_exp(rnp); - if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, -- cgit v1.2.3 From 8af3a5e78cfb63abe8813743946b7bd5a8a3134c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 11:22:37 -0700 Subject: rcu: Abstract rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() This commit abstracts rcu_cleanup_dead_rnp() from rcu_cleanup_dead_cpu() in preparation for the rework of RCU priority boosting. This new function will be invoked from rcu_read_unlock_special() in the reworked scheme, which is why rcu_cleanup_dead_rnp() assumes that the leaf rcu_node structure's ->qsmaskinit field has already been updated. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3d22f0b2dea9..d044b9cbbd97 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -306,6 +306,15 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, return np; } +/* + * Return true if the specified rcu_node structure has tasks that were + * preempted within an RCU read-side critical section. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blkd_tasks); +} + /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU @@ -967,6 +976,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore(&rnp->lock, flags); } +/* + * Because there is no preemptible RCU, there can be no readers blocked. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return false; +} + #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* -- cgit v1.2.3 From b6a932d1d9840727eee619d455bdeeedaa205be9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 12:05:04 -0700 Subject: rcu: Make rcu_read_unlock_special() propagate ->qsmaskinit bit clearing This commit causes rcu_read_unlock_special() to propagate ->qsmaskinit bit clearing up the rcu_node tree once a given rcu_node structure's blkd_tasks list becomes empty. 
This is the final commit in preparation for the rework of RCU priority boosting: It enables preempted tasks to remain queued on their rcu_node structure even after all of that rcu_node structure's CPUs have gone offline. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d044b9cbbd97..8a2b84157d34 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -322,9 +322,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) */ void rcu_read_unlock_special(struct task_struct *t) { - int empty_exp; - int empty_norm; - int empty_exp_now; + bool empty; + bool empty_exp; + bool empty_norm; + bool empty_exp_now; unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST @@ -376,6 +377,7 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } + empty = !rcu_preempt_has_tasks(rnp); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -395,6 +397,14 @@ void rcu_read_unlock_special(struct task_struct *t) drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; #endif /* #ifdef CONFIG_RCU_BOOST */ + /* + * If this was the last task on the list, go see if we + * need to propagate ->qsmaskinit bit clearing up the + * rcu_node tree. + */ + if (!empty && !rcu_preempt_has_tasks(rnp)) + rcu_cleanup_dead_rnp(rnp); + /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. -- cgit v1.2.3 From d19fb8d1f3f66cc342d30aa48f090c70afb753ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Oct 2014 12:56:16 -0700 Subject: rcu: Don't migrate blocked tasks even if all corresponding CPUs offline When the last CPU associated with a given leaf rcu_node structure goes offline, something must be done about the tasks queued on that rcu_node structure. Each of these tasks has been preempted on one of the leaf rcu_node structure's CPUs while in an RCU read-side critical section that it has not yet exited. Handling these tasks is the job of rcu_preempt_offline_tasks(), which migrates them from the leaf rcu_node structure to the root rcu_node structure. Unfortunately, this migration has to be done one task at a time because each task's allegiance must be shifted from the original leaf rcu_node to the root, so that future attempts to deal with these tasks will acquire the root rcu_node structure's ->lock rather than that of the leaf. Worse yet, this migration must be done with interrupts disabled, which is not so good for realtime response, especially given that there is no bound on the number of tasks on a given rcu_node structure's list. (OK, OK, there is a bound, it is just that it is unreasonably large, especially on 64-bit systems.) This was not considered a problem back when rcu_preempt_offline_tasks() was first written because realtime systems were assumed not to do CPU-hotplug operations while real-time applications were running. This assumption has proved of dubious validity given that people are starting to run multiple realtime applications on a single SMP system and that it is common practice to offline then online a CPU before starting its real-time application in order to clear extraneous processing off of that CPU.
So we now need CPU hotplug operations to avoid undue latencies. This commit therefore avoids migrating these tasks, instead letting them be dequeued one by one from the original leaf rcu_node structure by rcu_read_unlock_special(). This means that the clearing of bits from the upper-level rcu_node structures must be deferred until the last such task has been dequeued, because otherwise subsequent grace periods won't wait on them. This commit has the beneficial side effect of simplifying the CPU-hotplug code for TREE_PREEMPT_RCU, especially in CONFIG_RCU_BOOST builds. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 126 +---------------------------------------------- 1 file changed, 2 insertions(+), 124 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8a2b84157d34..d594da48f4b4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -103,6 +103,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state_p = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); /* * Tell them what RCU they are running. @@ -545,92 +547,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU -/* - * Handle tasklist migration for case in which all CPUs covered by the - * specified rcu_node have gone offline. Move them up to the root - * rcu_node. The reason for not just moving them to the immediate - * parent is to remove the need for rcu_read_unlock_special() to - * make more than two attempts to acquire the target rcu_node's lock. - * Returns true if there were tasks blocking the current RCU grace - * period. - * - * Returns 1 if there was previously a task blocking the current grace - * period on the specified rcu_node structure. - * - * The caller must hold rnp->lock with irqs disabled. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - struct list_head *lp; - struct list_head *lp_root; - int retval = 0; - struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *t; - - if (rnp == rnp_root) { - WARN_ONCE(1, "Last CPU thought to be offlined?"); - return 0; /* Shouldn't happen: at least one CPU online. */ - } - - /* If we are on an internal node, complain bitterly. */ - WARN_ON_ONCE(rnp != rdp->mynode); - - /* - * Move tasks up to root rcu_node. Don't try to get fancy for - * this corner-case operation -- just put this node's tasks - * at the head of the root node's list, and update the root node's - * ->gp_tasks and ->exp_tasks pointers to those of this node's, - * if non-NULL. This might result in waiting for more tasks than - * absolutely necessary, but this is a good performance/complexity - * tradeoff. 
- */ - if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) - retval |= RCU_OFL_TASKS_NORM_GP; - if (rcu_preempted_readers_exp(rnp)) - retval |= RCU_OFL_TASKS_EXP_GP; - lp = &rnp->blkd_tasks; - lp_root = &rnp_root->blkd_tasks; - while (!list_empty(lp)) { - t = list_entry(lp->next, typeof(*t), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - list_del(&t->rcu_node_entry); - t->rcu_blocked_node = rnp_root; - list_add(&t->rcu_node_entry, lp_root); - if (&t->rcu_node_entry == rnp->gp_tasks) - rnp_root->gp_tasks = rnp->gp_tasks; - if (&t->rcu_node_entry == rnp->exp_tasks) - rnp_root->exp_tasks = rnp->exp_tasks; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp_root->boost_tasks = rnp->boost_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ - } - - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; -#ifdef CONFIG_RCU_BOOST - rnp->boost_tasks = NULL; - /* - * In case root is being boosted and leaf was not. Make sure - * that we boost the tasks blocking the current grace period - * in this case. - */ - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks && - rnp_root->boost_tasks != rnp_root->exp_tasks) - rnp_root->boost_tasks = rnp_root->gp_tasks; - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - - return retval; -} - #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* @@ -979,13 +895,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU -/* Because preemptible RCU does not exist, no quieting of tasks. */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* * Because there is no preemptible RCU, there can be no readers blocked. */ @@ -1023,23 +932,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections, and - * such non-existent tasks cannot possibly have been blocking the current - * grace period. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - return 0; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1058,20 +950,6 @@ void synchronize_rcu_expedited(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, there is never any need to - * report on tasks preempted in RCU read-side critical sections during - * expedited RCU grace periods. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). -- cgit v1.2.3 From 96e92021d4fad8543d598b5e4926cb34fd40c4c4 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 31 Oct 2014 14:09:23 -0700 Subject: rcu: Make use of rcu_preempt_has_tasks() Given that there is now a rcu_preempt_has_tasks() function that checks to see if the ->blkd_tasks list is non-empty, this commit makes use of it. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d594da48f4b4..bcc6d9134d47 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -540,7 +540,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (!list_empty(&rnp->blkd_tasks)) + if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } @@ -706,7 +706,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (list_empty(&rnp->blkd_tasks)) { + if (!rcu_preempt_has_tasks(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { rnp->exp_tasks = rnp->blkd_tasks.next; @@ -985,7 +985,7 @@ void exit_rcu(void) static void rcu_initiate_boost_trace(struct rcu_node *rnp) { - if (list_empty(&rnp->blkd_tasks)) + if (!rcu_preempt_has_tasks(rnp)) rnp->n_balk_blkd_tasks++; else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) rnp->n_balk_exp_gp_tasks++; -- cgit v1.2.3 From 3e9f5c70d85060ff0db71826e2048daffec336e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Nov 2014 18:15:17 -0800 Subject: rcu: Don't spawn rcub kthreads on root rcu_node structure Now that offlining CPUs no longer moves leaf rcu_node structures' ->blkd_tasks lists to the root, there is no way for the root rcu_node structure's ->blkd_tasks list to be nonempty, unless the root node is also the sole leaf node. This commit therefore refrains from creating an rcub kthread for the root rcu_node structure unless it is also the sole leaf. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bcc6d9134d47..3e64bb197b1a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1352,12 +1352,8 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state_p); - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state_p, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - } + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } static void rcu_prepare_kthreads(int cpu) -- cgit v1.2.3 From 5d0b024973027556b48a09bb36b55dc853ec7c6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Nov 2014 08:07:08 -0800 Subject: rcu: Don't bother affinitying rcub kthreads away from offline CPUs When rcu_boost_kthread_setaffinity() sees that all CPUs for a given rcu_node structure are now offline, it affinities the corresponding RCU-boost ("rcub") kthread away from those CPUs. This is pointless because the kthread cannot run on those offline CPUs in any case. This commit therefore removes this unneeded code. Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree_plugin.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3e64bb197b1a..1fac68220999 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1322,12 +1322,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { + if (cpumask_weight(cm) == 0) cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); - } set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -- cgit v1.2.3 From abaf3f9d275b8d856ae5e47531e40c0bfeac012b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Nov 2014 16:30:01 +0800 Subject: rcu: Revert "Allow post-unlock reference for rt_mutex" to avoid priority-inversion The patch dfeb9765ce3c ("Allow post-unlock reference for rt_mutex") ensured that rcu-boost remained safe even when the rt_mutex has a post-unlock reference. But an rt_mutex allowing a post-unlock reference is definitely a bug, and it was fixed by commit 27e35715df54 ("rtmutex: Plug slow unlock race"). This fix made the previous patch (dfeb9765ce3c) unnecessary. Even worse, the priority inversion introduced by the previous patch still exists. rcu_read_unlock_special() { rt_mutex_unlock(&rnp->boost_mtx); /* Priority inversion: * the current task has just been deboosted, so it can immediately * be preempted as a low-priority task and may wait a long time * before being rescheduled. Meanwhile, the rcu-booster also waits * on this low-priority task and sleeps, so the priority inversion * prevents the rcu-booster from working as expected. */ complete(&rnp->boost_completion); } Just revert the patch to avoid this. Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1fac68220999..625e26040e6b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -429,10 +429,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (drop_boost_mutex) { + if (drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); - complete(&rnp->boost_completion); - } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -1081,15 +1079,11 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - /* Wait for boostee to be done w/boost_mtx before reinitializing. */ - wait_for_completion(&rnp->boost_completion); - return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; } -- cgit v1.2.3 From fc908ed33e7c1428f799abb12399f906da03b397 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 09:57:48 -0800 Subject: rcu: Make RCU_CPU_STALL_INFO include number of fqs attempts One way that an RCU CPU stall warning can happen is if the grace-period kthread is not allowed to execute.
One proxy for this kthread's forward progress is the number of force-quiescent-state (fqs) scans. This commit therefore adds the number of fqs scans to the RCU CPU stall warning printouts when CONFIG_RCU_CPU_STALL_INFO=y. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..769384d77437 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1898,11 +1898,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } -- cgit v1.2.3 From e3663b1024d1f94688e5233440ad67a9bc10b94e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Dec 2014 20:26:55 -0800 Subject: rcu: Handle gpnum/completed wrap while dyntick idle Subtle race conditions can result if a CPU stays in dyntick-idle mode long enough for the ->gpnum and ->completed fields to wrap. For example, consider the following sequence of events: o CPU 1 encounters a quiescent state while waiting for grace period 5 to complete, but then enters dyntick-idle mode. o While CPU 1 is in dyntick-idle mode, the grace-period counters wrap around so that the grace period number is now 4. o Just as CPU 1 exits dyntick-idle mode, grace period 4 completes and grace period 5 begins. o The quiescent state that CPU 1 passed through during the old grace period 5 looks like it applies to the new grace period 5. Therefore, the new grace period 5 completes without CPU 1 having passed through a quiescent state. This could clearly be a fatal surprise to any long-running RCU read-side critical section that happened to be running on CPU 1 at the time. At one time, this was not a problem, given that it takes significant time for the grace-period counters to overflow even on 32-bit systems. However, with the advent of NO_HZ_FULL and SMP embedded systems, arbitrarily long idle periods are now becoming quite feasible. It is therefore time to close this race. This commit therefore avoids this race condition by having the quiescent-state forcing code detect when a CPU is falling too far behind, and setting a new rcu_data field ->gpwrap when this happens. Whenever this new ->gpwrap field is set, the CPU's ->gpnum and ->completed fields are known to be untrustworthy, and can be ignored, along with any associated quiescent states. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 769384d77437..81ff8b9a5a39 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1605,7 +1605,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. 
*/ - if (rdp->completed != rnp->completed && + if ((rdp->completed != rnp->completed || + unlikely(ACCESS_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); -- cgit v1.2.3 From 9733e4f0a973a354034f5dd603b4142a3095c85f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 12:49:13 -0800 Subject: rcu: Make _batches_completed() functions return unsigned long Long ago, the various ->completed fields were of type long, but now are unsigned long due to signed-integer-overflow concerns. However, the various _batches_completed() functions remained of type long, even though their only purpose in life is to return the corresponding ->completed field. This patch cleans this up by changing these functions' return types to unsigned long. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..f69300d4a51f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -117,7 +117,7 @@ static void __init rcu_bootup_announce(void) * Return the number of RCU-preempt batches processed thus far * for debug and statistics. */ -static long rcu_batches_completed_preempt(void) +static unsigned long rcu_batches_completed_preempt(void) { return rcu_preempt_state.completed; } @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_preempt(); } @@ -935,7 +935,7 @@ static void __init rcu_bootup_announce(void) /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_sched(); } -- cgit v1.2.3 From 917963d0b30f9c4153c372c165178501d97b6b55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 17:10:16 -0800 Subject: rcutorture: Check from beginning to end of grace period Currently, rcutorture's Reader Batch checks measure from the end of the previous grace period to the end of the current one. This commit tightens up these checks by measuring from the start and end of the same grace period. This involves adding rcu_batches_started() and friends corresponding to the existing rcu_batches_completed() and friends. We leave SRCU alone for the moment, as it does not yet have a way of tracking both ends of its grace periods. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f69300d4a51f..07e61a04de1d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -113,25 +113,6 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } -/* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -static unsigned long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. 
- */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -932,15 +913,6 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. -- cgit v1.2.3 From a94844b22a2e2b9155bbc0878c507850477221c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Dec 2014 07:37:48 -0800 Subject: rcu: Optionally run grace-period kthreads at real-time priority Recent testing has shown that under heavy load, running RCU's grace-period kthreads at real-time priority can improve performance (according to 0day test robot) and reduce the incidence of RCU CPU stall warnings. However, most systems do just fine with the default non-realtime priorities for these kthreads, and it does not make sense to expose the entire user base to any risk stemming from this change, given that this change is of use only to a few users running extremely heavy workloads. Therefore, this commit allows users to specify realtime priorities for the grace-period kthreads, but leaves them running SCHED_OTHER by default. The realtime priority may be specified at build time via the RCU_KTHREAD_PRIO Kconfig parameter, or at boot time via the rcutree.kthread_prio parameter. Either way, 0 says to continue the default SCHED_OTHER behavior and values from 1-99 specify that priority of SCHED_FIFO behavior. Note that a value of 0 is not permitted when the RCU_BOOST Kconfig parameter is specified. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 81ff8b9a5a39..dcb32638f88b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -34,10 +34,6 @@ #include "../locking/rtmutex_common.h" -/* rcuc/rcub kthread realtime priority */ -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -module_param(kthread_prio, int, 0644); - /* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. -- cgit v1.2.3
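The two lines removed above (kthread_prio and its module_param()) presumably move to kernel/rcu/tree.c, where the grace-period kthreads are spawned; that half of the change is not shown here, since this listing is limited to tree_plugin.h. As a rough, minimal sketch of how such a knob is typically consumed (not the verbatim upstream code; the helper name rcu_apply_kthread_prio() is invented purely for illustration):

#include <linux/moduleparam.h>
#include <linux/sched.h>

/* Sketch only: mirrors the two lines removed from tree_plugin.h above. */
static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
module_param(kthread_prio, int, 0644);

/* Hypothetical helper: apply the requested policy to a freshly spawned GP kthread. */
static void rcu_apply_kthread_prio(struct task_struct *t)
{
	struct sched_param sp;

	if (!kthread_prio)
		return;				/* 0 keeps the default SCHED_OTHER behavior. */
	sp.sched_priority = kthread_prio;	/* 1-99 selects a SCHED_FIFO priority. */
	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}

Booting with rcutree.kthread_prio=50 would then run the grace-period kthreads at SCHED_FIFO priority 50, while the default of 0 preserves the existing SCHED_OTHER behavior.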