Diffstat (limited to 'kernel/rcutree.c')
-rw-r--r--  kernel/rcutree.c  507
1 file changed, 371 insertions(+), 136 deletions(-)
| diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6c4a6722abfd..1050d6d3922c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -50,6 +50,8 @@  #include <linux/wait.h>  #include <linux/kthread.h>  #include <linux/prefetch.h> +#include <linux/delay.h> +#include <linux/stop_machine.h>  #include "rcutree.h"  #include <trace/events/rcu.h> @@ -196,7 +198,7 @@ void rcu_note_context_switch(int cpu)  EXPORT_SYMBOL_GPL(rcu_note_context_switch);  DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { -	.dynticks_nesting = DYNTICK_TASK_NESTING, +	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,  	.dynticks = ATOMIC_INIT(1),  }; @@ -208,8 +210,11 @@ module_param(blimit, int, 0);  module_param(qhimark, int, 0);  module_param(qlowmark, int, 0); -int rcu_cpu_stall_suppress __read_mostly; +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; +  module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644);  static void force_quiescent_state(struct rcu_state *rsp, int relaxed);  static int rcu_pending(int cpu); @@ -301,8 +306,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)  	return &rsp->node[0];  } -#ifdef CONFIG_SMP -  /*   * If the specified CPU is offline, tell the caller that it is in   * a quiescent state.  Otherwise, whack it with a reschedule IPI. @@ -317,30 +320,21 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)  static int rcu_implicit_offline_qs(struct rcu_data *rdp)  {  	/* -	 * If the CPU is offline, it is in a quiescent state.  We can -	 * trust its state not to change because interrupts are disabled. +	 * If the CPU is offline for more than a jiffy, it is in a quiescent +	 * state.  We can trust its state not to change because interrupts +	 * are disabled.  The reason for the jiffy's worth of slack is to +	 * handle CPUs initializing on the way up and finding their way +	 * to the idle loop on the way down.  	 */ -	if (cpu_is_offline(rdp->cpu)) { +	if (cpu_is_offline(rdp->cpu) && +	    ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {  		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");  		rdp->offline_fqs++;  		return 1;  	} - -	/* -	 * The CPU is online, so send it a reschedule IPI.  This forces -	 * it through the scheduler, and (inefficiently) also handles cases -	 * where idle loops fail to inform RCU about the CPU being idle. -	 */ -	if (rdp->cpu != smp_processor_id()) -		smp_send_reschedule(rdp->cpu); -	else -		set_need_resched(); -	rdp->resched_ipi++;  	return 0;  } -#endif /* #ifdef CONFIG_SMP */ -  /*   * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle   * @@ -366,6 +360,17 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)  	atomic_inc(&rdtp->dynticks);  	smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */  	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + +	/* +	 * The idle task is not permitted to enter the idle loop while +	 * in an RCU read-side critical section. 
+	 */ +	rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), +			   "Illegal idle entry in RCU read-side critical section."); +	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), +			   "Illegal idle entry in RCU-bh read-side critical section."); +	rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), +			   "Illegal idle entry in RCU-sched read-side critical section.");  }  /** @@ -389,10 +394,15 @@ void rcu_idle_enter(void)  	local_irq_save(flags);  	rdtp = &__get_cpu_var(rcu_dynticks);  	oldval = rdtp->dynticks_nesting; -	rdtp->dynticks_nesting = 0; +	WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); +	if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) +		rdtp->dynticks_nesting = 0; +	else +		rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;  	rcu_idle_enter_common(rdtp, oldval);  	local_irq_restore(flags);  } +EXPORT_SYMBOL_GPL(rcu_idle_enter);  /**   * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -462,7 +472,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)   * Exit idle mode, in other words, -enter- the mode in which RCU   * read-side critical sections can occur.   * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to   * allow for the possibility of usermode upcalls messing up our count   * of interrupt nesting level during the busy period that is just   * now starting. @@ -476,11 +486,15 @@ void rcu_idle_exit(void)  	local_irq_save(flags);  	rdtp = &__get_cpu_var(rcu_dynticks);  	oldval = rdtp->dynticks_nesting; -	WARN_ON_ONCE(oldval != 0); -	rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; +	WARN_ON_ONCE(oldval < 0); +	if (oldval & DYNTICK_TASK_NEST_MASK) +		rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; +	else +		rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;  	rcu_idle_exit_common(rdtp, oldval);  	local_irq_restore(flags);  } +EXPORT_SYMBOL_GPL(rcu_idle_exit);  /**   * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -581,6 +595,49 @@ int rcu_is_cpu_idle(void)  }  EXPORT_SYMBOL(rcu_is_cpu_idle); +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Is the current CPU online?  Disable preemption to avoid false positives + * that could otherwise happen due to the current CPU number being sampled, + * this task being preempted, its old CPU being taken offline, resuming + * on some other CPU, then determining that its old CPU is now offline. + * It is OK to use RCU on an offline processor during initial boot, hence + * the check for rcu_scheduler_fully_active.  Note also that it is OK + * for a CPU coming online to use RCU for one jiffy prior to marking itself + * online in the cpu_online_mask.  Similarly, it is OK for a CPU going + * offline to continue to use RCU for one jiffy after marking itself + * offline in the cpu_online_mask.  This leniency is necessary given the + * non-atomic nature of the online and offline processing, for example, + * the fact that a CPU enters the scheduler after completing the CPU_DYING + * notifiers. + * + * This is also why RCU internally marks CPUs online during the + * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. + * + * Disable checking if in an NMI handler because we cannot safely report + * errors from NMI handlers anyway. 
+ */ +bool rcu_lockdep_current_cpu_online(void) +{ +	struct rcu_data *rdp; +	struct rcu_node *rnp; +	bool ret; + +	if (in_nmi()) +		return 1; +	preempt_disable(); +	rdp = &__get_cpu_var(rcu_sched_data); +	rnp = rdp->mynode; +	ret = (rdp->grpmask & rnp->qsmaskinit) || +	      !rcu_scheduler_fully_active; +	preempt_enable(); +	return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +  #endif /* #ifdef CONFIG_PROVE_RCU */  /** @@ -595,8 +652,6 @@ int rcu_is_cpu_rrupt_from_idle(void)  	return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;  } -#ifdef CONFIG_SMP -  /*   * Snapshot the specified CPU's dynticks counter so that we can later   * credit them with an implicit quiescent state.  Return 1 if this CPU @@ -640,12 +695,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)  	return rcu_implicit_offline_qs(rdp);  } -#endif /* #ifdef CONFIG_SMP */ +static int jiffies_till_stall_check(void) +{ +	int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + +	/* +	 * Limit check must be consistent with the Kconfig limits +	 * for CONFIG_RCU_CPU_STALL_TIMEOUT. +	 */ +	if (till_stall_check < 3) { +		ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; +		till_stall_check = 3; +	} else if (till_stall_check > 300) { +		ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; +		till_stall_check = 300; +	} +	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +}  static void record_gp_stall_check_time(struct rcu_state *rsp)  {  	rsp->gp_start = jiffies; -	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +	rsp->jiffies_stall = jiffies + jiffies_till_stall_check();  }  static void print_other_cpu_stall(struct rcu_state *rsp) @@ -664,13 +735,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  		raw_spin_unlock_irqrestore(&rnp->lock, flags);  		return;  	} -	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - -	/* -	 * Now rat on any tasks that got kicked up to the root rcu_node -	 * due to CPU offlining. -	 */ -	ndetected = rcu_print_task_stall(rnp); +	rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  	/* @@ -678,8 +743,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  	 * See Documentation/RCU/stallwarn.txt for info on how to debug  	 * RCU CPU stall warnings.  	 */ -	printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", +	printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",  	       rsp->name); +	print_cpu_stall_info_begin();  	rcu_for_each_leaf_node(rsp, rnp) {  		raw_spin_lock_irqsave(&rnp->lock, flags);  		ndetected += rcu_print_task_stall(rnp); @@ -688,11 +754,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  			continue;  		for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)  			if (rnp->qsmask & (1UL << cpu)) { -				printk(" %d", rnp->grplo + cpu); +				print_cpu_stall_info(rsp, rnp->grplo + cpu);  				ndetected++;  			}  	} -	printk("} (detected by %d, t=%ld jiffies)\n", + +	/* +	 * Now rat on any tasks that got kicked up to the root rcu_node +	 * due to CPU offlining. 
+	 */ +	rnp = rcu_get_root(rsp); +	raw_spin_lock_irqsave(&rnp->lock, flags); +	ndetected = rcu_print_task_stall(rnp); +	raw_spin_unlock_irqrestore(&rnp->lock, flags); + +	print_cpu_stall_info_end(); +	printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n",  	       smp_processor_id(), (long)(jiffies - rsp->gp_start));  	if (ndetected == 0)  		printk(KERN_ERR "INFO: Stall ended before state dump start\n"); @@ -716,15 +793,18 @@ static void print_cpu_stall(struct rcu_state *rsp)  	 * See Documentation/RCU/stallwarn.txt for info on how to debug  	 * RCU CPU stall warnings.  	 */ -	printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", -	       rsp->name, smp_processor_id(), jiffies - rsp->gp_start); +	printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); +	print_cpu_stall_info_begin(); +	print_cpu_stall_info(rsp, smp_processor_id()); +	print_cpu_stall_info_end(); +	printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start);  	if (!trigger_all_cpu_backtrace())  		dump_stack();  	raw_spin_lock_irqsave(&rnp->lock, flags);  	if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) -		rsp->jiffies_stall = -			jiffies + RCU_SECONDS_TILL_STALL_RECHECK; +		rsp->jiffies_stall = jiffies + +				     3 * jiffies_till_stall_check() + 3;  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  	set_need_resched();  /* kick ourselves to get things going. */ @@ -807,6 +887,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct  			rdp->passed_quiesce = 0;  		} else  			rdp->qs_pending = 0; +		zero_cpu_stall_ticks(rdp);  	}  } @@ -943,6 +1024,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat   * in preparation for detecting the next grace period.  The caller must hold   * the root node's ->lock, which is released before return.  Hard irqs must   * be disabled. + * + * Note that it is legal for a dying CPU (which is marked as offline) to + * invoke this function.  This can happen when the dying CPU reports its + * quiescent state.   */  static void  rcu_start_gp(struct rcu_state *rsp, unsigned long flags) @@ -980,26 +1065,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)  	rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */  	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;  	record_gp_stall_check_time(rsp); - -	/* Special-case the common single-level case. */ -	if (NUM_RCU_NODES == 1) { -		rcu_preempt_check_blocked_tasks(rnp); -		rnp->qsmask = rnp->qsmaskinit; -		rnp->gpnum = rsp->gpnum; -		rnp->completed = rsp->completed; -		rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ -		rcu_start_gp_per_cpu(rsp, rnp, rdp); -		rcu_preempt_boost_start_gp(rnp); -		trace_rcu_grace_period_init(rsp->name, rnp->gpnum, -					    rnp->level, rnp->grplo, -					    rnp->grphi, rnp->qsmask); -		raw_spin_unlock_irqrestore(&rnp->lock, flags); -		return; -	} -  	raw_spin_unlock(&rnp->lock);  /* leave irqs disabled. */ -  	/* Exclude any concurrent CPU-hotplug operations. */  	raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */ @@ -1245,53 +1312,115 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)  /*   * Move a dying CPU's RCU callbacks to online CPU's callback list. - * Synchronization is not required because this function executes - * in stop_machine() context. + * Also record a quiescent state for this CPU for the current grace period. + * Synchronization and interrupt disabling are not required because + * this function executes in stop_machine() context.  
Therefore, cleanup + * operations that might block must be done later from the CPU_DEAD + * notifier. + * + * Note that the outgoing CPU's bit has already been cleared in the + * cpu_online_mask.  This allows us to randomly pick a callback + * destination from the bits set in that mask.   */ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)  {  	int i; -	/* current DYING CPU is cleared in the cpu_online_mask */ +	unsigned long mask;  	int receive_cpu = cpumask_any(cpu_online_mask);  	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);  	struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); +	RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ + +	/* First, adjust the counts. */ +	if (rdp->nxtlist != NULL) { +		receive_rdp->qlen_lazy += rdp->qlen_lazy; +		receive_rdp->qlen += rdp->qlen; +		rdp->qlen_lazy = 0; +		rdp->qlen = 0; +	} -	if (rdp->nxtlist == NULL) -		return;  /* irqs disabled, so comparison is stable. */ +	/* +	 * Next, move ready-to-invoke callbacks to be invoked on some +	 * other CPU.  These will not be required to pass through another +	 * grace period:  They are done, regardless of CPU. +	 */ +	if (rdp->nxtlist != NULL && +	    rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { +		struct rcu_head *oldhead; +		struct rcu_head **oldtail; +		struct rcu_head **newtail; + +		oldhead = rdp->nxtlist; +		oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; +		rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; +		*rdp->nxttail[RCU_DONE_TAIL] = *oldtail; +		*receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; +		newtail = rdp->nxttail[RCU_DONE_TAIL]; +		for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { +			if (receive_rdp->nxttail[i] == oldtail) +				receive_rdp->nxttail[i] = newtail; +			if (rdp->nxttail[i] == newtail) +				rdp->nxttail[i] = &rdp->nxtlist; +		} +	} -	*receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; -	receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; -	receive_rdp->qlen += rdp->qlen; -	receive_rdp->n_cbs_adopted += rdp->qlen; -	rdp->n_cbs_orphaned += rdp->qlen; +	/* +	 * Finally, put the rest of the callbacks at the end of the list. +	 * The ones that made it partway through get to start over:  We +	 * cannot assume that grace periods are synchronized across CPUs. +	 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but +	 * this does not seem compelling.  Not yet, anyway.) +	 */ +	if (rdp->nxtlist != NULL) { +		*receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; +		receive_rdp->nxttail[RCU_NEXT_TAIL] = +				rdp->nxttail[RCU_NEXT_TAIL]; +		receive_rdp->n_cbs_adopted += rdp->qlen; +		rdp->n_cbs_orphaned += rdp->qlen; + +		rdp->nxtlist = NULL; +		for (i = 0; i < RCU_NEXT_SIZE; i++) +			rdp->nxttail[i] = &rdp->nxtlist; +	} -	rdp->nxtlist = NULL; -	for (i = 0; i < RCU_NEXT_SIZE; i++) -		rdp->nxttail[i] = &rdp->nxtlist; -	rdp->qlen = 0; +	/* +	 * Record a quiescent state for the dying CPU.  This is safe +	 * only because we have already cleared out the callbacks. +	 * (Otherwise, the RCU core might try to schedule the invocation +	 * of callbacks on this now-offline CPU, which would be bad.) +	 */ +	mask = rdp->grpmask;	/* rnp->grplo is constant. */ +	trace_rcu_grace_period(rsp->name, +			       rnp->gpnum + 1 - !!(rnp->qsmask & mask), +			       "cpuofl"); +	rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); +	/* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). 
*/  }  /* - * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy - * and move all callbacks from the outgoing CPU to the current one. + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context.  Do the remainder of the cleanup.   * There can only be one CPU hotplug operation at a time, so no other   * CPU can be attempting to update rcu_cpu_kthread_task.   */ -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  {  	unsigned long flags;  	unsigned long mask;  	int need_report = 0;  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); -	struct rcu_node *rnp; +	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rnp. */ +	/* Adjust any no-longer-needed kthreads. */  	rcu_stop_cpu_kthread(cpu); +	rcu_node_kthread_setaffinity(rnp, -1); + +	/* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */  	/* Exclude any attempts to start a new grace period. */  	raw_spin_lock_irqsave(&rsp->onofflock, flags);  	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ -	rnp = rdp->mynode;	/* this is the outgoing CPU's rnp. */  	mask = rdp->grpmask;	/* rnp->grplo is constant. */  	do {  		raw_spin_lock(&rnp->lock);	/* irqs already disabled. */ @@ -1299,20 +1428,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)  		if (rnp->qsmaskinit != 0) {  			if (rnp != rdp->mynode)  				raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ -			else -				trace_rcu_grace_period(rsp->name, -						       rnp->gpnum + 1 - -						       !!(rnp->qsmask & mask), -						       "cpuofl");  			break;  		} -		if (rnp == rdp->mynode) { -			trace_rcu_grace_period(rsp->name, -					       rnp->gpnum + 1 - -					       !!(rnp->qsmask & mask), -					       "cpuofl"); +		if (rnp == rdp->mynode)  			need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); -		} else +		else  			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */  		mask = rnp->grpmask;  		rnp = rnp->parent; @@ -1332,29 +1452,15 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)  		raw_spin_unlock_irqrestore(&rnp->lock, flags);  	if (need_report & RCU_OFL_TASKS_EXP_GP)  		rcu_report_exp_rnp(rsp, rnp, true); -	rcu_node_kthread_setaffinity(rnp, -1); -} - -/* - * Remove the specified CPU from the RCU hierarchy and move any pending - * callbacks that it might have to the current CPU.  This code assumes - * that at least one CPU in the system will remain running at all times. - * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. 
- */ -static void rcu_offline_cpu(int cpu) -{ -	__rcu_offline_cpu(cpu, &rcu_sched_state); -	__rcu_offline_cpu(cpu, &rcu_bh_state); -	rcu_preempt_offline_cpu(cpu);  }  #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)  {  } -static void rcu_offline_cpu(int cpu) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  {  } @@ -1368,11 +1474,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  {  	unsigned long flags;  	struct rcu_head *next, *list, **tail; -	int bl, count; +	int bl, count, count_lazy;  	/* If no callbacks are ready, just return.*/  	if (!cpu_has_callbacks_ready_to_invoke(rdp)) { -		trace_rcu_batch_start(rsp->name, 0, 0); +		trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);  		trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),  				    need_resched(), is_idle_task(current),  				    rcu_is_callbacks_kthread()); @@ -1384,8 +1490,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  	 * races with call_rcu() from interrupt handlers.  	 */  	local_irq_save(flags); +	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));  	bl = rdp->blimit; -	trace_rcu_batch_start(rsp->name, rdp->qlen, bl); +	trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);  	list = rdp->nxtlist;  	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];  	*rdp->nxttail[RCU_DONE_TAIL] = NULL; @@ -1396,12 +1503,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  	local_irq_restore(flags);  	/* Invoke callbacks. */ -	count = 0; +	count = count_lazy = 0;  	while (list) {  		next = list->next;  		prefetch(next);  		debug_rcu_head_unqueue(list); -		__rcu_reclaim(rsp->name, list); +		if (__rcu_reclaim(rsp->name, list)) +			count_lazy++;  		list = next;  		/* Stop only if limit reached and CPU has something to do. */  		if (++count >= bl && @@ -1416,6 +1524,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  			    rcu_is_callbacks_kthread());  	/* Update count, and requeue any remaining callbacks. */ +	rdp->qlen_lazy -= count_lazy;  	rdp->qlen -= count;  	rdp->n_cbs_invoked += count;  	if (list != NULL) { @@ -1458,6 +1567,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  void rcu_check_callbacks(int cpu, int user)  {  	trace_rcu_utilization("Start scheduler-tick"); +	increment_cpu_stall_ticks();  	if (user || rcu_is_cpu_rrupt_from_idle()) {  		/* @@ -1492,8 +1602,6 @@ void rcu_check_callbacks(int cpu, int user)  	trace_rcu_utilization("End scheduler-tick");  } -#ifdef CONFIG_SMP -  /*   * Scan the leaf rcu_node structures, processing dyntick state for any that   * have not yet encountered a quiescent state, using the function specified. @@ -1616,15 +1724,6 @@ unlock_fqs_ret:  	trace_rcu_utilization("End fqs");  } -#else /* #ifdef CONFIG_SMP */ - -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) -{ -	set_need_resched(); -} - -#endif /* #else #ifdef CONFIG_SMP */ -  /*   * This does the RCU core processing work for the specified rcu_state   * and rcu_data structures.  This may be called only from the CPU to @@ -1702,11 +1801,12 @@ static void invoke_rcu_core(void)  static void  __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), -	   struct rcu_state *rsp) +	   struct rcu_state *rsp, bool lazy)  {  	unsigned long flags;  	struct rcu_data *rdp; +	WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! 
*/  	debug_rcu_head_queue(head);  	head->func = func;  	head->next = NULL; @@ -1720,18 +1820,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),  	 * a quiescent state betweentimes.  	 */  	local_irq_save(flags); +	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));  	rdp = this_cpu_ptr(rsp->rda);  	/* Add the callback to our list. */  	*rdp->nxttail[RCU_NEXT_TAIL] = head;  	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;  	rdp->qlen++; +	if (lazy) +		rdp->qlen_lazy++;  	if (__is_kfree_rcu_offset((unsigned long)func))  		trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, -					 rdp->qlen); +					 rdp->qlen_lazy, rdp->qlen);  	else -		trace_rcu_callback(rsp->name, head, rdp->qlen); +		trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);  	/* If interrupts were disabled, don't dive into RCU core. */  	if (irqs_disabled_flags(flags)) { @@ -1778,16 +1881,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),   */  void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  { -	__call_rcu(head, func, &rcu_sched_state); +	__call_rcu(head, func, &rcu_sched_state, 0);  }  EXPORT_SYMBOL_GPL(call_rcu_sched);  /* - * Queue an RCU for invocation after a quicker grace period. + * Queue an RCU callback for invocation after a quicker grace period.   */  void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  { -	__call_rcu(head, func, &rcu_bh_state); +	__call_rcu(head, func, &rcu_bh_state, 0);  }  EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -1816,6 +1919,10 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);   */  void synchronize_sched(void)  { +	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && +			   !lock_is_held(&rcu_lock_map) && +			   !lock_is_held(&rcu_sched_lock_map), +			   "Illegal synchronize_sched() in RCU-sched read-side critical section");  	if (rcu_blocking_is_gp())  		return;  	wait_rcu_gp(call_rcu_sched); @@ -1833,12 +1940,137 @@ EXPORT_SYMBOL_GPL(synchronize_sched);   */  void synchronize_rcu_bh(void)  { +	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && +			   !lock_is_held(&rcu_lock_map) && +			   !lock_is_held(&rcu_sched_lock_map), +			   "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");  	if (rcu_blocking_is_gp())  		return;  	wait_rcu_gp(call_rcu_bh);  }  EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ +	/* +	 * There must be a full memory barrier on each affected CPU +	 * between the time that try_stop_cpus() is called and the +	 * time that it returns. +	 * +	 * In the current initial implementation of cpu_stop, the +	 * above condition is already met when the control reaches +	 * this point and the following smp_mb() is not strictly +	 * necessary.  Do smp_mb() anyway for documentation and +	 * robustness against future implementation changes. +	 */ +	smp_mb(); /* See above comment block. */ +	return 0; +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly.  This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code.  
In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal + * to call this function from a CPU-hotplug notifier.  Failing to observe + * these restriction will result in deadlock. + * + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word.  Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs.  If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period.  We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done.  If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot.  In this case, our work is + * done for us, and we can simply return.  Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). + */ +void synchronize_sched_expedited(void) +{ +	int firstsnap, s, snap, trycount = 0; + +	/* Note that atomic_inc_return() implies full memory barrier. */ +	firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); +	get_online_cpus(); +	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); + +	/* +	 * Each pass through the following loop attempts to force a +	 * context switch on each CPU. +	 */ +	while (try_stop_cpus(cpu_online_mask, +			     synchronize_sched_expedited_cpu_stop, +			     NULL) == -EAGAIN) { +		put_online_cpus(); + +		/* No joy, try again later.  Or just synchronize_sched(). */ +		if (trycount++ < 10) +			udelay(trycount * num_online_cpus()); +		else { +			synchronize_sched(); +			return; +		} + +		/* Check to see if someone else did our work for us. */ +		s = atomic_read(&sync_sched_expedited_done); +		if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { +			smp_mb(); /* ensure test happens before caller kfree */ +			return; +		} + +		/* +		 * Refetching sync_sched_expedited_started allows later +		 * callers to piggyback on our grace period.  We subtract +		 * 1 to get the same token that the last incrementer got. +		 * We retry after they started, so our grace period works +		 * for them, and they started after our first try, so their +		 * grace period works for us. +		 */ +		get_online_cpus(); +		snap = atomic_read(&sync_sched_expedited_started); +		smp_mb(); /* ensure read is before try_stop_cpus(). */ +	} + +	/* +	 * Everyone up to our most recent fetch is covered by our grace +	 * period.  Update the counter, but only if our work is still +	 * relevant -- which it won't be if someone who started later +	 * than we did beat us to the punch. 
+	 */ +	do { +		s = atomic_read(&sync_sched_expedited_done); +		if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { +			smp_mb(); /* ensure test happens before caller kfree */ +			break; +		} +	} while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + +	put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); +  /*   * Check to see if there is any immediate RCU-related work to be done   * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -1932,7 +2164,7 @@ static int rcu_cpu_has_callbacks(int cpu)  	/* RCU callbacks either ready or pending? */  	return per_cpu(rcu_sched_data, cpu).nxtlist ||  	       per_cpu(rcu_bh_data, cpu).nxtlist || -	       rcu_preempt_needs_cpu(cpu); +	       rcu_preempt_cpu_has_callbacks(cpu);  }  static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; @@ -2027,9 +2259,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)  	rdp->nxtlist = NULL;  	for (i = 0; i < RCU_NEXT_SIZE; i++)  		rdp->nxttail[i] = &rdp->nxtlist; +	rdp->qlen_lazy = 0;  	rdp->qlen = 0;  	rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); +	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);  	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);  	rdp->cpu = cpu;  	rdp->rsp = rsp; @@ -2057,7 +2290,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  	rdp->qlen_last_fqs_check = 0;  	rdp->n_force_qs_snap = rsp->n_force_qs;  	rdp->blimit = blimit; -	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; +	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;  	atomic_set(&rdp->dynticks->dynticks,  		   (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);  	rcu_prepare_for_idle_init(cpu); @@ -2139,16 +2372,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,  		 * touch any data without introducing corruption. We send the  		 * dying CPU's callbacks to an arbitrarily chosen online CPU.  		 */ -		rcu_send_cbs_to_online(&rcu_bh_state); -		rcu_send_cbs_to_online(&rcu_sched_state); -		rcu_preempt_send_cbs_to_online(); +		rcu_cleanup_dying_cpu(&rcu_bh_state); +		rcu_cleanup_dying_cpu(&rcu_sched_state); +		rcu_preempt_cleanup_dying_cpu();  		rcu_cleanup_after_idle(cpu);  		break;  	case CPU_DEAD:  	case CPU_DEAD_FROZEN:  	case CPU_UP_CANCELED:  	case CPU_UP_CANCELED_FROZEN: -		rcu_offline_cpu(cpu); +		rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); +		rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); +		rcu_preempt_cleanup_dead_cpu(cpu);  		break;  	default:  		break; | 
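
The reworked rcu_idle_enter()/rcu_idle_exit() above track nested non-idle entries in the upper bits of ->dynticks_nesting instead of crowbarring the counter to a single fixed value. The user-space sketch below models only that counting scheme; NEST_VALUE, NEST_MASK, and EXIT_IDLE are stand-ins chosen for readability, not the kernel's actual DYNTICK_TASK_* constants (which live in the RCU headers and are not part of this diff).

/*
 * User-space model of the nested idle-entry bookkeeping that the patch
 * introduces in rcu_idle_enter()/rcu_idle_exit().  The constants below are
 * stand-ins picked for readability, NOT the kernel's DYNTICK_TASK_* values.
 */
#include <assert.h>
#include <stdio.h>

#define NEST_VALUE 0x100LL		/* one unit of task-level nesting (assumed) */
#define NEST_MASK  0xf00LL		/* bits holding the task-level nest count (assumed) */
#define EXIT_IDLE  (NEST_VALUE + 1)	/* "running, not nested" (assumed layout) */

static long long dynticks_nesting = EXIT_IDLE;

static void model_idle_enter(void)
{
	long long oldval = dynticks_nesting;

	assert((oldval & NEST_MASK) != 0);	/* may not already be idle */
	if ((oldval & NEST_MASK) == NEST_VALUE)
		dynticks_nesting = 0;		/* outermost entry: CPU goes idle */
	else
		dynticks_nesting -= NEST_VALUE;	/* inner entry: pop one level */
}

static void model_idle_exit(void)
{
	long long oldval = dynticks_nesting;

	assert(oldval >= 0);
	if (oldval & NEST_MASK)
		dynticks_nesting += NEST_VALUE;	/* inner exit: push one level */
	else
		dynticks_nesting = EXIT_IDLE;	/* outermost exit: leave idle */
}

int main(void)
{
	printf("start:      %#llx\n", dynticks_nesting);

	model_idle_exit();	/* nested exit while already non-idle */
	model_idle_enter();	/* matching nested enter */
	printf("after pair: %#llx\n", dynticks_nesting);

	model_idle_enter();	/* outermost enter: idle from RCU's viewpoint */
	printf("idle:       %#llx\n", dynticks_nesting);

	model_idle_exit();	/* and back out */
	printf("running:    %#llx\n", dynticks_nesting);
	return 0;
}

The point of the nesting is that an inner exit/enter pair leaves the counter exactly where it found it, so only the outermost rcu_idle_enter() actually tells RCU that the CPU is idle; this is also why the strict WARN_ON_ONCE(oldval != 0) in rcu_idle_exit() had to be relaxed to WARN_ON_ONCE(oldval < 0).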
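
The new rcu_cpu_stall_timeout module parameter is sanity-checked by jiffies_till_stall_check() every time it is consulted. Below is a minimal user-space rendering of that clamping, with MODEL_HZ and MODEL_STALL_DELAY_DELTA standing in for the kernel's HZ and RCU_STALL_DELAY_DELTA.

/*
 * User-space rendering of the clamping done by the new
 * jiffies_till_stall_check().  MODEL_HZ and MODEL_STALL_DELAY_DELTA are
 * stand-ins for the kernel's HZ and RCU_STALL_DELAY_DELTA.
 */
#include <stdio.h>

#define MODEL_HZ 1000			/* assumed tick rate */
#define MODEL_STALL_DELAY_DELTA 0	/* assumed extra slack */

static int rcu_cpu_stall_timeout = 60;	/* seconds, as set via the module parameter */

static int model_jiffies_till_stall_check(void)
{
	int till_stall_check = rcu_cpu_stall_timeout;

	/* Same [3, 300] second range that the patch enforces. */
	if (till_stall_check < 3) {
		rcu_cpu_stall_timeout = 3;
		till_stall_check = 3;
	} else if (till_stall_check > 300) {
		rcu_cpu_stall_timeout = 300;
		till_stall_check = 300;
	}
	return till_stall_check * MODEL_HZ + MODEL_STALL_DELAY_DELTA;
}

int main(void)
{
	int secs[] = { 1, 21, 60, 500 };

	for (unsigned int i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
		rcu_cpu_stall_timeout = secs[i];
		printf("requested %3d s -> %6d jiffies (stored back as %d s)\n",
		       secs[i], model_jiffies_till_stall_check(),
		       rcu_cpu_stall_timeout);
	}
	return 0;
}

print_other_cpu_stall() and print_cpu_stall() then re-arm ->jiffies_stall at three times this interval, which is what the 3 * jiffies_till_stall_check() + 3 expressions in the hunks above compute.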
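
Both rcu_cleanup_dying_cpu() and rcu_do_batch() above manipulate the per-CPU callback list through ->nxtlist plus the ->nxttail[] array of tail pointers, which partitions one singly linked list into DONE, WAIT, NEXT_READY, and NEXT segments. The sketch below models only that partitioning idiom; the segment names merely mirror RCU_DONE_TAIL..RCU_NEXT_TAIL, and grace_period_end() is a deliberate simplification of the real segment-advancing logic.

/*
 * Minimal user-space model of the segmented per-CPU callback list
 * (->nxtlist plus the ->nxttail[] array of tail pointers).
 */
#include <stdio.h>
#include <stdlib.h>

struct cb {
	struct cb *next;
	int id;
};

enum { DONE_TAIL, WAIT_TAIL, NEXT_READY_TAIL, NEXT_TAIL, NSEGS };

static struct cb *nxtlist;		/* one singly linked list... */
static struct cb **nxttail[NSEGS];	/* ...partitioned by tail pointers */

static void list_init(void)
{
	nxtlist = NULL;
	for (int i = 0; i < NSEGS; i++)
		nxttail[i] = &nxtlist;	/* every segment starts out empty */
}

/* New callbacks always land in the final (NEXT) segment. */
static void enqueue(int id)
{
	struct cb *p = malloc(sizeof(*p));

	if (!p)
		exit(1);
	p->next = NULL;
	p->id = id;
	*nxttail[NEXT_TAIL] = p;
	nxttail[NEXT_TAIL] = &p->next;
}

/*
 * Crude model of a grace period ending: the WAIT segment becomes DONE and
 * everything queued since becomes the new WAIT segment.
 */
static void grace_period_end(void)
{
	nxttail[DONE_TAIL] = nxttail[WAIT_TAIL];
	nxttail[WAIT_TAIL] = nxttail[NEXT_READY_TAIL] = nxttail[NEXT_TAIL];
}

static void print_segments(void)
{
	struct cb **pos = &nxtlist;

	for (int i = 0; i < NSEGS; i++) {
		int n = 0;

		while (pos != nxttail[i]) {	/* walk to this segment's tail */
			pos = &(*pos)->next;
			n++;
		}
		printf("segment %d holds %d callback(s)\n", i, n);
	}
}

int main(void)
{
	list_init();
	enqueue(1);
	enqueue(2);
	grace_period_end();	/* callbacks 1 and 2 advance into WAIT */
	enqueue(3);
	grace_period_end();	/* 1 and 2 are now DONE, 3 is WAITing */
	print_segments();	/* prints 2, 1, 0, 0 */
	return 0;
}

Because every segment boundary is just a pointer to some ->next field, the splicing in rcu_cleanup_dying_cpu() only has to stitch a few pointers together and then walk nxttail[] to repair any tails that pointed into the moved region, which is what the oldtail/newtail fixup loop in the hunk above is doing.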
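
synchronize_sched_expedited(), moved into this file by the patch, coordinates concurrent callers with two counters used like the halves of a ticket lock, as its comment block explains. The model below keeps only that counter protocol: force_all_cpus_through_qs() is a stand-in for try_stop_cpus(), wrap-around handling (UINT_CMP_GE() in the kernel) is ignored, and the hotplug locking and fallback to synchronize_sched() are omitted.

/*
 * User-space model of the two ticket-style counters that
 * synchronize_sched_expedited() uses to let concurrent callers piggyback
 * on each other's grace periods.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int started;	/* hands out snapshots ("tickets") */
static atomic_int done;		/* most recent snapshot whose GP completed */

static bool force_all_cpus_through_qs(void)
{
	return true;	/* pretend the stop-machine pass always succeeds */
}

static void model_expedited(void)
{
	int snap = atomic_fetch_add(&started, 1) + 1;	/* take a ticket */
	int firstsnap = snap;

	while (!force_all_cpus_through_qs()) {
		/* Did a grace period complete after we took our ticket? */
		if (atomic_load(&done) >= firstsnap)
			return;	/* someone else did our work for us */

		/* Refetch so callers who arrived meanwhile can piggyback. */
		snap = atomic_load(&started);
	}

	/* Advance "done" to our ticket unless it has already moved past it. */
	for (;;) {
		int s = atomic_load(&done);

		if (s >= snap)
			break;	/* a later grace period already covers ours */
		if (atomic_compare_exchange_weak(&done, &s, snap))
			break;
	}
}

int main(void)
{
	model_expedited();
	model_expedited();
	printf("started=%d done=%d\n",
	       atomic_load(&started), atomic_load(&done));
	return 0;
}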
