From d1ec4c34c7a9f328e43ea87522119258194f28f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 May 2015 10:41:58 -0700 Subject: rcu: Drop RCU_USER_QS in favor of NO_HZ_FULL The RCU_USER_QS Kconfig parameter is now just a synonym for NO_HZ_FULL, so this commit eliminates RCU_USER_QS, replacing all uses with NO_HZ_FULL. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker --- kernel/rcu/tree.c | 8 ++++---- kernel/time/Kconfig | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65137bc28b2b..8b5dd8ba9495 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -701,7 +701,7 @@ void rcu_idle_enter(void) } EXPORT_SYMBOL_GPL(rcu_idle_enter); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -714,7 +714,7 @@ void rcu_user_enter(void) { rcu_eqs_enter(1); } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -828,7 +828,7 @@ void rcu_idle_exit(void) } EXPORT_SYMBOL_GPL(rcu_idle_exit); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL /** * rcu_user_exit - inform RCU that we are exiting userspace. * @@ -839,7 +839,7 @@ void rcu_user_exit(void) { rcu_eqs_exit(1); } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 579ce1b929af..4008d9f95dd7 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -92,12 +92,10 @@ config NO_HZ_FULL depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # We need at least one periodic CPU for timekeeping depends on SMP - # RCU_USER_QS dependency depends on HAVE_CONTEXT_TRACKING # VIRT_CPU_ACCOUNTING_GEN dependency depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON - select RCU_USER_QS select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN select IRQ_WORK -- cgit v1.2.3 From 6dfec8d9493f48a42896386b41ec1a4644331b0b Mon Sep 17 00:00:00 2001 From: "bsegall@google.com" Date: Tue, 16 Jun 2015 12:18:21 -0700 Subject: sched/numa: Check sched_feat(NUMA) in migrate_improves_locality() migrate_improves_locality checked sched_feat(NUMA_FAVOUR_HIGHER) but not sched_feat(NUMA), so disabling just the NUMA feature would leave it working off of old data. Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/xm26si9rtqbm.fsf@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d113c3ba8bc4..98b2b961df33 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5680,8 +5680,8 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || - !(env->sd->flags & SD_NUMA)) { + if (!sched_feat(NUMA) || !sched_feat(NUMA_FAVOUR_HIGHER) || + !p->numa_faults || !(env->sd->flags & SD_NUMA)) { return false; } -- cgit v1.2.3 From 2a1ed24ce94036d00a7c5d5e99a77a80f0aa556a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 16 Jun 2015 17:25:59 +0530 Subject: sched/numa: Prefer NUMA hotness over cache hotness The current load balancer may not try to prevent a task from moving out of a preferred node to a less preferred node. The reason for this being: - Since sched features NUMA and NUMA_RESIST_LOWER are disabled by default, migrate_degrades_locality() always returns false. - Even if NUMA_RESIST_LOWER were to be enabled, if its cache hot, migrate_degrades_locality() never gets called. The above behaviour can mean that tasks can move out of their preferred node but they may be eventually be brought back to their preferred node by numa balancer (due to higher numa faults). To avoid the above, this commit merges migrate_degrades_locality() and migrate_improves_locality(). It also replaces 3 sched features NUMA, NUMA_FAVOUR_HIGHER and NUMA_RESIST_LOWER by a single sched feature NUMA. Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Mike Galbraith Link: http://lkml.kernel.org/r/1434455762-30857-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 89 ++++++++++++++----------------------------------- kernel/sched/features.h | 18 +++------- 2 files changed, 30 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 98b2b961df33..43ee84f05d1e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5670,72 +5670,39 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns true if the destination node is the preferred node. - * Needs to match fbq_classify_rq(): if there is a runnable task - * that is not on its preferred node, we should identify it. + * Returns 1, if task migration degrades locality + * Returns 0, if task migration improves locality i.e migration preferred. + * Returns -1, if task migration is not affected by locality. */ -static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!sched_feat(NUMA) || !sched_feat(NUMA_FAVOUR_HIGHER) || - !p->numa_faults || !(env->sd->flags & SD_NUMA)) { - return false; - } - - src_nid = cpu_to_node(env->src_cpu); - dst_nid = cpu_to_node(env->dst_cpu); - - if (src_nid == dst_nid) - return false; - - /* Encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; - - /* Migrating away from the preferred node is bad. */ - if (src_nid == p->numa_preferred_nid) - return false; - - if (numa_group) { - src_faults = group_faults(p, src_nid); - dst_faults = group_faults(p, dst_nid); - } else { - src_faults = task_faults(p, src_nid); - dst_faults = task_faults(p, dst_nid); - } - - return dst_faults > src_faults; -} - - -static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) -{ - struct numa_group *numa_group = rcu_dereference(p->numa_group); - unsigned long src_faults, dst_faults; - int src_nid, dst_nid; - - if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) - return false; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return false; + return -1; + + if (!sched_feat(NUMA)) + return -1; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return false; + return -1; - /* Migrating away from the preferred node is bad. */ - if (src_nid == p->numa_preferred_nid) - return true; + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { + if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) + return 1; + else + return -1; + } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return false; + return 0; if (numa_group) { src_faults = group_faults(p, src_nid); @@ -5749,16 +5716,10 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) } #else -static inline bool migrate_improves_locality(struct task_struct *p, +static inline int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return false; -} - -static inline bool migrate_degrades_locality(struct task_struct *p, - struct lb_env *env) -{ - return false; + return -1; } #endif @@ -5768,7 +5729,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p, static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot = 0; + int tsk_cache_hot; lockdep_assert_held(&env->src_rq->lock); @@ -5826,13 +5787,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env); - if (!tsk_cache_hot) - tsk_cache_hot = migrate_degrades_locality(p, env); + tsk_cache_hot = migrate_degrades_locality(p, env); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot(p, env); - if (migrate_improves_locality(p, env) || !tsk_cache_hot || + if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot) { + if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 91e33cd485f6..83a50e7ca533 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -79,20 +79,12 @@ SCHED_FEAT(LB_MIN, false) * numa_balancing= */ #ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, false) /* - * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a - * higher number of hinting faults are recorded during active load - * balancing. + * NUMA will favor moving tasks towards nodes where a higher number of + * hinting faults are recorded during active load balancing. It will + * resist moving tasks towards nodes where a lower number of hinting + * faults have been recorded. */ -SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) - -/* - * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a - * lower number of hinting faults have been recorded. As this has - * the potential to prevent a task ever migrating to a new node - * due to CPU overload it is disabled by default. - */ -SCHED_FEAT(NUMA_RESIST_LOWER, false) +SCHED_FEAT(NUMA, true) #endif -- cgit v1.2.3 From 44dcb04f0ea8eaac3b9c9d3172416efc5a950214 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 16 Jun 2015 17:26:00 +0530 Subject: sched/numa: Consider 'imbalance_pct' when comparing loads in numa_has_capacity() This is consistent with all other load balancing instances where we absorb unfairness upto env->imbalance_pct. Absorbing unfairness upto env->imbalance_pct allows to pull and retain task to their preferred nodes. Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434455762-30857-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 43ee84f05d1e..a53a610095e6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1415,8 +1415,9 @@ static bool numa_has_capacity(struct task_numa_env *env) * --------------------- vs --------------------- * src->compute_capacity dst->compute_capacity */ - if (src->load * dst->compute_capacity > - dst->load * src->compute_capacity) + if (src->load * dst->compute_capacity * env->imbalance_pct > + + dst->load * src->compute_capacity * 100) return true; return false; -- cgit v1.2.3 From 8e2b0bf397279878babcb39b021edcafe7c945eb Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 2 Jul 2015 22:25:52 +0800 Subject: sched/fair: Clean up the __sched_period() code Since commit: 4bf0b77158 ("sched: remove do_div() from __sched_slice()") ... the logic of __sched_period() can be implemented as a single if-else without any local variables, so this patch cleans it up with an if-else statement, which expresses the function's logic straightforwardly. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435847152-29543-1-git-send-email-boqun.feng@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a53a610095e6..03ea05bd4c13 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -616,15 +616,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) */ static u64 __sched_period(unsigned long nr_running) { - u64 period = sysctl_sched_latency; - unsigned long nr_latency = sched_nr_latency; - - if (unlikely(nr_running > nr_latency)) { - period = sysctl_sched_min_granularity; - period *= nr_running; - } - - return period; + if (unlikely(nr_running > sched_nr_latency)) + return nr_running * sysctl_sched_min_granularity; + else + return sysctl_sched_latency; } /* -- cgit v1.2.3 From 399595f248cb25dccb6044b53c47c44c174dc23d Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 6 Jul 2015 21:51:02 +0900 Subject: sched/fair: Fix a comment reflecting function name change update_cfs_rq_load_contribution() was changed to __update_cfs_rq_tg_load_contrib() - sync up the commit in calc_tg_weight() too. Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1436187062-19658-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 03ea05bd4c13..587a2f67ceb1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2349,7 +2349,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) /* * Use this CPU's actual weight instead of the last load_contribution * to gain a more accurate current total weight. See - * update_cfs_rq_load_contribution(). + * __update_cfs_rq_tg_load_contrib(). */ tg_weight = atomic_long_read(&tg->load_avg); tg_weight -= cfs_rq->tg_load_contrib; -- cgit v1.2.3 From e727c7d7a11e109849582e9165d54b254eb181d7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:22 -0700 Subject: notifiers, RCU: Assert that RCU is watching in notify_die() Low-level arch entries often call notify_die(), and it's easy for arch code to fail to exit an RCU quiescent state first. Assert that we're not quiescent in notify_die(). Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1f5fe6c23d5b432a23267102f2d72b787d80fdd8.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/notifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index ae9fc7cc360e..980e4330fb59 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str, .signr = sig, }; + rcu_lockdep_assert(rcu_is_watching(), + "notify_die called but RCU thinks we're quiescent"); return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); -- cgit v1.2.3 From b51bf95c583bba645974348666e9b5a14c7aa3ea Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 4 Jun 2015 12:13:25 +0800 Subject: genirq: Remove the parameter 'irq' of kstat_incr_irqs_this_cpu() The first parameter 'irq' is never used by kstat_incr_irqs_this_cpu(). Remove it. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1433391238-19471-16-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 16 ++++++++-------- kernel/irq/handle.c | 2 +- kernel/irq/internals.h | 2 +- kernel/irq/irqdesc.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 27f4332c7f84..f3c3d55cd5a4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -315,7 +315,7 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { @@ -391,7 +391,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; @@ -443,7 +443,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -515,7 +515,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -583,7 +583,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; } - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); @@ -646,7 +646,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) goto out_eoi; } - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); do { if (unlikely(!desc->action)) @@ -675,7 +675,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -705,7 +705,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) void *dev_id = raw_cpu_ptr(action->percpu_dev_id); irqreturn_t res; - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 635480270858..4d37b96343e9 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -30,7 +30,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); ack_bad_irq(irq); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4834ee828c41..3e03824cdd38 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -191,7 +191,7 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } -static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) +static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4afc457613dd..0a2a4b697bcb 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -582,7 +582,7 @@ int irq_set_percpu_devid(unsigned int irq) void kstat_incr_irq_this_cpu(unsigned int irq) { - kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + kstat_incr_irqs_this_cpu(irq_to_desc(irq)); } /** -- cgit v1.2.3 From 0798abeb7eec37dcc20f252c2195fc31c41561f9 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 4 Jun 2015 12:13:27 +0800 Subject: genirq: Remove the irq argument from check_irq_resend() It's only used in the software resend case and can be retrieved from irq_desc if necessary. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1433391238-19471-18-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- kernel/irq/internals.h | 2 +- kernel/irq/manage.c | 2 +- kernel/irq/resend.c | 4 +++- 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f3c3d55cd5a4..0cfbd1506e35 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -187,7 +187,7 @@ int irq_startup(struct irq_desc *desc, bool resend) irq_enable(desc); } if (resend) - check_irq_resend(desc, desc->irq_data.irq); + check_irq_resend(desc); return ret; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 3e03824cdd38..7054947e368e 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -90,7 +90,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *act irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ -void check_irq_resend(struct irq_desc *desc, unsigned int irq); +void check_irq_resend(struct irq_desc *desc); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f9744853b656..c2e835d19bca 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -516,7 +516,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq) /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); irq_enable(desc); - check_irq_resend(desc, irq); + check_irq_resend(desc); /* fall-through */ } default: diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 9065107f083e..32fc47c2c622 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -53,7 +53,7 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); * * Is called with interrupts disabled and desc->lock held. */ -void check_irq_resend(struct irq_desc *desc, unsigned int irq) +void check_irq_resend(struct irq_desc *desc) { /* * We do not resend level type interrupts. Level type @@ -74,6 +74,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND + unsigned int irq = irq_desc_get_irq(desc); + /* * If the interrupt has a parent irq and runs * in the thread context of the parent irq, -- cgit v1.2.3 From a1ff541a40e90df05f586bf6b157083b351c4a0c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 23 Jun 2015 19:47:29 +0200 Subject: genirq: Remove irq arg from __irq_set_trigger() It's only required for debug output and can be retrieved from the irq descriptor if necessary. [ tglx: Split out from combo patch ] Signed-off-by: Jiang Liu Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- kernel/irq/internals.h | 3 +-- kernel/irq/manage.c | 14 +++++++------- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0cfbd1506e35..310d65885440 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -63,7 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - ret = __irq_set_trigger(desc, irq, type); + ret = __irq_set_trigger(desc, type); irq_put_desc_busunlock(desc, flags); return ret; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 7054947e368e..429c5e34d619 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -59,8 +59,7 @@ enum { #include "debug.h" #include "settings.h" -extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags); +extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq); extern void __enable_irq(struct irq_desc *desc, unsigned int irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c2e835d19bca..0559d9c0f658 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -637,8 +637,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return canrequest; } -int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags) +int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) { struct irq_chip *chip = desc->irq_data.chip; int ret, unmask = 0; @@ -648,7 +647,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ - pr_debug("No set_type function for IRQ %d (%s)\n", irq, + pr_debug("No set_type function for IRQ %d (%s)\n", + irq_desc_get_irq(desc), chip ? (chip->name ? : "unknown") : "unknown"); return 0; } @@ -685,7 +685,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, break; default: pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", - flags, irq, chip->irq_set_type); + flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) unmask_irq(desc); @@ -1221,8 +1221,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc, irq, - new->flags & IRQF_TRIGGER_MASK); + ret = __irq_set_trigger(desc, + new->flags & IRQF_TRIGGER_MASK); if (ret) goto out_mask; @@ -1650,7 +1650,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) if (type != IRQ_TYPE_NONE) { int ret; - ret = __irq_set_trigger(desc, irq, type); + ret = __irq_set_trigger(desc, type); if (ret) { WARN(1, "failed to set type for IRQ%d\n", irq); -- cgit v1.2.3 From 79ff1cda320b81dfe5feae0c5da52f029561ce93 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 23 Jun 2015 19:52:36 +0200 Subject: genirq: Remove irq argument from __enable/__disable_irq() Solely used for debug output. Can be retrieved from irq descriptor if necessary. [ tglx: Split out from combo patch ] Signed-off-by: Jiang Liu Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 4 ++-- kernel/irq/manage.c | 13 +++++++------ kernel/irq/pm.c | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 429c5e34d619..c8dd8d723ee2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -60,8 +60,8 @@ enum { #include "settings.h" extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags); -extern void __disable_irq(struct irq_desc *desc, unsigned int irq); -extern void __enable_irq(struct irq_desc *desc, unsigned int irq); +extern void __disable_irq(struct irq_desc *desc); +extern void __enable_irq(struct irq_desc *desc); extern int irq_startup(struct irq_desc *desc, bool resend); extern void irq_shutdown(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0559d9c0f658..d526ac1eb0d1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -423,7 +423,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) } #endif -void __disable_irq(struct irq_desc *desc, unsigned int irq) +void __disable_irq(struct irq_desc *desc) { if (!desc->depth++) irq_disable(desc); @@ -436,7 +436,7 @@ static int __disable_irq_nosync(unsigned int irq) if (!desc) return -EINVAL; - __disable_irq(desc, irq); + __disable_irq(desc); irq_put_desc_busunlock(desc, flags); return 0; } @@ -503,12 +503,13 @@ bool disable_hardirq(unsigned int irq) } EXPORT_SYMBOL_GPL(disable_hardirq); -void __enable_irq(struct irq_desc *desc, unsigned int irq) +void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { case 0: err_out: - WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); + WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", + irq_desc_get_irq(desc)); break; case 1: { if (desc->istate & IRQS_SUSPENDED) @@ -546,7 +547,7 @@ void enable_irq(unsigned int irq) KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) goto out; - __enable_irq(desc, irq); + __enable_irq(desc); out: irq_put_desc_busunlock(desc, flags); } @@ -1280,7 +1281,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; - __enable_irq(desc, irq); + __enable_irq(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d22786a6dbde..0e1c617f8d5f 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -85,7 +85,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq) } desc->istate |= IRQS_SUSPENDED; - __disable_irq(desc, irq); + __disable_irq(desc); /* * Hardware which has no wakeup source configuration facility @@ -150,7 +150,7 @@ static void resume_irq(struct irq_desc *desc, int irq) desc->depth++; resume: desc->istate &= ~IRQS_SUSPENDED; - __enable_irq(desc, irq); + __enable_irq(desc); } static void resume_irqs(bool want_early) -- cgit v1.2.3 From b80f5f3fc0dc5362eac19585c31a1cc414a6cf95 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 23 Jun 2015 19:58:45 +0200 Subject: genirq: Remove irq argument from suspend/resume_irq() Unused argument in both functions. [ tglx: Split out from combo patch ] Signed-off-by: Jiang Liu Signed-off-by: Thomas Gleixner --- kernel/irq/pm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0e1c617f8d5f..21c62617a35a 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -68,7 +68,7 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) desc->cond_suspend_depth--; } -static bool suspend_device_irq(struct irq_desc *desc, int irq) +static bool suspend_device_irq(struct irq_desc *desc) { if (!desc->action || desc->no_suspend_depth) return false; @@ -126,7 +126,7 @@ void suspend_device_irqs(void) if (irq_settings_is_nested_thread(desc)) continue; raw_spin_lock_irqsave(&desc->lock, flags); - sync = suspend_device_irq(desc, irq); + sync = suspend_device_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); if (sync) @@ -135,7 +135,7 @@ void suspend_device_irqs(void) } EXPORT_SYMBOL_GPL(suspend_device_irqs); -static void resume_irq(struct irq_desc *desc, int irq) +static void resume_irq(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); @@ -169,7 +169,7 @@ static void resume_irqs(bool want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); - resume_irq(desc, irq); + resume_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } } -- cgit v1.2.3 From 02d00eaa64bfd57fcbefe848e46e5ddca62ed5e6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 23 Jun 2015 20:02:43 +0200 Subject: genirq: Remove irq argument from report_bad_irq() Not really a hotpath, so __report_bad_irq() can retrieve the irq number from the irq descriptor. [ tglx: Split out from combo patch ] Signed-off-by: Jiang Liu Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index e2514b0e439e..c7699ec7cb9c 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -188,10 +188,9 @@ static inline int bad_action_ret(irqreturn_t action_ret) * (The other 100-of-100,000 interrupts may have been a correctly * functioning device sharing an IRQ with the failing one) */ -static void -__report_bad_irq(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) { + unsigned int irq = irq_desc_get_irq(desc); struct irqaction *action; unsigned long flags; @@ -224,14 +223,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, raw_spin_unlock_irqrestore(&desc->lock, flags); } -static void -report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) +static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) { static int count = 100; if (count > 0) { count--; - __report_bad_irq(irq, desc, action_ret); + __report_bad_irq(desc, action_ret); } } @@ -280,7 +278,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, return; if (bad_action_ret(action_ret)) { - report_bad_irq(irq, desc, action_ret); + report_bad_irq(desc, action_ret); return; } @@ -413,7 +411,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, /* * The interrupt is stuck */ - __report_bad_irq(irq, desc, action_ret); + __report_bad_irq(desc, action_ret); /* * Now kill the IRQ */ -- cgit v1.2.3 From c1e5bd8cc52ddc8c2998987c8806b999f09b064e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 23 Jun 2015 20:07:35 +0200 Subject: genirq: Remove irq argument from try_one_irq() Unused argument. [ tglx: Split out from combo patch ] Signed-off-by: Jiang Liu Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c7699ec7cb9c..5378c529c1dc 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -60,7 +60,7 @@ bool irq_wait_for_poll(struct irq_desc *desc) /* * Recovery handler for misrouted interrupts. */ -static int try_one_irq(int irq, struct irq_desc *desc, bool force) +static int try_one_irq(struct irq_desc *desc, bool force) { irqreturn_t ret = IRQ_NONE; struct irqaction *action; @@ -133,7 +133,7 @@ static int misrouted_irq(int irq) if (i == irq) /* Already tried */ continue; - if (try_one_irq(i, desc, false)) + if (try_one_irq(desc, false)) ok = 1; } out: @@ -164,7 +164,7 @@ static void poll_spurious_irqs(unsigned long dummy) continue; local_irq_disable(); - try_one_irq(i, desc, true); + try_one_irq(desc, true); local_irq_enable(); } out: -- cgit v1.2.3 From 0dcdbc97557fd8c297c4e38e9f66e304a64bae9d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 4 Jun 2015 12:13:28 +0800 Subject: genirq: Remove the irq argument from note_interrupt() Only required for the slow path. Retrieve it from irq descriptor if necessary. [ tglx: Split out from combo patch. Left [try_]misrouted_irq() untouched as there is no win in the slow path ] Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1433391238-19471-19-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/spurious.c | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 310d65885440..76f199dc6a5e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -328,7 +328,7 @@ void handle_nested_irq(unsigned int irq) action_ret = action->thread_fn(action->irq, action->dev_id); if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + note_interrupt(desc, action_ret); raw_spin_lock_irq(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 4d37b96343e9..b6eeea8a80c5 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -176,7 +176,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) add_interrupt_randomness(irq, flags); if (!noirqdebug) - note_interrupt(irq, desc, retval); + note_interrupt(desc, retval); return retval; } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 5378c529c1dc..32144175458d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -270,9 +270,10 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, #define SPURIOUS_DEFERRED 0x80000000 -void note_interrupt(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) +void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) { + unsigned int irq; + if (desc->istate & IRQS_POLL_INPROGRESS || irq_settings_is_polled(desc)) return; @@ -396,6 +397,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->last_unhandled = jiffies; } + irq = irq_desc_get_irq(desc); if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) -- cgit v1.2.3 From e019c249a60fc50319c5897d21d36207c257cc9e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 23 Jun 2015 20:29:34 +0200 Subject: genirq: Provide and use __irq_can_set_affinity() Provide a irq_desc based variant of irq_can_set_affinity() to avoid a redundant lookup for the core code users. [ tglx: Split out from combo patch ] Signed-off-by: Jiang Liu Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d526ac1eb0d1..f5b774223778 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -115,6 +115,14 @@ EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; +static int __irq_can_set_affinity(struct irq_desc *desc) +{ + if (!desc || !irqd_can_balance(&desc->irq_data) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) + return 0; + return 1; +} + /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check @@ -122,13 +130,7 @@ cpumask_var_t irq_default_affinity; */ int irq_can_set_affinity(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || !irqd_can_balance(&desc->irq_data) || - !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - - return 1; + return __irq_can_set_affinity(irq_to_desc(irq)); } /** @@ -366,7 +368,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) int node = irq_desc_get_node(desc); /* Excludes PER_CPU and NO_BALANCE interrupts */ - if (!irq_can_set_affinity(irq)) + if (!__irq_can_set_affinity(desc)) return 0; /* -- cgit v1.2.3 From a8a98eac7b238beb49b479c164303651d5a37eb6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 4 Jun 2015 12:13:30 +0800 Subject: genirq: Remove the irq argument from setup_affinity() Unused except for the alpha wrapper, which can retrieve if from the irq descriptor. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1433391238-19471-21-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f5b774223778..886f11508c6d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -361,8 +361,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); /* * Generic version of the affinity autoselector. */ -static int -setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) +static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) { struct cpumask *set = irq_default_affinity; int node = irq_desc_get_node(desc); @@ -395,10 +394,10 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) return 0; } #else -static inline int -setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) +/* Wrapper for ALPHA specific affinity selector magic */ +static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask) { - return irq_select_affinity(irq); + return irq_select_affinity(irq_desc_get_irq(d)); } #endif @@ -412,14 +411,14 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(irq, desc, mask); + ret = setup_affinity(desc, mask); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } #else static inline int -setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) +setup_affinity(struct irq_desc *desc, struct cpumask *mask) { return 0; } @@ -1256,7 +1255,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } /* Set default affinity mask once everything is setup */ - setup_affinity(irq, desc, mask); + setup_affinity(desc, mask); } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; -- cgit v1.2.3 From d5671f6bf2a672cfa72ef2cbac5cc53a4539690d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 26 May 2015 17:48:34 +0200 Subject: rcu: Deinline rcu_read_lock_sched_held() if DEBUG_LOCK_ALLOC DEBUG_LOCK_ALLOC=y is not a production setting, but it is not very unusual either. Many developers routinely use kernels built with it enabled. Apart from being selected by hand, it is also auto-selected by PROVE_LOCKING "Lock debugging: prove locking correctness" and LOCK_STAT "Lock usage statistics" config options. LOCK STAT is necessary for "perf lock" to work. I wouldn't spend too much time optimizing it, but this particular function has a very large cost in code size: when it is deinlined, code size decreases by 830,000 bytes: text data bss dec hex filename 85674192 22294776 20627456 128596424 7aa39c8 vmlinux.before 84837612 22294424 20627456 127759492 79d7484 vmlinux (with this config: http://busybox.net/~vda/kernel_config) Signed-off-by: Denys Vlasenko CC: "Paul E. McKenney" CC: Josh Triplett CC: Mathieu Desnoyers CC: Lai Jiangshan CC: Tejun Heo CC: Oleg Nesterov CC: linux-kernel@vger.kernel.org Reviewed-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index afaecb7a799a..fec5f48b8860 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); +#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +/** + * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * RCU-sched read-side critical section. In absence of + * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side + * critical section unless it can prove otherwise. Note that disabling + * of preemption (including disabling irqs) counts as an RCU-sched + * read-side critical section. This is useful for debug checks in functions + * that required that they be called within an RCU-sched read-side + * critical section. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * + * Note that if the CPU is in the idle loop from an RCU point of + * view (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU + * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs + * that are in such a section, considering these as in extended quiescent + * state, so such a CPU is effectively never in an RCU read-side critical + * section regardless of what RCU primitives it invokes. This state of + * affairs is required --- we need to keep an RCU-free window in idle + * where the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. Otherwise we would delay any grace period as long as we run in + * the idle task. + * + * Similarly, we avoid claiming an SRCU read lock held if the current + * CPU is offline. + */ +int rcu_read_lock_sched_held(void) +{ + int lockdep_opinion = 0; + + if (!debug_lockdep_rcu_enabled()) + return 1; + if (!rcu_is_watching()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; + if (debug_locks) + lockdep_opinion = lock_is_held(&rcu_sched_lock_map); + return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); +} +EXPORT_SYMBOL(rcu_read_lock_sched_held); +#endif + #ifndef CONFIG_TINY_RCU static atomic_t rcu_expedited_nesting = -- cgit v1.2.3 From f765d1130700878c2275bc1ea09eed428f870a2a Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Wed, 27 May 2015 08:56:25 +0200 Subject: rcu: Change return type to bool Type-checking coccinelle spatches are being used to locate type mismatches between function signatures and return values in this case this produced: ./kernel/rcu/srcu.c:271 WARNING: return of wrong type int != unsigned long, srcu_readers_active() returns an int that is the sum of per_cpu unsigned long but the only user is cleanup_srcu_struct() which is using it as a boolean (condition) to see if there is any readers rather than actually using the approximate number of readers. The theoretically possible unsigned long overflow case does not need to be handled explicitly - if we had 4G++ readers then something else went wrong a long time ago. proposal: change the return type to boolean. The function name is left unchanged as it fits the naming expectation for a boolean. patch was compile tested for x86_64_defconfig (implies CONFIG_SRCU=y) patch is against 4.1-rc5 (localversion-next is -next-20150525) Signed-off-by: Nicholas Mc Guire Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index fb33d35ee0b7..de35087c92a5 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -252,14 +252,15 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) } /** - * srcu_readers_active - returns approximate number of readers. + * srcu_readers_active - returns true if there are readers. and false + * otherwise * @sp: which srcu_struct to count active readers (holding srcu_read_lock). * * Note that this is not an atomic primitive, and can therefore suffer * severe errors when invoked on an active srcu_struct. That said, it * can be useful as an error check at cleanup time. */ -static int srcu_readers_active(struct srcu_struct *sp) +static bool srcu_readers_active(struct srcu_struct *sp) { int cpu; unsigned long sum = 0; -- cgit v1.2.3 From 319362c90f7af92d03714759bea5d2ba7c88e388 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 May 2015 14:16:52 -0700 Subject: rcu: Provide more diagnostics for stalled GP kthread Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++-- kernel/rcu/tree.h | 6 +++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65137bc28b2b..1c58cbd03922 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1178,9 +1178,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", rsp->name, j - gpa, - rsp->gpnum, rsp->completed, rsp->gp_flags); + rsp->gpnum, rsp->completed, + rsp->gp_flags, rsp->gp_state, + rsp->gp_kthread ? rsp->gp_kthread->state : 0); } /* @@ -2041,6 +2043,7 @@ static int __noreturn rcu_gp_kthread(void *arg) wait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); + rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; @@ -2073,6 +2076,7 @@ static int __noreturn rcu_gp_kthread(void *arg) (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), j); + rsp->gp_state = RCU_GP_DONE_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!READ_ONCE(rnp->qsmask) && @@ -2110,7 +2114,9 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* Handle grace-period end. */ + rsp->gp_state = RCU_GP_CLEANUP; rcu_gp_cleanup(rsp); + rsp->gp_state = RCU_GP_CLEANED; } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4adb7ca0bf47..f1f4784f9107 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -527,7 +527,11 @@ struct rcu_state { /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_WAIT_INIT 0 /* Initial state. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ -#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ +#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DONE_FQS 4 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ +#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ extern struct list_head rcu_struct_flavors; -- cgit v1.2.3 From 75cf15a4c0dd57f5d230bd30c2d41bd8e06ae5a9 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:23 +0200 Subject: rcu: Panic if RCU tree can not accommodate all CPUs Currently a condition when RCU tree is unable to accommodate the configured number of CPUs is not permitted and causes a fall back to compile-time values. However, the code has no means to exceed the RCU tree capacity neither at compile-time nor in run-time. Therefore, if the condition is met in run- time then it indicates a serios problem elsewhere and should be handled with a panic. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1c58cbd03922..fe8d92987dfa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4106,6 +4106,19 @@ static void __init rcu_init_geometry(void) pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", rcu_fanout_leaf, nr_cpu_ids); + /* + * The boot-time rcu_fanout_leaf parameter is only permitted + * to increase the leaf-level fanout, not decrease it. Of course, + * the leaf-level fanout cannot exceed the number of bits in + * the rcu_node masks. Complain and fall back to the compile- + * time values if these limits are exceeded. + */ + if (rcu_fanout_leaf < RCU_FANOUT_LEAF || + rcu_fanout_leaf > sizeof(unsigned long) * 8) { + WARN_ON(1); + return; + } + /* * Compute number of nodes that can be handled an rcu_node tree * with the given number of levels. Setting rcu_capacity[0] makes @@ -4117,19 +4130,11 @@ static void __init rcu_init_geometry(void) rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; /* - * The boot-time rcu_fanout_leaf parameter is only permitted - * to increase the leaf-level fanout, not decrease it. Of course, - * the leaf-level fanout cannot exceed the number of bits in - * the rcu_node masks. Finally, the tree must be able to accommodate - * the configured number of CPUs. Complain and fall back to the - * compile-time values if these limits are exceeded. + * The tree must be able to accommodate the configured number of CPUs. + * If this limit is exceeded than we have a serious problem elsewhere. */ - if (rcu_fanout_leaf < RCU_FANOUT_LEAF || - rcu_fanout_leaf > sizeof(unsigned long) * 8 || - n > rcu_capacity[MAX_RCU_LVLS]) { - WARN_ON(1); - return; - } + if (n > rcu_capacity[MAX_RCU_LVLS]) + panic("rcu_init_geometry: rcu_capacity[] is too small"); /* Calculate the number of rcu_nodes at each level of the tree. */ for (i = 1; i <= MAX_RCU_LVLS; i++) -- cgit v1.2.3 From 372b0ec24f6b516174934d68fd86d2056f1a5bba Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:24 +0200 Subject: rcu: Remove superfluous local variable in rcu_init_geometry() Local variable 'n' mimics 'nr_cpu_ids' while the both are used within one function. There is no reason for 'n' to exist whatsoever. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fe8d92987dfa..ad49dbed44fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4083,7 +4083,6 @@ static void __init rcu_init_geometry(void) ulong d; int i; int j; - int n = nr_cpu_ids; int rcu_capacity[MAX_RCU_LVLS + 1]; /* @@ -4133,15 +4132,16 @@ static void __init rcu_init_geometry(void) * The tree must be able to accommodate the configured number of CPUs. * If this limit is exceeded than we have a serious problem elsewhere. */ - if (n > rcu_capacity[MAX_RCU_LVLS]) + if (nr_cpu_ids > rcu_capacity[MAX_RCU_LVLS]) panic("rcu_init_geometry: rcu_capacity[] is too small"); /* Calculate the number of rcu_nodes at each level of the tree. */ for (i = 1; i <= MAX_RCU_LVLS; i++) - if (n <= rcu_capacity[i]) { - for (j = 0; j <= i; j++) - num_rcu_lvl[j] = - DIV_ROUND_UP(n, rcu_capacity[i - j]); + if (nr_cpu_ids <= rcu_capacity[i]) { + for (j = 0; j <= i; j++) { + int cap = rcu_capacity[i - j]; + num_rcu_lvl[j] = DIV_ROUND_UP(nr_cpu_ids, cap); + } rcu_num_lvls = i; for (j = i + 1; j <= MAX_RCU_LVLS; j++) num_rcu_lvl[j] = 0; @@ -4152,7 +4152,7 @@ static void __init rcu_init_geometry(void) rcu_num_nodes = 0; for (i = 0; i <= MAX_RCU_LVLS; i++) rcu_num_nodes += num_rcu_lvl[i]; - rcu_num_nodes -= n; + rcu_num_nodes -= nr_cpu_ids; } /* -- cgit v1.2.3 From 679f9858b1769d740d933f5f1ad9dbe3292f26d2 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:25 +0200 Subject: rcu: Cleanup rcu_init_geometry() code and arithmetics This update simplifies rcu_init_geometry() code flow and makes calculation of the total number of rcu_node structures more easy to read. The update relies on the fact num_rcu_lvl[] is never accessed beyond rcu_num_lvls index by the rest of the code. Therefore, there is no need initialize the whole num_rcu_lvl[]. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ad49dbed44fb..37ca8a867a1c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4082,7 +4082,6 @@ static void __init rcu_init_geometry(void) { ulong d; int i; - int j; int rcu_capacity[MAX_RCU_LVLS + 1]; /* @@ -4135,24 +4134,21 @@ static void __init rcu_init_geometry(void) if (nr_cpu_ids > rcu_capacity[MAX_RCU_LVLS]) panic("rcu_init_geometry: rcu_capacity[] is too small"); + /* Calculate the number of levels in the tree. */ + for (i = 1; nr_cpu_ids > rcu_capacity[i]; i++) { + } + rcu_num_lvls = i; + /* Calculate the number of rcu_nodes at each level of the tree. */ - for (i = 1; i <= MAX_RCU_LVLS; i++) - if (nr_cpu_ids <= rcu_capacity[i]) { - for (j = 0; j <= i; j++) { - int cap = rcu_capacity[i - j]; - num_rcu_lvl[j] = DIV_ROUND_UP(nr_cpu_ids, cap); - } - rcu_num_lvls = i; - for (j = i + 1; j <= MAX_RCU_LVLS; j++) - num_rcu_lvl[j] = 0; - break; - } + for (i = 0; i < rcu_num_lvls; i++) { + int cap = rcu_capacity[rcu_num_lvls - i]; + num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); + } /* Calculate the total number of rcu_node structures. */ rcu_num_nodes = 0; - for (i = 0; i <= MAX_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) rcu_num_nodes += num_rcu_lvl[i]; - rcu_num_nodes -= nr_cpu_ids; } /* -- cgit v1.2.3 From 9618138b09260e3df5af5a0d7bdc8fca010f6a3f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:26 +0200 Subject: rcu: Simplify rcu_init_geometry() capacity arithmetics Current code suggests that introducing the extra level to rcu_capacity[] array makes some of the arithmetic easier. Well, in fact it appears rather confusing and unnecessary. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 37ca8a867a1c..2103beedb49f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4082,7 +4082,7 @@ static void __init rcu_init_geometry(void) { ulong d; int i; - int rcu_capacity[MAX_RCU_LVLS + 1]; + int rcu_capacity[MAX_RCU_LVLS]; /* * Initialize any unspecified boot parameters. @@ -4119,29 +4119,27 @@ static void __init rcu_init_geometry(void) /* * Compute number of nodes that can be handled an rcu_node tree - * with the given number of levels. Setting rcu_capacity[0] makes - * some of the arithmetic easier. + * with the given number of levels. */ - rcu_capacity[0] = 1; - rcu_capacity[1] = rcu_fanout_leaf; - for (i = 2; i <= MAX_RCU_LVLS; i++) + rcu_capacity[0] = rcu_fanout_leaf; + for (i = 1; i < MAX_RCU_LVLS; i++) rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; /* * The tree must be able to accommodate the configured number of CPUs. * If this limit is exceeded than we have a serious problem elsewhere. */ - if (nr_cpu_ids > rcu_capacity[MAX_RCU_LVLS]) + if (nr_cpu_ids > rcu_capacity[MAX_RCU_LVLS - 1]) panic("rcu_init_geometry: rcu_capacity[] is too small"); /* Calculate the number of levels in the tree. */ - for (i = 1; nr_cpu_ids > rcu_capacity[i]; i++) { + for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { } - rcu_num_lvls = i; + rcu_num_lvls = i + 1; /* Calculate the number of rcu_nodes at each level of the tree. */ for (i = 0; i < rcu_num_lvls; i++) { - int cap = rcu_capacity[rcu_num_lvls - i]; + int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); } -- cgit v1.2.3 From a6d77081e266605c9f4d8c11e0ee00468b9dc614 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:27 +0200 Subject: rcu: Limit rcu_state::levelcnt[] to RCU_NUM_LVLS items Variable rcu_num_lvls is limited by RCU_NUM_LVLS macro. In turn, rcu_state::levelcnt[] array is never accessed beyond rcu_num_lvls. Thus, rcu_state::levelcnt[] is safe to limit to RCU_NUM_LVLS items. Since rcu_num_lvls could be changed during boot (as result of rcutree.rcu_fanout_leaf kernel parameter update) one might assume a new value could overflow the value of RCU_NUM_LVLS. However, that is not the case, since leaf-level fanout is only permitted to increase, resulting in rcu_num_lvls possibly to decrease. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f1f4784f9107..a6faae53ea8f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -443,7 +443,7 @@ do { \ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ - u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ + u32 levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ -- cgit v1.2.3 From 05b84aec465c34da242a224d2438d192ca0feec7 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:28 +0200 Subject: rcu: Limit rcu_capacity[] size to RCU_NUM_LVLS items Number of items in rcu_capacity[] array is defined by macro MAX_RCU_LVLS. However, that array is never accessed beyond RCU_NUM_LVLS index. Therefore, we can limit the array to RCU_NUM_LVLS items and eliminate MAX_RCU_LVLS. As result, in most cases the memory is conserved. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 ++++++------ kernel/rcu/tree.h | 2 -- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2103beedb49f..2ec7b796f660 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3998,19 +3998,19 @@ static void __init rcu_init_one(struct rcu_state *rsp, "rcu_node_0", "rcu_node_1", "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + "rcu_node_3" }; static const char * const fqs[] = { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + "rcu_node_fqs_3" }; static u8 fl_mask = 0x1; int cpustride = 1; int i; int j; struct rcu_node *rnp; - BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ /* Silence gcc 4.8 false positive about array index out of range. */ if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) @@ -4082,7 +4082,7 @@ static void __init rcu_init_geometry(void) { ulong d; int i; - int rcu_capacity[MAX_RCU_LVLS]; + int rcu_capacity[RCU_NUM_LVLS]; /* * Initialize any unspecified boot parameters. @@ -4122,14 +4122,14 @@ static void __init rcu_init_geometry(void) * with the given number of levels. */ rcu_capacity[0] = rcu_fanout_leaf; - for (i = 1; i < MAX_RCU_LVLS; i++) + for (i = 1; i < RCU_NUM_LVLS; i++) rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; /* * The tree must be able to accommodate the configured number of CPUs. * If this limit is exceeded than we have a serious problem elsewhere. */ - if (nr_cpu_ids > rcu_capacity[MAX_RCU_LVLS - 1]) + if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) panic("rcu_init_geometry: rcu_capacity[] is too small"); /* Calculate the number of levels in the tree. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a6faae53ea8f..d625e9ff0faf 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -36,8 +36,6 @@ * Of course, your mileage may vary. */ -#define MAX_RCU_LVLS 4 - #ifdef CONFIG_RCU_FANOUT #define RCU_FANOUT CONFIG_RCU_FANOUT #else /* #ifdef CONFIG_RCU_FANOUT */ -- cgit v1.2.3 From 199977bff9efceec649d74510fa9754e107ce0c5 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:29 +0200 Subject: rcu: Remove unnecessary fields from rcu_state structure Members rcu_state::levelcnt[] and rcu_state::levelspread[] are only used at init. There is no reason to keep them afterwards. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 27 +++++++++++++++------------ kernel/rcu/tree.h | 2 -- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2ec7b796f660..7226e25ba97f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3967,22 +3967,22 @@ void rcu_scheduler_starting(void) * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on the rcu_fanout_exact boot parameter. */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) +static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) { int i; if (rcu_fanout_exact) { - rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; for (i = rcu_num_lvls - 2; i >= 0; i--) - rsp->levelspread[i] = RCU_FANOUT; + levelspread[i] = RCU_FANOUT; } else { int ccur; int cprv; cprv = nr_cpu_ids; for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; } } @@ -4005,6 +4005,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, "rcu_node_fqs_2", "rcu_node_fqs_3" }; static u8 fl_mask = 0x1; + + int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ + int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ int cpustride = 1; int i; int j; @@ -4019,19 +4022,19 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* Initialize the level-tracking arrays. */ for (i = 0; i < rcu_num_lvls; i++) - rsp->levelcnt[i] = num_rcu_lvl[i]; + levelcnt[i] = num_rcu_lvl[i]; for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; - rcu_init_levelspread(rsp); + rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; + rcu_init_levelspread(levelspread, levelcnt); rsp->flavor_mask = fl_mask; fl_mask <<= 1; /* Initialize the elements themselves, starting from the leaves. */ for (i = rcu_num_lvls - 1; i >= 0; i--) { - cpustride *= rsp->levelspread[i]; + cpustride *= levelspread[i]; rnp = rsp->level[i]; - for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { + for (j = 0; j < levelcnt[i]; j++, rnp++) { raw_spin_lock_init(&rnp->lock); lockdep_set_class_and_name(&rnp->lock, &rcu_node_class[i], buf[i]); @@ -4051,10 +4054,10 @@ static void __init rcu_init_one(struct rcu_state *rsp, rnp->grpmask = 0; rnp->parent = NULL; } else { - rnp->grpnum = j % rsp->levelspread[i - 1]; + rnp->grpnum = j % levelspread[i - 1]; rnp->grpmask = 1UL << rnp->grpnum; rnp->parent = rsp->level[i - 1] + - j / rsp->levelspread[i - 1]; + j / levelspread[i - 1]; } rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d625e9ff0faf..3413f3c5c8b2 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -441,8 +441,6 @@ do { \ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ - u32 levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ - u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ -- cgit v1.2.3 From cb007102398edd06ffc4488bf841c2e10f14d2e7 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:30 +0200 Subject: rcu: Limit count of static data to the number of RCU levels Although a number of RCU levels may be less than the current maximum of four, some static data associated with each level are allocated for all four levels. As result, the extra data never get accessed and just wast memory. This update limits count of allocated items to the number of used RCU levels. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 ++++----------------- kernel/rcu/tree.h | 12 ++++++++++++ 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7226e25ba97f..e53bbc53bcd5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -124,13 +124,8 @@ module_param(rcu_fanout_exact, bool, 0444); static int rcu_fanout_leaf = RCU_FANOUT_LEAF; module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; -static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ - NUM_RCU_LVL_0, - NUM_RCU_LVL_1, - NUM_RCU_LVL_2, - NUM_RCU_LVL_3, - NUM_RCU_LVL_4, -}; +/* Number of rcu_nodes at specified level. */ +static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* @@ -3994,16 +3989,8 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static const char * const buf[] = { - "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; - static const char * const fqs[] = { - "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; + static const char * const buf[] = RCU_NODE_NAME_INIT; + static const char * const fqs[] = RCU_FQS_NAME_INIT; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3413f3c5c8b2..d44856b6170a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -68,6 +68,9 @@ # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -75,6 +78,9 @@ # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -82,6 +88,9 @@ # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_3 (NR_CPUS) # define NUM_RCU_LVL_4 0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -89,6 +98,9 @@ # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_4 (NR_CPUS) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -- cgit v1.2.3 From 426216970e0458c1f507860f4837cbde66a72263 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:31 +0200 Subject: rcu: Simplify arithmetic to calculate number of RCU nodes This update makes arithmetic to calculate number of RCU nodes more straight and easy to read. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 17 ++++------------- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 6 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d44856b6170a..581f8d3c5b28 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -64,10 +64,7 @@ #if NR_CPUS <= RCU_FANOUT_1 # define RCU_NUM_LVLS 1 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (NR_CPUS) -# define NUM_RCU_LVL_2 0 -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES NUM_RCU_LVL_0 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } @@ -75,9 +72,7 @@ # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_2 (NR_CPUS) -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } @@ -86,8 +81,7 @@ # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_3 (NR_CPUS) -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } @@ -97,7 +91,7 @@ # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_4 (NR_CPUS) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } @@ -105,9 +99,6 @@ # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) -#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) - extern int rcu_num_lvls; extern int rcu_num_nodes; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 013485fb2b06..5dac0a10a985 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -84,8 +84,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU torture testing starts during boot.\n"); if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) pr_info("\tAdditional per-CPU info printed with stalls.\n"); - if (NUM_RCU_LVL_4 != 0) - pr_info("\tFour-level hierarchy is enabled.\n"); + if (RCU_NUM_LVLS >= 4) + pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", RCU_FANOUT_LEAF); -- cgit v1.2.3 From d9eba768839ac24e47606af36e50c14f10c2211c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 May 2015 15:35:43 -0700 Subject: rcutorture: Better bounds checking for n_barrier_cbs A negative value for rcutorture.n_barrier_cbs can pass a negative value to the memory allocator, so this commit instead causes rcu_barrier() testing to be disabled in this case. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59e32684c23b..7e29a3266139 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1507,7 +1507,7 @@ static int rcu_torture_barrier_init(void) int i; int ret; - if (n_barrier_cbs == 0) + if (n_barrier_cbs <= 0) return 0; if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { pr_alert("%s" TORTURE_FLAG -- cgit v1.2.3 From 4444d852a99b8f0310f369da8473ec3639e380a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 May 2015 15:42:40 -0700 Subject: rcutorture: Check nfakewriters parameter Currently, a negative value for rcutorture.nfakewriters= can cause rcutorture to pass a negative size to the memory allocator, which is not really a particularly good thing to do. This commit therefore adds bounds checking to this parameter, so that values that are less than or equal to zero disable fake writing. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7e29a3266139..2cbe569ac5dd 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1786,12 +1786,15 @@ rcu_torture_init(void) writer_task); if (firsterr) goto unwind; - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; + if (nfakewriters > 0) { + fakewriter_tasks = kzalloc(nfakewriters * + sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } } for (i = 0; i < nfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, -- cgit v1.2.3 From e8e255f7191fb6491dd1d96cfbbe19981f6eb3dd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 May 2015 16:55:45 -0700 Subject: rcutorture: Bounds-check rcutorture.shuffle_interval Specifying a negative rcutorture.shuffle_interval value will cause a negative value to be used as a sleep time. This commit therefore refuses to start shuffling unless the rcutorture.shuffle_interval value is greater than zero. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2cbe569ac5dd..1cead7806ca6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1821,7 +1821,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; } - if (test_no_idle_hz) { + if (test_no_idle_hz && shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval * HZ); if (firsterr) goto unwind; -- cgit v1.2.3 From 3a0af333415830d2a0ca77de832336af5aadced4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Jun 2015 18:11:31 -0700 Subject: rcutorture: Fix rcu_torture_cbflood() for callback-free RCU The rcu_torture_cbflood() function correctly checks for flavors of RCU that lack analogs to call_rcu() and rcu_barrier(), but in that case it fails to terminate correctly. In fact, it terminates so incorrectly that segfaults can result. This commit therefore causes rcu_torture_cbflood() to do the proper wait-for-stop procedure. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1cead7806ca6..e0eda3c1b621 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -823,9 +823,7 @@ rcu_torture_cbflood(void *arg) } if (err) { VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); - while (!torture_must_stop()) - schedule_timeout_interruptible(HZ); - return 0; + goto wait_for_stop; } VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); do { @@ -844,6 +842,7 @@ rcu_torture_cbflood(void *arg) stutter_wait("rcu_torture_cbflood"); } while (!torture_must_stop()); vfree(rhp); +wait_for_stop: torture_kthread_stopping("rcu_torture_cbflood"); return 0; } -- cgit v1.2.3 From 5be5d1a11775fadc6104789fad72fae46dff348e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2015 08:57:57 -0700 Subject: rcutorture: Add RCU-tasks qualifier to dereference Although RCU-tasks isn't really designed to support rcu_dereference() and list manipulation, that is how rcutorture tests it. Which means that lockdep-RCU complains about the rcu_dereference_check() invocations because RCU-tasks doesn't have read-side markers. This commit therefore creates a torturing_tasks() to silence the lockdep-RCU complaints from rcu_dereference_check() when RCU-tasks is being tortured. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e0eda3c1b621..67b3f260720e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -684,10 +684,20 @@ static struct rcu_torture_ops tasks_ops = { #define RCUTORTURE_TASKS_OPS &tasks_ops, +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + #else /* #ifdef CONFIG_TASKS_RCU */ #define RCUTORTURE_TASKS_OPS +static bool torturing_tasks(void) +{ + return false; +} + #endif /* #else #ifdef CONFIG_TASKS_RCU */ /* @@ -1087,7 +1097,8 @@ static void rcu_torture_timer(unsigned long unused) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp)); + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -1161,7 +1172,8 @@ rcu_torture_reader(void *arg) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp)); + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); -- cgit v1.2.3 From 032dfc87225c96ec1771e5967436c4b23d1dc5d6 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 9 Jul 2015 15:34:23 +0200 Subject: rcu: Shut up bogus gcc array bounds warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because gcc does not realize a loop would not be entered ever (i.e. in case of rcu_num_lvls == 1): for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; some compiler (pre- 5.x?) versions give a bogus warning: kernel/rcu/tree.c: In function ‘rcu_init_one.isra.55’: kernel/rcu/tree.c:4108:13: warning: array subscript is above array bounds [-Warray-bounds] rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; ^ Fix that warning by adding an extra item to rcu_state::level[] array. Once the bogus warning is fixed in gcc and kernel drops support of older versions, the dummy item may be removed from the array. Cc: "Paul E. McKenney" Suggested-by: "Paul E. McKenney" Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 581f8d3c5b28..faee5242d6ff 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -443,7 +443,9 @@ do { \ */ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ + struct rcu_node *level[RCU_NUM_LVLS + 1]; + /* Hierarchy levels (+1 to */ + /* shut bogus gcc warning) */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ -- cgit v1.2.3 From 13bd64947f53ba8d7199922be94b6626b8e222d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Jun 2015 10:06:01 -0700 Subject: rcu: Reset rcu_fanout_leaf if out of bounds Currently if the rcu_fanout_leaf boot parameter is out of bounds (that is, less than RCU_FANOUT_LEAF or greater than the number of bits in an unsigned long), a warning is issued and execution continues with the out-of-bounds value. This can result in all manner of failures, so this patch resets rcu_fanout_leaf to RCU_FANOUT_LEAF when an out-of-bounds condition is detected. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e53bbc53bcd5..a2147d7b51c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4103,6 +4103,7 @@ static void __init rcu_init_geometry(void) */ if (rcu_fanout_leaf < RCU_FANOUT_LEAF || rcu_fanout_leaf > sizeof(unsigned long) * 8) { + rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); return; } -- cgit v1.2.3 From 9b683874504a57cfa97558d403c75e286e20c9ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Jun 2015 14:50:22 -0700 Subject: rcu: Stop disabling CPU hotplug in synchronize_rcu_expedited() The fact that tasks could be migrated from leaf to root rcu_node structures meant that synchronize_rcu_expedited() had to disable CPU hotplug. However, tasks now stay put, so this commit removes the CPU-hotplug disabling from synchronize_rcu_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5dac0a10a985..7234f03e0aa2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -727,20 +727,6 @@ void synchronize_rcu_expedited(void) snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1; smp_mb(); /* Above access cannot bleed into critical section. */ - /* - * Block CPU-hotplug operations. This means that any CPU-hotplug - * operation that finds an rcu_node structure with tasks in the - * process of being boosted will know that all tasks blocking - * this expedited grace period will already be in the process of - * being boosted. This simplifies the process of moving tasks - * from leaf to root rcu_node structures. - */ - if (!try_get_online_cpus()) { - /* CPU-hotplug operation in flight, fall back to normal GP. */ - wait_rcu_gp(call_rcu); - return; - } - /* * Acquire lock, falling back to synchronize_rcu() if too many * lock-acquisition failures. Of course, if someone does the @@ -748,22 +734,17 @@ void synchronize_rcu_expedited(void) */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { if (ULONG_CMP_LT(snap, - READ_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); + READ_ONCE(sync_rcu_preempt_exp_count))) goto mb_ret; /* Others did our work for us. */ - } if (trycount++ < 10) { udelay(trycount * num_online_cpus()); } else { - put_online_cpus(); wait_rcu_gp(call_rcu); return; } } - if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); + if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) goto unlock_mb_ret; /* Others did our work for us. */ - } /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); @@ -779,8 +760,6 @@ void synchronize_rcu_expedited(void) rcu_for_each_leaf_node(rsp, rnp) sync_rcu_preempt_exp_init2(rsp, rnp); - put_online_cpus(); - /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); wait_event(sync_rcu_preempt_exp_wq, -- cgit v1.2.3 From 75c27f119b6475d95374bdad872c6938b5c26196 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Jun 2015 15:22:43 -0700 Subject: rcu: Remove CONFIG_RCU_CPU_STALL_INFO The CONFIG_RCU_CPU_STALL_INFO has been default-y for a couple of releases with no complaints, so it is time to eliminate this Kconfig option entirely, so that the long-form RCU CPU stall warnings cannot be disabled. This commit does just that. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ---- kernel/rcu/tree_plugin.h | 45 --------------------------------------------- 2 files changed, 49 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index faee5242d6ff..7c0b09d754a1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -288,12 +288,10 @@ struct rcu_data { bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO unsigned long ticks_this_gp; /* The number of scheduling-clock */ /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ /* 2) batch handling */ /* @@ -388,9 +386,7 @@ struct rcu_data { #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO unsigned int softirq_snap; /* Snapshot of softirq activity. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ int cpu; struct rcu_state *rsp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7234f03e0aa2..ef41c1b04ba6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -82,8 +82,6 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) pr_info("\tRCU torture testing starts during boot.\n"); - if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) - pr_info("\tAdditional per-CPU info printed with stalls.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -418,8 +416,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) rcu_print_detail_task_stall_rnp(rnp); } -#ifdef CONFIG_RCU_CPU_STALL_INFO - static void rcu_print_task_stall_begin(struct rcu_node *rnp) { pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", @@ -431,18 +427,6 @@ static void rcu_print_task_stall_end(void) pr_cont("\n"); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ -} - -static void rcu_print_task_stall_end(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. @@ -1685,8 +1669,6 @@ early_initcall(rcu_register_oom_notifier); #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_CPU_STALL_INFO - #ifdef CONFIG_RCU_FAST_NO_HZ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) @@ -1775,33 +1757,6 @@ static void increment_cpu_stall_ticks(void) raw_cpu_inc(rsp->rda->ticks_this_gp); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void print_cpu_stall_info_begin(void) -{ - pr_cont(" {"); -} - -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) -{ - pr_cont(" %d", cpu); -} - -static void print_cpu_stall_info_end(void) -{ - pr_cont("} "); -} - -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ -} - -static void increment_cpu_stall_ticks(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - #ifdef CONFIG_RCU_NOCB_CPU /* -- cgit v1.2.3 From c190c3b16c0f56ff338df12df53c03859155951b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Jun 2015 19:03:45 -0700 Subject: rcu: Switch synchronize_sched_expedited() to stop_one_cpu() The synchronize_sched_expedited() currently invokes try_stop_cpus(), which schedules the stopper kthreads on each online non-idle CPU, and waits until all those kthreads are running before letting any of them stop. This is disastrous for real-time workloads, which get hit with a preemption that is as long as the longest scheduling latency on any CPU, including any non-realtime housekeeping CPUs. This commit therefore switches to using stop_one_cpu() on each CPU in turn. This avoids inflicting the worst-case scheduling latency on the worst-case CPU onto all other CPUs, and also simplifies the code a little bit. Follow-up commits will simplify the counter-snapshotting algorithm and convert a number of the counters that are now protected by the new ->expedited_mutex to non-atomic. Signed-off-by: Peter Zijlstra [ paulmck: Kept stop_one_cpu(), dropped disabling of "guardrails". ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 41 ++++++++++++++--------------------------- kernel/rcu/tree.h | 1 + 2 files changed, 15 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a2147d7b51c0..ae39a49daa58 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ + .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ } @@ -3305,8 +3306,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data) */ void synchronize_sched_expedited(void) { - cpumask_var_t cm; - bool cma = false; int cpu; long firstsnap, s, snap; int trycount = 0; @@ -3342,28 +3341,11 @@ void synchronize_sched_expedited(void) } WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ - cma = zalloc_cpumask_var(&cm, GFP_KERNEL); - if (cma) { - cpumask_copy(cm, cpu_online_mask); - cpumask_clear_cpu(raw_smp_processor_id(), cm); - for_each_cpu(cpu, cm) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - cpumask_clear_cpu(cpu, cm); - } - if (cpumask_weight(cm) == 0) - goto all_cpus_idle; - } - /* * Each pass through the following loop attempts to force a * context switch on each CPU. */ - while (try_stop_cpus(cma ? cm : cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { + while (!mutex_trylock(&rsp->expedited_mutex)) { put_online_cpus(); atomic_long_inc(&rsp->expedited_tryfail); @@ -3373,7 +3355,6 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); - free_cpumask_var(cm); return; } @@ -3383,7 +3364,6 @@ void synchronize_sched_expedited(void) } else { wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_normal); - free_cpumask_var(cm); return; } @@ -3393,7 +3373,6 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); - free_cpumask_var(cm); return; } @@ -3408,16 +3387,23 @@ void synchronize_sched_expedited(void) /* CPU hotplug operation in flight, use normal GP. */ wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_normal); - free_cpumask_var(cm); return; } snap = atomic_long_read(&rsp->expedited_start); smp_mb(); /* ensure read is before try_stop_cpus(). */ } - atomic_long_inc(&rsp->expedited_stoppedcpus); -all_cpus_idle: - free_cpumask_var(cm); + /* Stop each CPU that is online, non-idle, and not us. */ + for_each_online_cpu(cpu) { + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Skip our CPU and any idle CPUs. */ + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + continue; + stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL); + } + atomic_long_inc(&rsp->expedited_stoppedcpus); /* * Everyone up to our most recent fetch is covered by our grace @@ -3436,6 +3422,7 @@ all_cpus_idle: } } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); atomic_long_inc(&rsp->expedited_done_exit); + mutex_unlock(&rsp->expedited_mutex); put_online_cpus(); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7c0b09d754a1..7c25fe473ad9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -480,6 +480,7 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + struct mutex expedited_mutex; /* Serializes expediting. */ atomic_long_t expedited_start; /* Starting ticket. */ atomic_long_t expedited_done; /* Done ticket. */ atomic_long_t expedited_wrap; /* # near-wrap incidents. */ -- cgit v1.2.3 From d6ada2cf2f81dab8a231d0ef8fb5dec4f5ac8379 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Jun 2015 10:46:30 -0700 Subject: rcu: Rework synchronize_sched_expedited() counter handling Now that synchronize_sched_expedited() have a mutex, it can use simpler work-already-done detection scheme. This commit simplifies this scheme by using something similar to the sequence-locking counter scheme. A counter is incremented before and after each grace period, so that the counter is odd in the midst of the grace period and even otherwise. So if the counter has advanced to the second even number that is greater than or equal to the snapshot, the required grace period has already happened. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 98 +++++++++++++++---------------------------------- kernel/rcu/tree.h | 9 +---- kernel/rcu/tree_trace.c | 12 ++---- 3 files changed, 36 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ae39a49daa58..3c182fdec805 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3283,56 +3283,24 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * restructure your code to batch your updates, and then use a single * synchronize_sched() instead. * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word. Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs. If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period. We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. - * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done. If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot. In this case, our work is - * done for us, and we can simply return. Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. - * - * If we fail too many times in a row, we fall back to synchronize_sched(). + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. We do a mutex_trylock() polling loop, but if we fail + * too many times in a row, we fall back to synchronize_sched(). */ void synchronize_sched_expedited(void) { int cpu; - long firstsnap, s, snap; + long s; int trycount = 0; struct rcu_state *rsp = &rcu_sched_state; - /* - * If we are in danger of counter wrap, just do synchronize_sched(). - * By allowing sync_sched_expedited_started to advance no more than - * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring - * that more than 3.5 billion CPUs would be required to force a - * counter wrap on a 32-bit system. Quite a few more CPUs would of - * course be required on a 64-bit system. - */ - if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), - (ulong)atomic_long_read(&rsp->expedited_done) + - ULONG_MAX / 8)) { - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_wrap); - return; - } + /* Take a snapshot of the sequence number. */ + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = (READ_ONCE(rsp->expedited_sequence) + 3) & ~0x1; + smp_mb(); /* Above access must not bleed into critical section. */ - /* - * Take a ticket. Note that atomic_inc_return() implies a - * full memory barrier. - */ - snap = atomic_long_inc_return(&rsp->expedited_start); - firstsnap = snap; if (!try_get_online_cpus()) { /* CPU hotplug operation in flight, fall back to normal GP. */ wait_rcu_gp(call_rcu_sched); @@ -3342,16 +3310,15 @@ void synchronize_sched_expedited(void) WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); /* - * Each pass through the following loop attempts to force a - * context switch on each CPU. + * Each pass through the following loop attempts to acquire + * ->expedited_mutex, checking for others doing our work each time. */ while (!mutex_trylock(&rsp->expedited_mutex)) { put_online_cpus(); atomic_long_inc(&rsp->expedited_tryfail); /* Check to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { + if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); @@ -3368,8 +3335,7 @@ void synchronize_sched_expedited(void) } /* Recheck to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { + if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); @@ -3389,10 +3355,20 @@ void synchronize_sched_expedited(void) atomic_long_inc(&rsp->expedited_normal); return; } - snap = atomic_long_read(&rsp->expedited_start); - smp_mb(); /* ensure read is before try_stop_cpus(). */ } + /* Recheck yet again to see if someone else did our work for us. */ + if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { + rsp->expedited_workdone3++; + mutex_unlock(&rsp->expedited_mutex); + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + + WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); + smp_mb(); /* Ensure expedited GP seen after counter increment. */ + WARN_ON_ONCE(!(rsp->expedited_sequence & 0x1)); + /* Stop each CPU that is online, non-idle, and not us. */ for_each_online_cpu(cpu) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); @@ -3403,26 +3379,12 @@ void synchronize_sched_expedited(void) continue; stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL); } - atomic_long_inc(&rsp->expedited_stoppedcpus); - /* - * Everyone up to our most recent fetch is covered by our grace - * period. Update the counter, but only if our work is still - * relevant -- which it won't be if someone who started later - * than we did already did their update. - */ - do { - atomic_long_inc(&rsp->expedited_done_tries); - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_done_lost); - break; - } - } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); - atomic_long_inc(&rsp->expedited_done_exit); + smp_mb(); /* Ensure expedited GP seen before counter increment. */ + WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); + WARN_ON_ONCE(rsp->expedited_sequence & 0x1); mutex_unlock(&rsp->expedited_mutex); + smp_mb(); /* ensure subsequent action seen after grace period. */ put_online_cpus(); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7c25fe473ad9..6a2b741436de 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -481,17 +481,12 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ struct mutex expedited_mutex; /* Serializes expediting. */ - atomic_long_t expedited_start; /* Starting ticket. */ - atomic_long_t expedited_done; /* Done ticket. */ - atomic_long_t expedited_wrap; /* # near-wrap incidents. */ + unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_tryfail; /* # acquisition failures. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ + unsigned long expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ - atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ - atomic_long_t expedited_done_tries; /* # tries to update _done. */ - atomic_long_t expedited_done_lost; /* # times beaten to _done. */ - atomic_long_t expedited_done_exit; /* # times exited _done loop. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 3ea7ffc7d5c4..a1ab3a5f6290 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,18 +185,14 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", - atomic_long_read(&rsp->expedited_start), - atomic_long_read(&rsp->expedited_done), - atomic_long_read(&rsp->expedited_wrap), + seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu sc=%lu\n", + rsp->expedited_sequence, atomic_long_read(&rsp->expedited_tryfail), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), + rsp->expedited_workdone3, atomic_long_read(&rsp->expedited_normal), - atomic_long_read(&rsp->expedited_stoppedcpus), - atomic_long_read(&rsp->expedited_done_tries), - atomic_long_read(&rsp->expedited_done_lost), - atomic_long_read(&rsp->expedited_done_exit)); + rsp->expedited_sequence / 2); return 0; } -- cgit v1.2.3 From 385b73c06f6a733547d0a7714d0c4cb4c8788b88 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Jun 2015 14:20:08 -0700 Subject: rcu: Get rid of synchronize_sched_expedited()'s polling loop This commit gets rid of synchronize_sched_expedited()'s mutex_trylock() polling loop in favor of a funnel-locking scheme based on the rcu_node tree. The work-done check is done at each level of the tree, allowing high-contention situations to be resolved quickly with reasonable levels of mutex contention. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 95 +++++++++++++++++++++---------------------------- kernel/rcu/tree.h | 8 +++-- kernel/rcu/tree_trace.c | 3 +- 3 files changed, 47 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3c182fdec805..b310b40a49a2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree"); static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; /* * In order to export the rcu_state name to the tracing tools, it @@ -103,7 +104,6 @@ struct rcu_state sname##_state = { \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ } @@ -3272,6 +3272,22 @@ static int synchronize_sched_expedited_cpu_stop(void *data) return 0; } +/* Common code for synchronize_sched_expedited() work-done checking. */ +static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, + atomic_long_t *stat, unsigned long s) +{ + if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { + if (rnp) + mutex_unlock(&rnp->exp_funnel_mutex); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + put_online_cpus(); + return true; + } + return false; +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3286,15 +3302,15 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * This implementation can be thought of as an application of sequence * locking to expedited grace periods, but using the sequence counter to * determine when someone else has already done the work instead of for - * retrying readers. We do a mutex_trylock() polling loop, but if we fail - * too many times in a row, we fall back to synchronize_sched(). + * retrying readers. */ void synchronize_sched_expedited(void) { int cpu; long s; - int trycount = 0; struct rcu_state *rsp = &rcu_sched_state; + struct rcu_node *rnp0; + struct rcu_node *rnp1 = NULL; /* Take a snapshot of the sequence number. */ smp_mb(); /* Caller's modifications seen first by other CPUs. */ @@ -3310,60 +3326,25 @@ void synchronize_sched_expedited(void) WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); /* - * Each pass through the following loop attempts to acquire - * ->expedited_mutex, checking for others doing our work each time. + * Each pass through the following loop works its way + * up the rcu_node tree, returning if others have done the + * work or otherwise falls through holding the root rnp's + * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure + * can be inexact, as it is just promoting locality and is not + * strictly needed for correctness. */ - while (!mutex_trylock(&rsp->expedited_mutex)) { - put_online_cpus(); - atomic_long_inc(&rsp->expedited_tryfail); - - /* Check to see if someone else did our work for us. */ - if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone1); - return; - } - - /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); + rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + for (; rnp0 != NULL; rnp0 = rnp0->parent) { + if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s)) return; - } - - /* Recheck to see if someone else did our work for us. */ - if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone2); - return; - } - - /* - * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We retry - * after they started, so our grace period works for them, - * and they started after our first try, so their grace - * period works for us. - */ - if (!try_get_online_cpus()) { - /* CPU hotplug operation in flight, use normal GP. */ - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - return; - } + mutex_lock(&rnp0->exp_funnel_mutex); + if (rnp1) + mutex_unlock(&rnp1->exp_funnel_mutex); + rnp1 = rnp0; } - - /* Recheck yet again to see if someone else did our work for us. */ - if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { - rsp->expedited_workdone3++; - mutex_unlock(&rsp->expedited_mutex); - smp_mb(); /* ensure test happens before caller kfree */ + rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */ + if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s)) return; - } WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); smp_mb(); /* Ensure expedited GP seen after counter increment. */ @@ -3383,7 +3364,7 @@ void synchronize_sched_expedited(void) smp_mb(); /* Ensure expedited GP seen before counter increment. */ WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); WARN_ON_ONCE(rsp->expedited_sequence & 0x1); - mutex_unlock(&rsp->expedited_mutex); + mutex_unlock(&rnp0->exp_funnel_mutex); smp_mb(); /* ensure subsequent action seen after grace period. */ put_online_cpus(); @@ -3940,6 +3921,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; + static const char * const exp[] = RCU_EXP_NAME_INIT; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -3998,6 +3980,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); + mutex_init(&rnp->exp_funnel_mutex); + lockdep_set_class_and_name(&rnp->exp_funnel_mutex, + &rcu_exp_class[i], exp[i]); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6a2b741436de..2ef036b356f7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -68,6 +68,7 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -76,6 +77,7 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -85,6 +87,7 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -95,6 +98,7 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -237,6 +241,8 @@ struct rcu_node { int need_future_gp[2]; /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; + + struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; /* @@ -480,12 +486,10 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ - struct mutex expedited_mutex; /* Serializes expediting. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_tryfail; /* # acquisition failures. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ - unsigned long expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index a1ab3a5f6290..d2aab8dcd58e 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,12 +185,11 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu sc=%lu\n", + seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n", rsp->expedited_sequence, atomic_long_read(&rsp->expedited_tryfail), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), - rsp->expedited_workdone3, atomic_long_read(&rsp->expedited_normal), rsp->expedited_sequence / 2); return 0; -- cgit v1.2.3 From 3a6d7c64d78a78d279851524d39999637a549363 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jun 2015 11:27:10 -0700 Subject: rcu: Make expedited GP CPU stoppage asynchronous Sequentially stopping the CPUs slows down expedited grace periods by at least a factor of two, based on rcutorture's grace-period-per-second rate. This is a conservative measure because rcutorture uses unusually long RCU read-side critical sections and because rcutorture periodically quiesces the system in order to test RCU's ability to ramp down to and up from the idle state. This commit therefore replaces the stop_one_cpu() with stop_one_cpu_nowait(), using an atomic-counter scheme to determine when all CPUs have passed through the stopped state. Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 31 +++++++++++++++++-------------- kernel/rcu/tree.h | 6 ++++++ kernel/rcu/tree_trace.c | 3 ++- 3 files changed, 25 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b310b40a49a2..c5c8509054ef 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3257,18 +3257,11 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu); static int synchronize_sched_expedited_cpu_stop(void *data) { - /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. - */ - smp_mb(); /* See above comment block. */ + struct rcu_state *rsp = data; + + /* We are here: If we are last, do the wakeup. */ + if (atomic_dec_and_test(&rsp->expedited_need_qs)) + wake_up(&rsp->expedited_wq); return 0; } @@ -3308,9 +3301,9 @@ void synchronize_sched_expedited(void) { int cpu; long s; - struct rcu_state *rsp = &rcu_sched_state; struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; + struct rcu_state *rsp = &rcu_sched_state; /* Take a snapshot of the sequence number. */ smp_mb(); /* Caller's modifications seen first by other CPUs. */ @@ -3351,16 +3344,26 @@ void synchronize_sched_expedited(void) WARN_ON_ONCE(!(rsp->expedited_sequence & 0x1)); /* Stop each CPU that is online, non-idle, and not us. */ + init_waitqueue_head(&rsp->expedited_wq); + atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ for_each_online_cpu(cpu) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); /* Skip our CPU and any idle CPUs. */ if (raw_smp_processor_id() == cpu || !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) continue; - stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL); + atomic_inc(&rsp->expedited_need_qs); + stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, + rsp, &rdp->exp_stop_work); } + /* Remove extra count and, if necessary, wait for CPUs to stop. */ + if (!atomic_dec_and_test(&rsp->expedited_need_qs)) + wait_event(rsp->expedited_wq, + !atomic_read(&rsp->expedited_need_qs)); + smp_mb(); /* Ensure expedited GP seen before counter increment. */ WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); WARN_ON_ONCE(rsp->expedited_sequence & 0x1); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ef036b356f7..4edc277d08eb 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include #include #include +#include /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -298,6 +299,9 @@ struct rcu_data { /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ + struct cpu_stop_work exp_stop_work; + /* Expedited grace-period control */ + /* for CPU stopping. */ /* 2) batch handling */ /* @@ -491,6 +495,8 @@ struct rcu_state { atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ + atomic_t expedited_need_qs; /* # CPUs left to check in. */ + wait_queue_head_t expedited_wq; /* Wait for check-ins. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d2aab8dcd58e..36c04b46d3b8 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,12 +185,13 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n", + seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, atomic_long_read(&rsp->expedited_tryfail), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), atomic_long_read(&rsp->expedited_normal), + atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); return 0; } -- cgit v1.2.3 From 28f00767e3db933cacc3030f4d9736acd037be2c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2015 15:00:58 -0700 Subject: rcu: Abstract sequence counting from synchronize_sched_expedited() This commit creates rcu_exp_gp_seq_start() and rcu_exp_gp_seq_end() to bracket an expedited grace period, rcu_exp_gp_seq_snap() to snapshot the sequence counter, and rcu_exp_gp_seq_done() to check to see if a full expedited grace period has elapsed since the snapshot. These will be applied to synchronize_rcu_expedited(). These are defined in terms of underlying rcu_seq_start(), rcu_seq_end(), rcu_seq_snap(), rcu_seq_done(), which will be applied to _rcu_barrier(). One reason that this commit doesn't use the seqcount primitives themselves is that the smp_wmb() in those primitive is insufficient due to the fact that expedited grace periods do reads as well as writes. In addition, the read-side seqcount primitives detect a potentially partial change, where the expedited primitives instead need a guaranteed full change. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c5c8509054ef..67fe75725486 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3255,6 +3255,60 @@ void cond_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); +/* Adjust sequence number for start of update-side operation. */ +static void rcu_seq_start(unsigned long *sp) +{ + WRITE_ONCE(*sp, *sp + 1); + smp_mb(); /* Ensure update-side operation after counter increment. */ + WARN_ON_ONCE(!(*sp & 0x1)); +} + +/* Adjust sequence number for end of update-side operation. */ +static void rcu_seq_end(unsigned long *sp) +{ + smp_mb(); /* Ensure update-side operation before counter increment. */ + WRITE_ONCE(*sp, *sp + 1); + WARN_ON_ONCE(*sp & 0x1); +} + +/* Take a snapshot of the update side's sequence number. */ +static unsigned long rcu_seq_snap(unsigned long *sp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = (READ_ONCE(*sp) + 3) & ~0x1; + smp_mb(); /* Above access must not bleed into critical section. */ + return s; +} + +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred. + */ +static bool rcu_seq_done(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_GE(READ_ONCE(*sp), s); +} + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + return rcu_seq_snap(&rsp->expedited_sequence); +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + static int synchronize_sched_expedited_cpu_stop(void *data) { struct rcu_state *rsp = data; @@ -3269,7 +3323,7 @@ static int synchronize_sched_expedited_cpu_stop(void *data) static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, atomic_long_t *stat, unsigned long s) { - if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { + if (rcu_exp_gp_seq_done(rsp, s)) { if (rnp) mutex_unlock(&rnp->exp_funnel_mutex); /* Ensure test happens before caller kfree(). */ @@ -3306,9 +3360,7 @@ void synchronize_sched_expedited(void) struct rcu_state *rsp = &rcu_sched_state; /* Take a snapshot of the sequence number. */ - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - s = (READ_ONCE(rsp->expedited_sequence) + 3) & ~0x1; - smp_mb(); /* Above access must not bleed into critical section. */ + s = rcu_exp_gp_seq_snap(rsp); if (!try_get_online_cpus()) { /* CPU hotplug operation in flight, fall back to normal GP. */ @@ -3339,9 +3391,7 @@ void synchronize_sched_expedited(void) if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s)) return; - WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); - smp_mb(); /* Ensure expedited GP seen after counter increment. */ - WARN_ON_ONCE(!(rsp->expedited_sequence & 0x1)); + rcu_exp_gp_seq_start(rsp); /* Stop each CPU that is online, non-idle, and not us. */ init_waitqueue_head(&rsp->expedited_wq); @@ -3364,9 +3414,7 @@ void synchronize_sched_expedited(void) wait_event(rsp->expedited_wq, !atomic_read(&rsp->expedited_need_qs)); - smp_mb(); /* Ensure expedited GP seen before counter increment. */ - WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); - WARN_ON_ONCE(rsp->expedited_sequence & 0x1); + rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp0->exp_funnel_mutex); smp_mb(); /* ensure subsequent action seen after grace period. */ -- cgit v1.2.3 From 543c6158f6dff20a741dfa492771f18ceaa1a109 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2015 15:52:50 -0700 Subject: rcu: Make synchronize_rcu_expedited() use sequence-counter scheme Although synchronize_rcu_expedited() uses a sequence-counter scheme, it is based on a single increment per grace period, which means that tasks piggybacking off of concurrent grace periods may be forced to wait longer than necessary. This commit therefore applies the new sequence-count functions developed for synchronize_sched_expedited() to speed things up a bit and to consolidate the sequence-counter implementation. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ef41c1b04ba6..759883f51de7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -536,7 +536,6 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); /* @@ -704,12 +703,10 @@ void synchronize_rcu_expedited(void) { struct rcu_node *rnp; struct rcu_state *rsp = rcu_state_p; - unsigned long snap; + unsigned long s; int trycount = 0; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1; - smp_mb(); /* Above access cannot bleed into critical section. */ + s = rcu_exp_gp_seq_snap(rsp); /* * Acquire lock, falling back to synchronize_rcu() if too many @@ -717,8 +714,7 @@ void synchronize_rcu_expedited(void) * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (ULONG_CMP_LT(snap, - READ_ONCE(sync_rcu_preempt_exp_count))) + if (rcu_exp_gp_seq_done(rsp, s)) goto mb_ret; /* Others did our work for us. */ if (trycount++ < 10) { udelay(trycount * num_online_cpus()); @@ -727,8 +723,9 @@ void synchronize_rcu_expedited(void) return; } } - if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) + if (rcu_exp_gp_seq_done(rsp, s)) goto unlock_mb_ret; /* Others did our work for us. */ + rcu_exp_gp_seq_start(rsp); /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); @@ -750,8 +747,7 @@ void synchronize_rcu_expedited(void) sync_rcu_preempt_exp_done(rnp)); /* Clean up and exit. */ - smp_mb(); /* ensure expedited GP seen before counter increment. */ - WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1); + rcu_exp_gp_seq_end(rsp); unlock_mb_ret: mutex_unlock(&sync_rcu_preempt_exp_mutex); mb_ret: -- cgit v1.2.3 From b09e5f8601d7e5b8d45348c9c09e1fb4109e8dc6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2015 16:30:54 -0700 Subject: rcu: Abstract funnel locking from synchronize_sched_expedited() This commit abstracts funnel locking from synchronize_sched_expedited() so that it may be used by synchronize_rcu_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 80 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 67fe75725486..f79a1c646846 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3309,16 +3309,6 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) return rcu_seq_done(&rsp->expedited_sequence, s); } -static int synchronize_sched_expedited_cpu_stop(void *data) -{ - struct rcu_state *rsp = data; - - /* We are here: If we are last, do the wakeup. */ - if (atomic_dec_and_test(&rsp->expedited_need_qs)) - wake_up(&rsp->expedited_wq); - return 0; -} - /* Common code for synchronize_sched_expedited() work-done checking. */ static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, atomic_long_t *stat, unsigned long s) @@ -3335,6 +3325,48 @@ static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, return false; } +/* + * Funnel-lock acquisition for expedited grace periods. Returns a + * pointer to the root rcu_node structure, or NULL if some other + * task did the expedited grace period for us. + */ +static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp0; + struct rcu_node *rnp1 = NULL; + + /* + * Each pass through the following loop works its way + * up the rcu_node tree, returning if others have done the + * work or otherwise falls through holding the root rnp's + * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure + * can be inexact, as it is just promoting locality and is not + * strictly needed for correctness. + */ + rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + for (; rnp0 != NULL; rnp0 = rnp0->parent) { + if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s)) + return NULL; + mutex_lock(&rnp0->exp_funnel_mutex); + if (rnp1) + mutex_unlock(&rnp1->exp_funnel_mutex); + rnp1 = rnp0; + } + if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone2, s)) + return NULL; + return rnp1; +} + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + struct rcu_state *rsp = data; + + /* We are here: If we are last, do the wakeup. */ + if (atomic_dec_and_test(&rsp->expedited_need_qs)) + wake_up(&rsp->expedited_wq); + return 0; +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3355,8 +3387,7 @@ void synchronize_sched_expedited(void) { int cpu; long s; - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; + struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; /* Take a snapshot of the sequence number. */ @@ -3370,26 +3401,9 @@ void synchronize_sched_expedited(void) } WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. - */ - rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s)) - return; - mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) - mutex_unlock(&rnp1->exp_funnel_mutex); - rnp1 = rnp0; - } - rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */ - if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s)) - return; + rnp = exp_funnel_lock(rsp, s); + if (rnp == NULL) + return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); @@ -3415,7 +3429,7 @@ void synchronize_sched_expedited(void) !atomic_read(&rsp->expedited_need_qs)); rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp0->exp_funnel_mutex); + mutex_unlock(&rnp->exp_funnel_mutex); smp_mb(); /* ensure subsequent action seen after grace period. */ put_online_cpus(); -- cgit v1.2.3 From 7fd0ddc5bf1ab5259c80a53a01984e13befd658b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2015 16:35:03 -0700 Subject: rcu: Fix synchronize_sched_expedited() type error for "s" The type of "s" has been "long" rather than the correct "unsigned long" for quite some time. This commit fixes this type error. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f79a1c646846..094ed8ff82b4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3386,7 +3386,7 @@ static int synchronize_sched_expedited_cpu_stop(void *data) void synchronize_sched_expedited(void) { int cpu; - long s; + unsigned long s; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; -- cgit v1.2.3 From 29fd930940193a9a035a75a3847457160d65559a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2015 19:03:16 -0700 Subject: rcu: Use funnel locking for synchronize_rcu_expedited()'s polling loop This commit gets rid of synchronize_rcu_expedited()'s mutex_trylock() polling loop in favor of the funnel-locking scheme that was abstracted from synchronize_sched_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 ++++++++------- kernel/rcu/tree_plugin.h | 36 ++++++++++-------------------------- 2 files changed, 18 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 094ed8ff82b4..338ea61929bd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3309,9 +3309,9 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) return rcu_seq_done(&rsp->expedited_sequence, s); } -/* Common code for synchronize_sched_expedited() work-done checking. */ -static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, - atomic_long_t *stat, unsigned long s) +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, + atomic_long_t *stat, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { if (rnp) @@ -3319,7 +3319,6 @@ static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); - put_online_cpus(); return true; } return false; @@ -3345,14 +3344,14 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) */ rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, rnp1, &rsp->expedited_workdone1, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) mutex_unlock(&rnp1->exp_funnel_mutex); rnp1 = rnp0; } - if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone2, s)) + if (sync_exp_work_done(rsp, rnp1, &rsp->expedited_workdone2, s)) return NULL; return rnp1; } @@ -3402,8 +3401,10 @@ void synchronize_sched_expedited(void) WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) + if (rnp == NULL) { + put_online_cpus(); return; /* Someone else did our work for us. */ + } rcu_exp_gp_seq_start(rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 759883f51de7..f0d71449ec0c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -536,7 +536,6 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); /* * Return non-zero if there are any tasks in RCU read-side critical @@ -556,7 +555,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -572,7 +571,7 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -611,7 +610,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 * that work is needed here. * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) @@ -654,7 +653,7 @@ sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, * enabling rcu_read_unlock_special() to do the bit-clearing. * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) @@ -702,29 +701,16 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) void synchronize_rcu_expedited(void) { struct rcu_node *rnp; + struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; unsigned long s; - int trycount = 0; s = rcu_exp_gp_seq_snap(rsp); - /* - * Acquire lock, falling back to synchronize_rcu() if too many - * lock-acquisition failures. Of course, if someone does the - * expedited grace period for us, just leave. - */ - while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (rcu_exp_gp_seq_done(rsp, s)) - goto mb_ret; /* Others did our work for us. */ - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - wait_rcu_gp(call_rcu); - return; - } - } - if (rcu_exp_gp_seq_done(rsp, s)) - goto unlock_mb_ret; /* Others did our work for us. */ + rnp_unlock = exp_funnel_lock(rsp, s); + if (rnp_unlock == NULL) + return; /* Someone else did our work for us. */ + rcu_exp_gp_seq_start(rsp); /* force all RCU readers onto ->blkd_tasks lists. */ @@ -748,9 +734,7 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); -unlock_mb_ret: - mutex_unlock(&sync_rcu_preempt_exp_mutex); -mb_ret: + mutex_unlock(&rnp_unlock->exp_funnel_mutex); smp_mb(); /* ensure subsequent action seen after grace period. */ } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -- cgit v1.2.3 From 4f525a528b9e75571c6bedc6202beff1ced24c32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 26 Jun 2015 11:20:00 -0700 Subject: rcu: Apply rcu_seq operations to _rcu_barrier() The rcu_seq operations were open-coded in _rcu_barrier(), so this commit replaces the open-coding with the shiny new rcu_seq operations. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 72 +++++++++++++------------------------------------ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_trace.c | 4 +-- 3 files changed, 22 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 338ea61929bd..44245ae4c1c2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3568,10 +3568,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); complete(&rsp->barrier_completion); } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); } } @@ -3583,7 +3583,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -3596,55 +3596,24 @@ static void _rcu_barrier(struct rcu_state *rsp) { int cpu; struct rcu_data *rdp; - unsigned long snap = READ_ONCE(rsp->n_barrier_done); - unsigned long snap_done; + unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Begin", -1, snap); + _rcu_barrier_trace(rsp, "Begin", -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); - /* - * Ensure that all prior references, including to ->n_barrier_done, - * are ordered before the _rcu_barrier() machinery. - */ - smp_mb(); /* See above block comment. */ - - /* - * Recheck ->n_barrier_done to see if others did our work for us. - * This means checking ->n_barrier_done for an even-to-odd-to-even - * transition. The "if" expression below therefore rounds the old - * value up to the next even number and adds two before comparing. - */ - snap_done = rsp->n_barrier_done; - _rcu_barrier_trace(rsp, "Check", -1, snap_done); - - /* - * If the value in snap is odd, we needed to wait for the current - * rcu_barrier() to complete, then wait for the next one, in other - * words, we need the value of snap_done to be three larger than - * the value of snap. On the other hand, if the value in snap is - * even, we only had to wait for the next rcu_barrier() to complete, - * in other words, we need the value of snap_done to be only two - * greater than the value of snap. The "(snap + 3) & ~0x1" computes - * this for us (thank you, Linus!). - */ - if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); + /* Did someone else do our work for us? */ + if (rcu_seq_done(&rsp->barrier_sequence, s)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; } - /* - * Increment ->n_barrier_done to avoid duplicate work. Use - * WRITE_ONCE() to prevent the compiler from speculating - * the increment to precede the early-exit check. - */ - WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); - smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ + /* Mark the start of the barrier operation. */ + rcu_seq_start(&rsp->barrier_sequence); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3668,10 +3637,10 @@ static void _rcu_barrier(struct rcu_state *rsp) if (rcu_is_nocb_cpu(cpu)) { if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); } else { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); __call_rcu(&rdp->barrier_head, @@ -3679,11 +3648,11 @@ static void _rcu_barrier(struct rcu_state *rsp) } } else if (READ_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); } else { _rcu_barrier_trace(rsp, "OnlineNQ", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); } } put_online_cpus(); @@ -3695,16 +3664,13 @@ static void _rcu_barrier(struct rcu_state *rsp) if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rsp->barrier_completion); - /* Increment ->n_barrier_done to prevent duplicate work. */ - smp_mb(); /* Keep increment after above mechanism. */ - WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); - smp_mb(); /* Keep increment before caller's subsequent code. */ - /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ wait_for_completion(&rsp->barrier_completion); + /* Mark the end of the barrier operation. */ + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); + rcu_seq_end(&rsp->barrier_sequence); + /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rsp->barrier_mutex); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4edc277d08eb..5c1042d9c310 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -486,7 +486,7 @@ struct rcu_state { struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ - unsigned long n_barrier_done; /* ++ at start and end of */ + unsigned long barrier_sequence; /* ++ at start and end of */ /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 36c04b46d3b8..d9982a2ce305 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v) static int show_rcubarrier(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "bcc: %d nbd: %lu\n", + seq_printf(m, "bcc: %d bseq: %lu\n", atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); + rsp->barrier_sequence); return 0; } -- cgit v1.2.3 From 704dd435ac7eaefa89fcd82fd2876b8330e00ff3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 27 Jun 2015 09:36:29 -0700 Subject: rcu: Consolidate last open-coded expedited memory barrier One of the requirements on RCU grace periods is that if there is a causal chain of operations that starts after one grace period and ends before another grace period, then the two grace periods must be serialized. There has been (and might still be) code that relies on this, for example, certain types of reference-counting code that does a call_rcu() within an RCU callback function. This requirement is why there is an smp_mb() at the end of both synchronize_sched_expedited() and synchronize_rcu_expedited(). However, this is the only smp_mb() in these functions, so it would be nicer to consolidate it into rcu_exp_gp_seq_end(). This commit does just that. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 44245ae4c1c2..a905d3ba8673 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3299,6 +3299,7 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) static void rcu_exp_gp_seq_end(struct rcu_state *rsp) { rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { @@ -3431,7 +3432,6 @@ void synchronize_sched_expedited(void) rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp->exp_funnel_mutex); - smp_mb(); /* ensure subsequent action seen after grace period. */ put_online_cpus(); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f0d71449ec0c..27b714601c6e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -735,7 +735,6 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp_unlock->exp_funnel_mutex); - smp_mb(); /* ensure subsequent action seen after grace period. */ } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -- cgit v1.2.3 From 2cd6ffafec066118365f6d7eb7a42ea16c1f032c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Jun 2015 17:06:39 -0700 Subject: rcu: Extend expedited funnel locking to rcu_data structure The strictly rcu_node based funnel-locking scheme works well in many cases, but systems with CONFIG_RCU_FANOUT_LEAF=64 won't necessarily get all that much concurrency. This commit therefore extends the funnel locking into the per-CPU rcu_data structure, providing concurrency equal to the number of CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 19 ++++++++++++++++--- kernel/rcu/tree.h | 4 +++- kernel/rcu/tree_trace.c | 3 ++- 3 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a905d3ba8673..e45097fc39fa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3312,11 +3312,14 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp, atomic_long_t *stat, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { if (rnp) mutex_unlock(&rnp->exp_funnel_mutex); + else if (rdp) + mutex_unlock(&rdp->exp_funnel_mutex); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3332,6 +3335,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, */ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { + struct rcu_data *rdp; struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; @@ -3343,16 +3347,24 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness. */ - rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + return NULL; + mutex_lock(&rdp->exp_funnel_mutex); + rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, rnp1, rdp, + &rsp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) mutex_unlock(&rnp1->exp_funnel_mutex); + else + mutex_unlock(&rdp->exp_funnel_mutex); rnp1 = rnp0; } - if (sync_exp_work_done(rsp, rnp1, &rsp->expedited_workdone2, s)) + if (sync_exp_work_done(rsp, rnp1, rdp, + &rsp->expedited_workdone3, s)) return NULL; return rnp1; } @@ -3733,6 +3745,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; + mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5c1042d9c310..efee84ce1e08 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -364,11 +364,12 @@ struct rcu_data { unsigned long n_rp_nocb_defer_wakeup; unsigned long n_rp_need_nothing; - /* 6) _rcu_barrier() and OOM callbacks. */ + /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + struct mutex exp_funnel_mutex; /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -494,6 +495,7 @@ struct rcu_state { atomic_long_t expedited_tryfail; /* # acquisition failures. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d9982a2ce305..ec62369f1b02 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,11 +185,12 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu enq=%d sc=%lu\n", + seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, atomic_long_read(&rsp->expedited_tryfail), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), + atomic_long_read(&rsp->expedited_workdone3), atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); -- cgit v1.2.3 From cf3620a6c7798be3395163d3bb863ab378a6aa80 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2015 11:14:32 -0700 Subject: rcu: Add stall warnings to synchronize_sched_expedited() Although synchronize_sched_expedited() historically has no RCU CPU stall warnings, the availability of the rcupdate.rcu_expedited boot parameter invalidates the old assumption that synchronize_sched()'s stall warnings would suffice. This commit therefore adds RCU CPU stall warnings to synchronize_sched_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++---- kernel/rcu/tree.h | 1 + 2 files changed, 55 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e45097fc39fa..4b6594c7db58 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3369,16 +3369,65 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) return rnp1; } +/* Invoked on each online non-idle CPU for expedited quiescent state. */ static int synchronize_sched_expedited_cpu_stop(void *data) { - struct rcu_state *rsp = data; + struct rcu_data *rdp = data; + struct rcu_state *rsp = rdp->rsp; /* We are here: If we are last, do the wakeup. */ + rdp->exp_done = true; if (atomic_dec_and_test(&rsp->expedited_need_qs)) wake_up(&rsp->expedited_wq); return 0; } +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + struct rcu_data *rdp; + int ret; + + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = wait_event_interruptible_timeout( + rsp->expedited_wq, + !atomic_read(&rsp->expedited_need_qs), + jiffies_stall); + if (ret > 0) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. */ + wait_event(rsp->expedited_wq, + !atomic_read(&rsp->expedited_need_qs)); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs: {", + rsp->name); + for_each_online_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + + if (rdp->exp_done) + continue; + pr_cont(" %d", cpu); + } + pr_cont(" } %lu jiffies s: %lu\n", + jiffies - jiffies_start, rsp->expedited_sequence); + for_each_online_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + + if (rdp->exp_done) + continue; + dump_cpu_task(cpu); + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3428,19 +3477,20 @@ void synchronize_sched_expedited(void) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + rdp->exp_done = false; + /* Skip our CPU and any idle CPUs. */ if (raw_smp_processor_id() == cpu || !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) continue; atomic_inc(&rsp->expedited_need_qs); stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, - rsp, &rdp->exp_stop_work); + rdp, &rdp->exp_stop_work); } /* Remove extra count and, if necessary, wait for CPUs to stop. */ if (!atomic_dec_and_test(&rsp->expedited_need_qs)) - wait_event(rsp->expedited_wq, - !atomic_read(&rsp->expedited_need_qs)); + synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp->exp_funnel_mutex); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index efee84ce1e08..b3ae8d3cffbc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -370,6 +370,7 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; + bool exp_done; /* Expedited QS for this CPU? */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU -- cgit v1.2.3 From b9a425cfcb3c473b4ca2f3dfaeaf13848f4a7976 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2015 13:50:28 -0700 Subject: rcu: Pull out wait_event*() condition into helper function The condition for the wait_event_interruptible_timeout() that waits to do the next force-quiescent-state scan is a bit ornate: ((gf = READ_ONCE(rsp->gp_flags)) & RCU_GP_FLAG_FQS) || (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) This commit therefore pulls this condition out into a helper function and comments its component conditions. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4b6594c7db58..b2803730ac13 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1903,6 +1903,26 @@ static int rcu_gp_init(struct rcu_state *rsp) return 1; } +/* + * Helper function for wait_event_interruptible_timeout() wakeup + * at force-quiescent-state time. + */ +static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Someone like call_rcu() requested a force-quiescent-state scan. */ + *gfp = READ_ONCE(rsp->gp_flags); + if (*gfp & RCU_GP_FLAG_FQS) + return true; + + /* The current grace period has completed. */ + if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) + return true; + + return false; +} + /* * Do one round of quiescent-state forcing. */ @@ -2067,11 +2087,7 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; ret = wait_event_interruptible_timeout(rsp->gp_wq, - ((gf = READ_ONCE(rsp->gp_flags)) & - RCU_GP_FLAG_FQS) || - (!READ_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)), - j); + rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DONE_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ -- cgit v1.2.3 From 32bb1c79996069ef9e4e53b428050749f9841c3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jul 2015 12:27:31 -0700 Subject: rcu: Rename RCU_GP_DONE_FQS to RCU_GP_DOING_FQS The grace-period kthread sleeps waiting to do a force-quiescent-state scan, and when awakened sets rsp->gp_state to RCU_GP_DONE_FQS. However, this is confusing because the kthread has not done the force-quiescent-state, but is instead just starting to do it. This commit therefore renames RCU_GP_DONE_FQS to RCU_GP_DOING_FQS in order to make things a bit easier on reviewers. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b2803730ac13..f66f6e7730bc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2088,7 +2088,7 @@ static int __noreturn rcu_gp_kthread(void *arg) rsp->gp_state = RCU_GP_WAIT_FQS; ret = wait_event_interruptible_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); - rsp->gp_state = RCU_GP_DONE_FQS; + rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!READ_ONCE(rnp->qsmask) && diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b3ae8d3cffbc..543ba726396c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -535,7 +535,7 @@ struct rcu_state { #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ #define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ -#define RCU_GP_DONE_FQS 4 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */ #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ -- cgit v1.2.3 From cdacbe1f91264687af956e810278030f2ab5a3d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 11 Jul 2015 16:24:45 -0700 Subject: rcu: Add fastpath bypassing funnel locking In the common case, there will be only one expedited grace period in the system at a given time, in which case it is not helpful to use funnel locking. This commit therefore adds a fastpath that bypasses funnel locking when the root ->exp_funnel_mutex is not held. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 ++++++++++++++++ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_trace.c | 4 ++-- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f66f6e7730bc..3af0dee2d045 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3355,6 +3355,22 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; + /* + * First try directly acquiring the root lock in order to reduce + * latency in the common case where expedited grace periods are + * rare. We check mutex_is_locked() to avoid pathological levels of + * memory contention on ->exp_funnel_mutex in the heavy-load case. + */ + rnp0 = rcu_get_root(rsp); + if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { + if (mutex_trylock(&rnp0->exp_funnel_mutex)) { + if (sync_exp_work_done(rsp, rnp0, NULL, + &rsp->expedited_workdone0, s)) + return NULL; + return rnp0; + } + } + /* * Each pass through the following loop works its way * up the rcu_node tree, returning if others have done the diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 543ba726396c..80d974df0ea0 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -493,7 +493,7 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_tryfail; /* # acquisition failures. */ + atomic_long_t expedited_workdone0; /* # done by others #0. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ atomic_long_t expedited_workdone3; /* # done by others #3. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ec62369f1b02..6fc4c5ff3bb5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,9 +185,9 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, - atomic_long_read(&rsp->expedited_tryfail), + atomic_long_read(&rsp->expedited_workdone0), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), atomic_long_read(&rsp->expedited_workdone3), -- cgit v1.2.3 From 63caae8480921773b46adec0b6ddac9a844a042f Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 20 Jul 2015 18:34:50 +0200 Subject: sched/idle: Move latency tracing stop/start calls deeper inside the idle loop Make sure to stop tracing only once we are past a point where all latency tracing events have been processed (irqs are not enabled again). This has the slight advantage of capturing more latency related events in the idle path, but most importantly it makes sure that latency tracing doesn't get re-enabled inadvertently when new events are coming in. This makes the irqsoff latency tracer useful again, as we stop capturing CPU sleep time as IRQ latency. Signed-off-by: Lucas Stach Cc: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Steven Rostedt Cc: Thomas Gleixner Cc: kernel@pengutronix.de Cc: patchwork-lst@pengutronix.de Link: http://lkml.kernel.org/r/1437410090-3747-1-git-send-email-l.stach@pengutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 594275ed2620..8f177c73ae19 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -83,10 +83,13 @@ void __weak arch_cpu_idle(void) */ void default_idle_call(void) { - if (current_clr_polling_and_test()) + if (current_clr_polling_and_test()) { local_irq_enable(); - else + } else { + stop_critical_timings(); arch_cpu_idle(); + start_critical_timings(); + } } static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, @@ -140,12 +143,6 @@ static void cpuidle_idle_call(void) return; } - /* - * During the idle period, stop measuring the disabled irqs - * critical sections latencies - */ - stop_critical_timings(); - /* * Tell the RCU framework we are entering an idle section, * so no more rcu read side critical sections and one more @@ -198,7 +195,6 @@ exit_idle: local_irq_enable(); rcu_idle_exit(); - start_critical_timings(); } DEFINE_PER_CPU(bool, cpu_dead_idle); -- cgit v1.2.3 From aa48b6f708868ab9c22ca737f27a0da832bf7f08 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 9 Jul 2015 16:00:47 +0800 Subject: genirq/MSI: Move alloc_msi_entry() from PCI into generic MSI code Move alloc_msi_entry() from PCI MSI code into generic MSI code, so it can be reused by other generic MSI drivers. Also introduce free_msi_entry() for completeness. Suggested-by: Stuart Yoder . Signed-off-by: Jiang Liu Reviewed-by: Marc Zyngier Reviewed-by: Yijing Wang Acked-by: Bjorn Helgaas Cc: Tony Luck Cc: linux-arm-kernel@lists.infradead.org Cc: Grant Likely Cc: Borislav Petkov Cc: Alexander Gordeev Link: http://lkml.kernel.org/r/1436428847-8886-13-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/msi.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7bf1f1bbb7fa..7e6512b9dc1f 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -18,6 +18,23 @@ /* Temparory solution for building, will be removed later */ #include +struct msi_desc *alloc_msi_entry(struct device *dev) +{ + struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return NULL; + + INIT_LIST_HEAD(&desc->list); + desc->dev = dev; + + return desc; +} + +void free_msi_entry(struct msi_desc *entry) +{ + kfree(entry); +} + void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { *msg = entry->msg; -- cgit v1.2.3 From 24560056de61d86153cecb84d04e4237437f5888 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 May 2015 10:11:24 -0700 Subject: rcu: Add RCU-sched flavors of get-state and cond-sync The get_state_synchronize_rcu() and cond_synchronize_rcu() functions allow polling for grace-period completion, with an actual wait for a grace period occurring only when cond_synchronize_rcu() is called too soon after the corresponding get_state_synchronize_rcu(). However, these functions work only for vanilla RCU. This commit adds the get_state_synchronize_sched() and cond_synchronize_sched(), which provide the same capability for RCU-sched. Reported-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 ++ kernel/rcu/tree.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59e32684c23b..0f2cb55f0ab3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = { .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, + .get_state = get_state_synchronize_sched, + .cond_sync = cond_synchronize_sched, .call = call_rcu_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8b5dd8ba9495..9629298eea24 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3253,6 +3253,58 @@ void cond_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); +/** + * get_state_synchronize_sched - Snapshot current RCU-sched state + * + * Returns a cookie that is used by a later call to cond_synchronize_sched() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_sched(void) +{ + /* + * Any prior manipulation of RCU-protected data must happen + * before the load from ->gpnum. + */ + smp_mb(); /* ^^^ */ + + /* + * Make sure this load happens before the purportedly + * time-consuming work between get_state_synchronize_sched() + * and cond_synchronize_sched(). + */ + return smp_load_acquire(&rcu_sched_state.gpnum); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_sched); + +/** + * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period + * + * @oldstate: return value from earlier call to get_state_synchronize_sched() + * + * If a full RCU-sched grace period has elapsed since the earlier call to + * get_state_synchronize_sched(), just return. Otherwise, invoke + * synchronize_sched() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. But + * counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. + */ +void cond_synchronize_sched(unsigned long oldstate) +{ + unsigned long newstate; + + /* + * Ensure that this load happens before any RCU-destructive + * actions the caller might carry out after we return. + */ + newstate = smp_load_acquire(&rcu_sched_state.completed); + if (ULONG_CMP_GE(oldstate, newstate)) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_sched); + static int synchronize_sched_expedited_cpu_stop(void *data) { /* -- cgit v1.2.3 From bc17ea1092c48227334a311a130c1a41966333fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 6 Jun 2015 08:11:43 -0700 Subject: rcu: Fix obsolete priority-boosting comment Tasks are no longer migrated to the root rcu_node, so there is no longer any need for a boost kthread for the root rcu_node, and there no longer is such a kthread. This commit therefore fixes the comment in rcu_boost_kthread()'s header to reflect this new reality. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 013485fb2b06..a983bc68a146 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1061,8 +1061,7 @@ static int rcu_boost(struct rcu_node *rnp) } /* - * Priority-boosting kthread. One per leaf rcu_node and one for the - * root rcu_node. + * Priority-boosting kthread, one per leaf rcu_node. */ static int rcu_boost_kthread(void *arg) { -- cgit v1.2.3 From ec90a194ae2cb8b8e9fe4f6f70dd3d4dc0269b4b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jun 2015 12:53:06 -0700 Subject: rcu: Create a synchronize_rcu_mult() There have been several requests for a primitive that waits for grace periods for several RCU flavors concurrently, so this commit creates it. This is a variadic macro, and you pass in the call_rcu() functions of the flavors of RCU that you wish to wait for. Note that you cannot pass in call_srcu() for two reasons: (1) This would result in a type mismatch and (2) You need to specify which srcu_struct you want to use. Handle this by creating a wrapper function for your SRCU domain, for example: void call_srcu_mine(struct rcu_head *head, rcu_callback_t func) { call_srcu(&ss_mine, head, func); } You can then do something like this: synchronize_rcu_mult(call_srcu_mine, call_rcu, call_rcu_sched); Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fec5f48b8860..a0a0dd03c73a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -318,20 +318,37 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } +EXPORT_SYMBOL_GPL(wakeme_after_rcu); -void wait_rcu_gp(call_rcu_func_t crf) +void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, + struct rcu_synchronize *rs_array) { - struct rcu_synchronize rcu; + int i; - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - crf(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + /* Initialize and register callbacks for each flavor specified. */ + for (i = 0; i < n; i++) { + if (checktiny && + (crcu_array[i] == call_rcu || + crcu_array[i] == call_rcu_bh)) { + might_sleep(); + continue; + } + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); + (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } + + /* Wait for all callbacks to be invoked. */ + for (i = 0; i < n; i++) { + if (checktiny && + (crcu_array[i] == call_rcu || + crcu_array[i] == call_rcu_bh)) + continue; + wait_for_completion(&rs_array[i].completion); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } -EXPORT_SYMBOL_GPL(wait_rcu_gp); +EXPORT_SYMBOL_GPL(__wait_rcu_gp); #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head) -- cgit v1.2.3 From 779de6ce54f627f955d4a3d0c5b3dcfaab74fea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jun 2015 13:34:41 -0700 Subject: cpu: Wait for RCU grace periods concurrently In kernels built with CONFIG_PREEMPT, _cpu_down() waits for RCU and RCU-sched grace periods back-to-back, incurring quite a bit more latency than required. This commit therefore uses the new synchronize_rcu_mult() to allow waiting for both grace periods concurrently. Signed-off-by: Paul E. McKenney --- kernel/cpu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9c9c9fab16cc..d63b062b6267 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -380,14 +380,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * will observe it. * * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so explicitly call both. + * not imply sync_sched(), so wait for both. * * Do sync before park smpboot threads to take care the rcu boost case. */ -#ifdef CONFIG_PREEMPT - synchronize_sched(); -#endif - synchronize_rcu(); + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); smpboot_park_threads(cpu); -- cgit v1.2.3 From 46f00d18fca42cc954c2e9e99a48b6f3a7741ed7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 16 Jun 2015 10:35:18 -0700 Subject: rcu: Make rcu_is_watching() really notrace Although rcu_is_watching() is marked notrace, it invokes preempt_disable() and preempt_enable(), both of which can be traced. This defeats the purpose of the notrace on rcu_is_watching(), so this commit substitutes preempt_disable_notrace() and preempt_enable_notrace(). Signed-off-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9629298eea24..cb64d7e13d24 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -978,9 +978,9 @@ bool notrace rcu_is_watching(void) { bool ret; - preempt_disable(); + preempt_disable_notrace(); ret = __rcu_is_watching(); - preempt_enable(); + preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_is_watching); -- cgit v1.2.3 From f78f5b90c4ffa559e400c3919a02236101f29f3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Jun 2015 15:50:02 -0700 Subject: rcu: Rename rcu_lockdep_assert() to RCU_LOCKDEP_WARN() This commit renames rcu_lockdep_assert() to RCU_LOCKDEP_WARN() for consistency with the WARN() series of macros. This also requires inverting the sense of the conditional, which this commit also does. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Reviewed-by: Ingo Molnar --- kernel/cgroup.c | 4 ++-- kernel/pid.c | 5 ++--- kernel/rcu/srcu.c | 10 +++++----- kernel/rcu/tiny.c | 8 ++++---- kernel/rcu/tree.c | 28 ++++++++++++++-------------- kernel/rcu/tree_plugin.h | 8 ++++---- kernel/rcu/update.c | 4 ++-- kernel/sched/core.c | 8 ++++---- kernel/workqueue.c | 20 ++++++++++---------- 9 files changed, 47 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f89d9292eee6..b89f3168411b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -107,8 +107,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock); struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ - rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&cgroup_mutex), \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* diff --git a/kernel/pid.c b/kernel/pid.c index 4fd07d5b7baf..ca368793808e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { - rcu_lockdep_assert(rcu_read_lock_held(), - "find_task_by_pid_ns() needs rcu_read_lock()" - " protection"); + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "find_task_by_pid_ns() needs rcu_read_lock() protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index de35087c92a5..d3fcb2ec8536 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -415,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) struct rcu_head *head = &rcu.head; bool done = false; - rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && - !lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); might_sleep(); init_completion(&rcu.completion); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c291bd65d2cb..d0471056d0af 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ void synchronize_sched(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU read-side critical section"); cond_resched(); } EXPORT_SYMBOL_GPL(synchronize_sched); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb64d7e13d24..0a73d26357a2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -649,12 +649,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user) * It is illegal to enter an extended quiescent state while * in an RCU read-side critical section. */ - rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), - "Illegal idle entry in RCU read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), - "Illegal idle entry in RCU-bh read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), - "Illegal idle entry in RCU-sched read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), + "Illegal idle entry in RCU read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), + "Illegal idle entry in RCU-bh read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), + "Illegal idle entry in RCU-sched read-side critical section."); } /* @@ -3161,10 +3161,10 @@ static inline int rcu_blocking_is_gp(void) */ void synchronize_sched(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU-sched read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; if (rcu_gp_is_expedited()) @@ -3188,10 +3188,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; if (rcu_gp_is_expedited()) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a983bc68a146..9e922f111d63 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -538,10 +538,10 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; if (rcu_gp_is_expedited()) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a0a0dd03c73a..47268fb1d27b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -589,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); void synchronize_rcu_tasks(void) { /* Complain if the scheduler has not started. */ - rcu_lockdep_assert(!rcu_scheduler_active, - "synchronize_rcu_tasks called too soon"); + RCU_LOCKDEP_WARN(rcu_scheduler_active, + "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ wait_rcu_gp(call_rcu_tasks); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78b4bad10081..5e73c79fadd0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2200,8 +2200,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -2210,8 +2210,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4c4f06176f74..cb91c63b4f4a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #include #define assert_rcu_or_pool_mutex() \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq_pool_mutex), \ + "sched RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq->mutex), \ + "sched RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq->mutex) || \ - lockdep_is_held(&wq_pool_mutex), \ - "sched RCU, wq->mutex or wq_pool_mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq->mutex) && \ + !lockdep_is_held(&wq_pool_mutex), \ + "sched RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ -- cgit v1.2.3 From a76a9a485d730024a7cbd76efcd9c6eb46003829 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2015 08:17:40 -0700 Subject: rcu: Fix backwards RCU_LOCKDEP_WARN() in synchronize_rcu_tasks() The RCU_LOCKDEP_WARN() in synchronize_rcu_tasks() triggers if the scheduler is active, which is backwards. This commit therefore negates the test. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 47268fb1d27b..7a0b3bc7c5ed 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -589,7 +589,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); void synchronize_rcu_tasks(void) { /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active, + RCU_LOCKDEP_WARN(!rcu_scheduler_active, "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ -- cgit v1.2.3 From 9a54f98e341d09793247a6e598012edefb5ae7cb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Jul 2015 16:24:14 -0700 Subject: rcu: Don't disable CPU hotplug during OOM notifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RCU's rcu_oom_notify() disables CPU hotplug in order to stabilize the list of online CPUs, which it traverses. However, this is completely pointless because smp_call_function_single() will quietly fail if invoked on an offline CPU. Because the count of requests is incremented in the rcu_oom_notify_cpu() function that is remotely invoked, everything works nicely even in the face of concurrent CPU-hotplug operations. Furthermore, in recent kernels, invoking get_online_cpus() from an OOM notifier can result in deadlock. This commit therefore removes the call to get_online_cpus() and put_online_cpus() from rcu_oom_notify(). Reported-by: Marcin Ślusarz Reported-by: David Rientjes Signed-off-by: Paul E. McKenney Acked-by: David Rientjes Tested-by: Marcin Ślusarz --- kernel/rcu/tree_plugin.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9e922f111d63..80a7c17907fe 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1679,12 +1679,10 @@ static int rcu_oom_notify(struct notifier_block *self, */ atomic_set(&oom_callback_count, 1); - get_online_cpus(); for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); cond_resched_rcu_qs(); } - put_online_cpus(); /* Unconditionally decrement: no need to wake ourselves up. */ atomic_dec(&oom_callback_count); -- cgit v1.2.3 From 45ac1403f564f411c6a383a2448688ba8dd705a4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 21 Jul 2015 12:44:02 +0300 Subject: perf: Add PERF_RECORD_SWITCH to indicate context switches There are already two events for context switches, namely the tracepoint sched:sched_switch and the software event context_switches. Unfortunately neither are suitable for use by non-privileged users for the purpose of synchronizing hardware trace data (e.g. Intel PT) to the context switch. Tracepoints are no good at all for non-privileged users because they need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= -1. On the other hand, kernel software events need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1. Now many distributions do default perf_event_paranoid to 1 making context_switches a contender, except it has another problem (which is also shared with sched:sched_switch) which is that it happens before perf schedules events out instead of after perf schedules events in. Whereas a privileged user can see all the events anyway, a non-privileged user only sees events for their own processes, in other words they see when their process was scheduled out not when it was scheduled in. That presents two problems to use the event: 1. the information comes too late, so tools have to look ahead in the event stream to find out what the current state is 2. if they are unlucky tracing might have stopped before the context-switches event is recorded. This new PERF_RECORD_SWITCH event does not have those problems and it also has a couple of other small advantages. It is easier to use because it is an auxiliary event (like mmap, comm and task events) which can be enabled by setting a single bit. It is smaller than sched:sched_switch and easier to parse. To make the event useful for privileged users also, if the context is cpu-wide then the event record will be PERF_RECORD_SWITCH_CPU_WIDE which is the same as PERF_RECORD_SWITCH except it also provides the next or previous pid/tid. Signed-off-by: Adrian Hunter Acked-by: Peter Zijlstra (Intel) Tested-by: Jiri Olsa Cc: Andi Kleen Cc: Mathieu Poirier Cc: Pawel Moll Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1437471846-26995-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d3dae3419b99..ce21143c0d9e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; +static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev, local_irq_restore(flags); } +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in); + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task, if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, prev, true); + if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } @@ -3454,6 +3464,10 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.freq) atomic_dec(&nr_freq_events); + if (event->attr.context_switch) { + static_key_slow_dec_deferred(&perf_sched_events); + atomic_dec(&nr_switch_events); + } if (is_cgroup_event(event)) static_key_slow_dec_deferred(&perf_sched_events); if (has_branch_stack(event)) @@ -5981,6 +5995,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_output_end(&handle); } +/* + * context_switch tracking + */ + +struct perf_switch_event { + struct task_struct *task; + struct task_struct *next_prev; + + struct { + struct perf_event_header header; + u32 next_prev_pid; + u32 next_prev_tid; + } event_id; +}; + +static int perf_event_switch_match(struct perf_event *event) +{ + return event->attr.context_switch; +} + +static void perf_event_switch_output(struct perf_event *event, void *data) +{ + struct perf_switch_event *se = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_switch_match(event)) + return; + + /* Only CPU-wide events are allowed to see next/prev pid/tid */ + if (event->ctx->task) { + se->event_id.header.type = PERF_RECORD_SWITCH; + se->event_id.header.size = sizeof(se->event_id.header); + } else { + se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; + se->event_id.header.size = sizeof(se->event_id); + se->event_id.next_prev_pid = + perf_event_pid(event, se->next_prev); + se->event_id.next_prev_tid = + perf_event_tid(event, se->next_prev); + } + + perf_event_header__init_id(&se->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, se->event_id.header.size); + if (ret) + return; + + if (event->ctx->task) + perf_output_put(&handle, se->event_id.header); + else + perf_output_put(&handle, se->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in) +{ + struct perf_switch_event switch_event; + + /* N.B. caller checks nr_switch_events != 0 */ + + switch_event = (struct perf_switch_event){ + .task = task, + .next_prev = next_prev, + .event_id = { + .header = { + /* .type */ + .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, + /* .size */ + }, + /* .next_prev_pid */ + /* .next_prev_tid */ + }, + }; + + perf_event_aux(perf_event_switch_output, + &switch_event, + NULL); +} + /* * IRQ throttle logging */ @@ -7479,6 +7578,10 @@ static void account_event(struct perf_event *event) if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_full_kick_all(); } + if (event->attr.context_switch) { + atomic_inc(&nr_switch_events); + static_key_slow_inc(&perf_sched_events.key); + } if (has_branch_stack(event)) static_key_slow_inc(&perf_sched_events.key); if (is_cgroup_event(event)) -- cgit v1.2.3 From 1ee4fb3ee1e47f2b3c294ded383a3cd9cc2decd4 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 22 Jul 2015 12:43:04 -0700 Subject: genirq: Export irq_[get|set]_irqchip_state() Export these functions to be able to build the Qualcomm family A PMIC gpio and mpp drivers as modules. [ tglx: Made them GPL exports ] Signed-off-by: Bjorn Andersson Reviewed-by: Mark Brown Cc: Marc Zyngier Cc: Cc: Cc: Srinivas Kandagatla Cc: Linus Walleij Link: http://lkml.kernel.org/r/1437594184-22966-1-git-send-email-bjorn.andersson@sonymobile.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 886f11508c6d..ad1b064f94fe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1877,6 +1877,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, irq_put_desc_busunlock(desc, flags); return err; } +EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** * irq_set_irqchip_state - set the state of a forwarded interrupt. @@ -1922,3 +1923,4 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, irq_put_desc_busunlock(desc, flags); return err; } +EXPORT_SYMBOL_GPL(irq_set_irqchip_state); -- cgit v1.2.3 From be9b22b6a7e6725162c64155a08b71f0654b675c Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Wed, 22 Jul 2015 16:21:39 -0700 Subject: genirq: Add chip_[suspend|resume] PM support to irq_chip Some (admittedly odd) irqchips perform functions that are not directly related to any of their child IRQ lines, and therefore need to perform some tasks during suspend/resume regardless of whether there are any "installed" interrupts for the irqchip. However, the current generic-chip framework does not call the chip's irq_{suspend,resume} when there are no interrupts installed (this makes sense, because there are no irq_data objects for such a call to be made). More specifically, irq-bcm7120-l2 configures both a forwarding mask (which affects other top-level GIC IRQs) and a second-level interrupt mask (for managing its own child interrupts). The former must be saved/restored on suspend/resume, even when there's nothing to do for the latter. This patch adds a new set of suspend/resume hooks to irq_chip_generic, to help represent *chip* suspend/resume, rather than IRQ suspend/resume. These callbacks will always be called for an IRQ chip (regardless of the installed interrupts) and are based on the per-chip irq_chip_generic struct, rather than the per-IRQ irq_data struct. The original problem report is described in extra detail here: http://lkml.kernel.org/g/20150619224123.GL4917@ld-irv-0074 Signed-off-by: Brian Norris Tested-by: Florian Fainelli Cc: Gregory Fong Cc: bcm-kernel-feedback-list@broadcom.com Cc: linux-mips@linux-mips.org Cc: Kevin Cernekee Cc: Jason Cooper Link: http://lkml.kernel.org/r/1437607300-40858-1-git-send-email-computersforpeace@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 15b370daf234..abd286afbd27 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -553,6 +553,9 @@ static int irq_gc_suspend(void) if (data) ct->chip.irq_suspend(data); } + + if (gc->suspend) + gc->suspend(gc); } return 0; } @@ -564,6 +567,9 @@ static void irq_gc_resume(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; + if (gc->resume) + gc->resume(gc); + if (ct->chip.irq_resume) { struct irq_data *data = irq_gc_get_irq_data(gc); -- cgit v1.2.3 From e075867681ca9b8c0b8823e24d0fb4ce3b4f2655 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 10 Oct 2014 02:44:01 +0200 Subject: jiffies: Remove HZ > USEC_PER_SEC special case HZ never goes much further 1000 and a bit. And if we ever reach one tick per microsecond, we might be having a problem. Lets stop maintaining this special case, just leave a paranoid check. Reviewed-by: Rik van Riel Cc: Christoph Lameter Cc: Ingo Molnar Cc; John Stultz Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- kernel/time/time.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 85d5bb1d67eb..ad1bf23e6eb7 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -268,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs); unsigned int jiffies_to_usecs(const unsigned long j) { -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + /* + * Hz usually doesn't go much further MSEC_PER_SEC. + * jiffies_to_usecs() and usecs_to_jiffies() depend on that. + */ + BUILD_BUG_ON(HZ > USEC_PER_SEC); + +#if !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); #else # if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; -- cgit v1.2.3 From 594493594373862ed2a7f91d88a5a2670742faa6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 27 May 2015 15:42:42 +0200 Subject: nohz: Remove idle task special case On nohz full early days, idle dynticks and full dynticks weren't well integrated and we couldn't risk full dynticks calls on idle without risking messing up tick idle statistics. This is why we prevented such thing to happen. Nowadays full dynticks and idle dynticks are better integrated and interact without known issue. So lets remove that. Reviewed-by: Rik van Riel Cc: Christoph Lameter Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c792429e98c6..d6c8eff6e7b4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -208,10 +208,8 @@ void __tick_nohz_full_check(void) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_nohz_full_cpu(smp_processor_id())) { - if (ts->tick_stopped && !is_idle_task(current)) { - if (!can_stop_full_tick()) - tick_nohz_restart_sched_tick(ts, ktime_get()); - } + if (ts->tick_stopped && !can_stop_full_tick()) + tick_nohz_restart_sched_tick(ts, ktime_get()); } } @@ -710,7 +708,7 @@ static void tick_nohz_full_stop_tick(struct tick_sched *ts) #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); - if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) + if (!tick_nohz_full_cpu(cpu)) return; if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) -- cgit v1.2.3 From 73738a95d00467812664b7f86ba3052f5faf96d7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 27 May 2015 19:22:08 +0200 Subject: nohz: Restart nohz full tick from irq exit Restart the tick when necessary from the irq exit path. It makes nohz full more flexible, simplify the related IPIs and doesn't bring significant overhead on irq exit. In a longer term view, it will allow us to piggyback the nohz kick on the scheduler IPI in the future instead of sending a dedicated IPI that often doubles the scheduler IPI on task wakeup. This will require more changes though including careful review of resched_curr() callers to include nohz full needs. Reviewed-by: Rik van Riel Cc: Christoph Lameter Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d6c8eff6e7b4..a06cd4af0ff1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -197,25 +197,9 @@ static bool can_stop_full_tick(void) return true; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); - -/* - * Re-evaluate the need for the tick on the current CPU - * and restart it if necessary. - */ -void __tick_nohz_full_check(void) -{ - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - - if (tick_nohz_full_cpu(smp_processor_id())) { - if (ts->tick_stopped && !can_stop_full_tick()) - tick_nohz_restart_sched_tick(ts, ktime_get()); - } -} - static void nohz_full_kick_work_func(struct irq_work *work) { - __tick_nohz_full_check(); + /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { @@ -250,7 +234,7 @@ void tick_nohz_full_kick_cpu(int cpu) static void nohz_full_kick_ipi(void *info) { - __tick_nohz_full_check(); + /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } /* @@ -703,7 +687,9 @@ out: return tick; } -static void tick_nohz_full_stop_tick(struct tick_sched *ts) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); + +static void tick_nohz_full_update_tick(struct tick_sched *ts) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); @@ -714,10 +700,10 @@ static void tick_nohz_full_stop_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (!can_stop_full_tick()) - return; - - tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + if (can_stop_full_tick()) + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + else if (ts->tick_stopped) + tick_nohz_restart_sched_tick(ts, ktime_get()); #endif } @@ -847,7 +833,7 @@ void tick_nohz_irq_exit(void) if (ts->inidle) __tick_nohz_idle_enter(ts); else - tick_nohz_full_stop_tick(ts); + tick_nohz_full_update_tick(ts); } /** -- cgit v1.2.3 From 59d2c7ca492d7a7093755e4108390e4dac8b6365 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 29 May 2015 14:42:15 +0200 Subject: nohz: Move tick_nohz_restart_sched_tick() above its users Fix the function declaration/definition dance. Reviewed-by: Rik van Riel Cc: Christoph Lameter Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a06cd4af0ff1..6b0d14d4c350 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -687,7 +687,22 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +{ + /* Update jiffies first */ + tick_do_update_jiffies64(now); + update_cpu_load_nohz(); + + calc_load_exit_idle(); + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); +} static void tick_nohz_full_update_tick(struct tick_sched *ts) { @@ -848,23 +863,6 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) -{ - /* Update jiffies first */ - tick_do_update_jiffies64(now); - update_cpu_load_nohz(); - - calc_load_exit_idle(); - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - - tick_nohz_restart(ts, now); -} - static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -- cgit v1.2.3 From de734f89b67c2df30e35a09e7e56a3659e5b6ac6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Jun 2015 18:07:12 +0200 Subject: nohz: Remove useless argument on tick_nohz_task_switch() Leftover from early code. Cc: Christoph Lameter Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- kernel/sched/core.c | 2 +- kernel/time/tick-sched.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78b4bad10081..4d34035bb3ee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2489,7 +2489,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) put_task_struct(prev); } - tick_nohz_task_switch(current); + tick_nohz_task_switch(); return rq; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6b0d14d4c350..3319e16f31e5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -258,7 +258,7 @@ void tick_nohz_full_kick_all(void) * It might need the tick due to per task/process properties: * perf events, posix cpu timers, ... */ -void __tick_nohz_task_switch(struct task_struct *tsk) +void __tick_nohz_task_switch(void) { unsigned long flags; -- cgit v1.2.3 From 8505a81bb036253213b109baf4178ea6861e2888 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 29 Jul 2015 19:09:36 +0900 Subject: genirq: Use the proper parameter name in kernel doc The following warning is emitted for make xmldocs: Warning(.//kernel/irq/chip.c:1009): No description found for parameter 'vcpu_info' Warning(.//kernel/irq/chip.c:1009): Excess function parameter 'dest' description in 'irq_chip_set_vcpu_affinity_parent' Signed-off-by: Masanari Iida Link: http://lkml.kernel.org/r/1438164576-5945-1-git-send-email-standby24x7@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 76f199dc6a5e..09304a653353 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1003,7 +1003,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) /** * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt * @data: Pointer to interrupt specific data - * @dest: The vcpu affinity information + * @vcpu_info: The vcpu affinity information */ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) { -- cgit v1.2.3 From ad3aedfbb04b3a2af54473cfe31f13953cfe9d84 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 28 Jul 2015 14:46:08 +0100 Subject: genirq/irqdomain: Allow irq domain aliasing It is not uncommon (at least with the ARM stuff) to have a piece of hardware that implements different flavours of "interrupts". A typical example of this is the GICv3 ITS, which implements standard PCI/MSI support, but also some form of "generic MSI". So far, the PCI/MSI domain is registered using the ITS device_node, so that irq_find_host can return it. On the contrary, the raw MSI domain is not registered with an device_node, making it impossible to be looked up by another subsystem (obviously, using the same device_node twice would only result in confusion, as it is not defined which one irq_find_host would return). A solution to this is to "type" domains that may be aliasing, and to be able to lookup an device_node that matches a given type. For this, we introduce irq_find_matching_host() as a superset of irq_find_host: struct irq_domain *irq_find_matching_host(struct device_node *node, enum irq_domain_bus_token bus_token); where bus_token is the "type" we want to match the domain against (so far, only DOMAIN_BUS_ANY is defined). This result in some moderately invasive changes on the PPC side (which is the only user of the .match method). This has otherwise no functionnal change. Reviewed-by: Hanjun Guo Signed-off-by: Marc Zyngier Cc: Cc: Yijing Wang Cc: Ma Jun Cc: Lorenzo Pieralisi Cc: Duc Dang Cc: Bjorn Helgaas Cc: Jiang Liu Cc: Jason Cooper Link: http://lkml.kernel.org/r/1438091186-10244-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8c3577fef78c..79baaf8a7813 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -187,10 +187,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** - * irq_find_host() - Locates a domain for a given device node + * irq_find_matching_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller + * @bus_token: domain-specific data */ -struct irq_domain *irq_find_host(struct device_node *node) +struct irq_domain *irq_find_matching_host(struct device_node *node, + enum irq_domain_bus_token bus_token) { struct irq_domain *h, *found = NULL; int rc; @@ -199,13 +201,19 @@ struct irq_domain *irq_find_host(struct device_node *node) * it might potentially be set to match all interrupts in * the absence of a device node. This isn't a problem so far * yet though... + * + * bus_token == DOMAIN_BUS_ANY matches any domain, any other + * values must generate an exact match for the domain to be + * selected. */ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { if (h->ops->match) - rc = h->ops->match(h, node); + rc = h->ops->match(h, node, bus_token); else - rc = (h->of_node != NULL) && (h->of_node == node); + rc = ((h->of_node != NULL) && (h->of_node == node) && + ((bus_token == DOMAIN_BUS_ANY) || + (h->bus_token == bus_token))); if (rc) { found = h; @@ -215,7 +223,7 @@ struct irq_domain *irq_find_host(struct device_node *node) mutex_unlock(&irq_domain_mutex); return found; } -EXPORT_SYMBOL_GPL(irq_find_host); +EXPORT_SYMBOL_GPL(irq_find_matching_host); /** * irq_set_default_host() - Set a "default" irq domain -- cgit v1.2.3 From f231722a2b27ee99cbcd0c6bcf4c866612b78137 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:03 +0200 Subject: uprobes: Introduce get_uprobe() Cosmetic. Add the new trivial helper, get_uprobe(). It matches put_uprobe() we already have and we can simplify a couple of its users. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134003.GA4736@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cb346f26a22d..a9847b4ec1e7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -366,6 +366,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } +static struct uprobe *get_uprobe(struct uprobe *uprobe) +{ + atomic_inc(&uprobe->ref); + return uprobe; +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (atomic_dec_and_test(&uprobe->ref)) + kfree(uprobe); +} + static int match_uprobe(struct uprobe *l, struct uprobe *r) { if (l->inode < r->inode) @@ -393,10 +405,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) while (n) { uprobe = rb_entry(n, struct uprobe, rb_node); match = match_uprobe(&u, uprobe); - if (!match) { - atomic_inc(&uprobe->ref); - return uprobe; - } + if (!match) + return get_uprobe(uprobe); if (match < 0) n = n->rb_left; @@ -432,10 +442,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) parent = *p; u = rb_entry(parent, struct uprobe, rb_node); match = match_uprobe(uprobe, u); - if (!match) { - atomic_inc(&u->ref); - return u; - } + if (!match) + return get_uprobe(u); if (match < 0) p = &parent->rb_left; @@ -472,12 +480,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static void put_uprobe(struct uprobe *uprobe) -{ - if (atomic_dec_and_test(&uprobe->ref)) - kfree(uprobe); -} - static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe, *cur_uprobe; @@ -1039,14 +1041,14 @@ static void build_probe_list(struct inode *inode, if (u->inode != inode || u->offset < min) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } } spin_unlock(&uprobes_treelock); @@ -1437,7 +1439,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return -ENOMEM; *n = *o; - atomic_inc(&n->uprobe->ref); + get_uprobe(n->uprobe); n->next = NULL; *p = n; @@ -1565,8 +1567,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - atomic_inc(&uprobe->ref); - ri->uprobe = uprobe; + ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; -- cgit v1.2.3 From 2bb5e840e873f8778a41801141771f54f547fa65 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:06 +0200 Subject: uprobes: Introduce free_ret_instance() We can simplify uprobe_free_utask() and handle_uretprobe_chain() if we add a simple helper which does put_uprobe/kfree and returns the ->next return_instance. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134006.GA4740@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a9847b4ec1e7..d8c702fc836f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1378,6 +1378,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } +static struct return_instance *free_ret_instance(struct return_instance *ri) +{ + struct return_instance *next = ri->next; + put_uprobe(ri->uprobe); + kfree(ri); + return next; +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. @@ -1385,7 +1393,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; if (!utask) return; @@ -1394,13 +1402,8 @@ void uprobe_free_utask(struct task_struct *t) put_uprobe(utask->active_uprobe); ri = utask->return_instances; - while (ri) { - tmp = ri; - ri = ri->next; - - put_uprobe(tmp->uprobe); - kfree(tmp); - } + while (ri) + ri = free_ret_instance(ri); xol_free_insn_slot(t); kfree(utask); @@ -1770,7 +1773,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) static bool handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; bool chained; utask = current->utask; @@ -1792,11 +1795,7 @@ static bool handle_trampoline(struct pt_regs *regs) handle_uretprobe_chain(ri, regs); chained = ri->chained; - put_uprobe(ri->uprobe); - - tmp = ri; - ri = ri->next; - kfree(tmp); + ri = free_ret_instance(ri); utask->depth--; if (!chained) -- cgit v1.2.3 From 0b5256c7f173258b19d98364adb57f707dda22f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:08 +0200 Subject: uprobes: Send SIGILL if handle_trampoline() fails 1. It doesn't make sense to continue if handle_trampoline() fails, change handle_swbp() to always return after this call. 2. Turn pr_warn() into uprobe_warn(), and change handle_trampoline() to send SIGILL on failure. It is pointless to return to user mode with the corrupted instruction_pointer() which we can't restore. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134008.GA4745@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d8c702fc836f..eabdc21366ee 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1770,7 +1770,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } -static bool handle_trampoline(struct pt_regs *regs) +static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri; @@ -1778,11 +1778,11 @@ static bool handle_trampoline(struct pt_regs *regs) utask = current->utask; if (!utask) - return false; + goto sigill; ri = utask->return_instances; if (!ri) - return false; + goto sigill; /* * TODO: we should throw out return_instance's invalidated by @@ -1804,8 +1804,12 @@ static bool handle_trampoline(struct pt_regs *regs) } utask->return_instances = ri; + return; + + sigill: + uprobe_warn(current, "handle uretprobe, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); - return true; } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -1824,13 +1828,8 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - if (bp_vaddr == get_trampoline_vaddr()) { - if (handle_trampoline(regs)) - return; - - pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); - } + if (bp_vaddr == get_trampoline_vaddr()) + return handle_trampoline(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { -- cgit v1.2.3 From 6c58d0e4cc26ea8882928e64c0de9afed4fc37cb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:10 +0200 Subject: uprobes: Change prepare_uretprobe() to use uprobe_warn() Turn the last pr_warn() in uprobes.c into uprobe_warn(). While at it: - s/kzalloc/kmalloc, we initialize every member of 'ri' - remove the pointless comment above the obvious code Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134010.GA4752@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eabdc21366ee..4c941feae3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1541,9 +1541,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) return; } - ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - goto fail; + return; trampoline_vaddr = get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); @@ -1561,8 +1561,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) * This situation is not possible. Likely we have an * attack from user-space. */ - pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); + uprobe_warn(current, "handle tail call"); goto fail; } @@ -1576,13 +1575,10 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) ri->chained = chained; utask->depth++; - - /* add instance to the stack */ ri->next = utask->return_instances; utask->return_instances = ri; return; - fail: kfree(ri); } -- cgit v1.2.3 From a83cfeb92132c279b20bbc8ed3cef833b0fe417e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:13 +0200 Subject: uprobes: Change handle_trampoline() to find the next chain beforehand No functional changes, preparation. Add the new helper, find_next_ret_chain(), which finds the first !chained entry and returns its ->next. Yes, it is suboptimal. We probably want to turn ->chained into ->start_of_this_chain pointer and avoid another loop. But this needs the boring changes in dup_utask(), so lets do this later. Change the main loop in handle_trampoline() to unwind the stack until ri is equal to the pointer returned by this new helper. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134013.GA4755@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4c941feae3a2..98e4d97b8c31 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1766,11 +1766,22 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } +static struct return_instance *find_next_ret_chain(struct return_instance *ri) +{ + bool chained; + + do { + chained = ri->chained; + ri = ri->next; /* can't be NULL if chained */ + } while (chained); + + return ri; +} + static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri; - bool chained; + struct return_instance *ri, *next; utask = current->utask; if (!utask) @@ -1780,24 +1791,18 @@ static void handle_trampoline(struct pt_regs *regs) if (!ri) goto sigill; + next = find_next_ret_chain(ri); /* * TODO: we should throw out return_instance's invalidated by * longjmp(), currently we assume that the probed function always * returns. */ instruction_pointer_set(regs, ri->orig_ret_vaddr); - - for (;;) { + do { handle_uretprobe_chain(ri, regs); - - chained = ri->chained; ri = free_ret_instance(ri); utask->depth--; - - if (!chained) - break; - BUG_ON(!ri); - } + } while (ri != next); utask->return_instances = ri; return; -- cgit v1.2.3 From 97da89767d398c1dfa1f34e5f312eb8ebb382f7f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:16 +0200 Subject: uprobes: Export 'struct return_instance', introduce arch_uretprobe_is_alive() Add the new "weak" helper, arch_uretprobe_is_alive(), used by the next patches. It should return true if this return_instance is still valid. The arch agnostic version just always returns true. The patch exports "struct return_instance" for the architectures which want to override this hook. We can also cleanup prepare_uretprobe() if we pass the new return_instance to arch_uretprobe_hijack_return_addr(). Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134016.GA4762@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 98e4d97b8c31..1c71b6242a7e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -86,15 +86,6 @@ struct uprobe { struct arch_uprobe arch; }; -struct return_instance { - struct uprobe *uprobe; - unsigned long func; - unsigned long orig_ret_vaddr; /* original return address */ - bool chained; /* true, if instance is nested */ - - struct return_instance *next; /* keep as stack */ -}; - /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -1818,6 +1809,11 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +{ + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. -- cgit v1.2.3 From 7b868e4802a86d867aad1be0471b5767d9c20e10 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:18 +0200 Subject: uprobes/x86: Reimplement arch_uretprobe_is_alive() Add the x86 specific version of arch_uretprobe_is_alive() helper. It returns true if the stack frame mangled by prepare_uretprobe() is still on stack. So if it returns false, we know that the probed function has already returned. We add the new return_instance->stack member and change the generic code to initialize it in prepare_uretprobe, but it should be equally useful for other architectures. TODO: this assumes that the probed application can't use multiple stacks (say sigaltstack). We will try to improve this logic later. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134018.GA4766@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1c71b6242a7e..c5f316e06dc0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1562,6 +1562,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); + ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; -- cgit v1.2.3 From 5eeb50de42fd3251845d03c556db012267c72b3f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:21 +0200 Subject: uprobes: Change handle_trampoline() to flush the frames invalidated by longjmp() Test-case: #include #include jmp_buf jmp; void func_2(void) { longjmp(jmp, 1); } void func_1(void) { if (setjmp(jmp)) return; func_2(); printf("ERR!! I am running on the caller's stack\n"); } int main(void) { func_1(); return 0; } fails if you probe func_1() and func_2() because handle_trampoline() assumes that the probed function should must return and hit the bp installed be prepare_uretprobe(). But in this case func_2() does not return, so when func_1() returns the kernel uses the no longer valid return_instance of func_2(). Change handle_trampoline() to unwind ->return_instances until we know that the next chain is alive or NULL, this ensures that the current chain is the last we need to report and free. Alternatively, every return_instance could use unique trampoline_vaddr, in this case we could use it as a key. And this could solve the problem with sigaltstack() automatically. But this approach needs more changes, and it puts the "hard" limit on MAX_URETPROBE_DEPTH. Plus it can not solve another problem partially fixed by the next patch. Note: this change has no effect on !x86, the arch-agnostic version of arch_uretprobe_is_alive() just returns "true". TODO: as documented by the previous change, arch_uretprobe_is_alive() can be fooled by sigaltstack/etc. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134021.GA4773@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5f316e06dc0..93d939c80cd9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1774,6 +1774,7 @@ static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *next; + bool valid; utask = current->utask; if (!utask) @@ -1783,18 +1784,24 @@ static void handle_trampoline(struct pt_regs *regs) if (!ri) goto sigill; - next = find_next_ret_chain(ri); - /* - * TODO: we should throw out return_instance's invalidated by - * longjmp(), currently we assume that the probed function always - * returns. - */ - instruction_pointer_set(regs, ri->orig_ret_vaddr); do { - handle_uretprobe_chain(ri, regs); - ri = free_ret_instance(ri); - utask->depth--; - } while (ri != next); + /* + * We should throw out the frames invalidated by longjmp(). + * If this chain is valid, then the next one should be alive + * or NULL; the latter case means that nobody but ri->func + * could hit this trampoline on return. TODO: sigaltstack(). + */ + next = find_next_ret_chain(ri); + valid = !next || arch_uretprobe_is_alive(next, regs); + + instruction_pointer_set(regs, ri->orig_ret_vaddr); + do { + if (valid) + handle_uretprobe_chain(ri, regs); + ri = free_ret_instance(ri); + utask->depth--; + } while (ri != next); + } while (!valid); utask->return_instances = ri; return; -- cgit v1.2.3 From a5b7e1a89b820f2b9b23634ca4c59b555e8d9a0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:23 +0200 Subject: uprobes: Change prepare_uretprobe() to (try to) flush the dead frames Change prepare_uretprobe() to flush the !arch_uretprobe_is_alive() return_instance's. This is not needed correctness-wise, but can help to avoid the failure caused by MAX_URETPROBE_DEPTH. Note: in this case arch_uretprobe_is_alive() can be false positive, the stack can grow after longjmp(). Unfortunately, the kernel can't 100% solve this problem, but see the next patch. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134023.GA4776@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 93d939c80cd9..7e61c8ca27e0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1511,6 +1511,16 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } +static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) +{ + struct return_instance *ri = utask->return_instances; + while (ri && !arch_uretprobe_is_alive(ri, regs)) { + ri = free_ret_instance(ri); + utask->depth--; + } + utask->return_instances = ri; +} + static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) { struct return_instance *ri; @@ -1541,6 +1551,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) if (orig_ret_vaddr == -1) goto fail; + /* drop the entries invalidated by longjmp() */ + cleanup_return_instances(utask, regs); + /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent -- cgit v1.2.3 From 86dcb702e74b8ab7d3b2d36984ef00671cea73b9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:26 +0200 Subject: uprobes: Add the "enum rp_check ctx" arg to arch_uretprobe_is_alive() arch/x86 doesn't care (so far), but as Pratyush Anand pointed out other architectures might want why arch_uretprobe_is_alive() was called and use different checks depending on the context. Add the new argument to distinguish 2 callers. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134026.GA4779@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7e61c8ca27e0..df5661a44e35 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1514,7 +1514,9 @@ static unsigned long get_trampoline_vaddr(void) static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) { struct return_instance *ri = utask->return_instances; - while (ri && !arch_uretprobe_is_alive(ri, regs)) { + enum rp_check ctx = RP_CHECK_CALL; + + while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri = free_ret_instance(ri); utask->depth--; } @@ -1805,7 +1807,7 @@ static void handle_trampoline(struct pt_regs *regs) * could hit this trampoline on return. TODO: sigaltstack(). */ next = find_next_ret_chain(ri); - valid = !next || arch_uretprobe_is_alive(next, regs); + valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { @@ -1830,7 +1832,8 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } -bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) { return true; } -- cgit v1.2.3 From db087ef69a2b155ae001665bf0b3806abde7ee34 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:28 +0200 Subject: uprobes/x86: Make arch_uretprobe_is_alive(RP_CHECK_CALL) more clever The previous change documents that cleanup_return_instances() can't always detect the dead frames, the stack can grow. But there is one special case which imho worth fixing: arch_uretprobe_is_alive() can return true when the stack didn't actually grow, but the next "call" insn uses the already invalidated frame. Test-case: #include #include jmp_buf jmp; int nr = 1024; void func_2(void) { if (--nr == 0) return; longjmp(jmp, 1); } void func_1(void) { setjmp(jmp); func_2(); } int main(void) { func_1(); return 0; } If you ret-probe func_1() and func_2() prepare_uretprobe() hits the MAX_URETPROBE_DEPTH limit and "return" from func_2() is not reported. When we know that the new call is not chained, we can do the more strict check. In this case "sp" points to the new ret-addr, so every frame which uses the same "sp" must be dead. The only complication is that arch_uretprobe_is_alive() needs to know was it chained or not, so we add the new RP_CHECK_CHAIN_CALL enum and change prepare_uretprobe() to pass RP_CHECK_CALL only if !chained. Note: arch_uretprobe_is_alive() could also re-read *sp and check if this word is still trampoline_vaddr. This could obviously improve the logic, but I would like to avoid another copy_from_user() especially in the case when we can't avoid the false "alive == T" positives. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134028.GA4786@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index df5661a44e35..0f370ef57a02 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1511,10 +1511,11 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } -static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) +static void cleanup_return_instances(struct uprobe_task *utask, bool chained, + struct pt_regs *regs) { struct return_instance *ri = utask->return_instances; - enum rp_check ctx = RP_CHECK_CALL; + enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri = free_ret_instance(ri); @@ -1528,7 +1529,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) struct return_instance *ri; struct uprobe_task *utask; unsigned long orig_ret_vaddr, trampoline_vaddr; - bool chained = false; + bool chained; if (!get_xol_area()) return; @@ -1554,14 +1555,15 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) goto fail; /* drop the entries invalidated by longjmp() */ - cleanup_return_instances(utask, regs); + chained = (orig_ret_vaddr == trampoline_vaddr); + cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ - if (orig_ret_vaddr == trampoline_vaddr) { + if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an @@ -1570,8 +1572,6 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) uprobe_warn(current, "handle tail call"); goto fail; } - - chained = true; orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } -- cgit v1.2.3 From f58bea2fec63db72f8050ade709358257e9102ab Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:31 +0200 Subject: uprobes: Fix the usage of install_special_mapping() install_special_mapping(pages) expects that "pages" is the zero- terminated array while xol_add_vma() passes &area->page, this means that special_mapping_fault() can wrongly use the next member in xol_area (vaddr) as "struct page *". Fortunately, this area is not expandable so pgoff != 0 isn't possible (modulo bugs in special_mapping_vmops), but still this does not look good. Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134031.GA4789@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0f370ef57a02..4b8ac5f13320 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -99,7 +99,7 @@ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ atomic_t slot_count; /* number of in-use slots */ unsigned long *bitmap; /* 0 = free slot */ - struct page *page; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma @@ -1142,7 +1142,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, area->pages); if (ret) goto fail; @@ -1168,21 +1168,22 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) + area->pages[0] = alloc_page(GFP_HIGHUSER); + if (!area->pages[0]) goto free_bitmap; + area->pages[1] = NULL; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); if (!xol_add_vma(mm, area)) return area; - __free_page(area->page); + __free_page(area->pages[0]); free_bitmap: kfree(area->bitmap); free_area: @@ -1220,7 +1221,7 @@ void uprobe_clear_state(struct mm_struct *mm) if (!area) return; - put_page(area->page); + put_page(area->pages[0]); kfree(area->bitmap); kfree(area); } @@ -1289,7 +1290,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - arch_uprobe_copy_ixol(area->page, xol_vaddr, + arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; -- cgit v1.2.3 From 704bde3cc26a4cb34386c164107b59e09745a022 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:33 +0200 Subject: uprobes: Use vm_special_mapping to name the XOL vma Change xol_add_vma() to use _install_special_mapping(), this way we can name the vma installed by uprobes. Currently it looks like private anonymous mapping, this is confusing and complicates the debugging. With this change /proc/$pid/maps reports "[uprobes]". As a side effect this will cause core dumps to include the XOL vma and I think this is good; this can help to debug the problem if the app crashed because it was probed. Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134033.GA4796@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4b8ac5f13320..2d5b7bd337a7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -96,17 +96,18 @@ struct uprobe { * allocated. */ struct xol_area { - wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ - unsigned long *bitmap; /* 0 = free slot */ - struct page *pages[2]; + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct vm_special_mapping xol_mapping; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ - unsigned long vaddr; /* Page(s) of instruction slots */ + unsigned long vaddr; /* Page(s) of instruction slots */ }; /* @@ -1125,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - int ret = -EALREADY; + struct vm_area_struct *vma; + int ret; down_write(&mm->mmap_sem); - if (mm->uprobes_state.xol_area) + if (mm->uprobes_state.xol_area) { + ret = -EALREADY; goto fail; + } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ @@ -1141,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } } - ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, area->pages); - if (ret) + vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + &area->xol_mapping); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto fail; + } + ret = 0; smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; fail: @@ -1168,6 +1176,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; + area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) goto free_bitmap; -- cgit v1.2.3 From 2a742cedcf13572999436676cbe36c3a9b733b0f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:36 +0200 Subject: uprobes: Fix the waitqueue_active() check in xol_free_insn_slot() The xol_free_insn_slot()->waitqueue_active() check is buggy. We need mb() after we set the conditon for wait_event(), or xol_take_insn_slot() can miss the wakeup. Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134036.GA4799@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2d5b7bd337a7..4e5e9798aa0c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1337,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) clear_bit(slot_nr, area->bitmap); atomic_dec(&area->slot_count); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); -- cgit v1.2.3 From a5b9e5a2f14f25a8dae987494d50ad3aac7366b6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 30 Jul 2015 14:31:34 -0700 Subject: x86/ldt: Make modify_ldt() optional The modify_ldt syscall exposes a large attack surface and is unnecessary for modern userspace. Make it optional. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jan Beulich Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt Cc: Thomas Gleixner Cc: security@kernel.org Cc: xen-devel Link: http://lkml.kernel.org/r/a605166a771c343fd64802dece77a903507333bd.1438291540.git.luto@kernel.org [ Made MATH_EMULATION dependent on MODIFY_LDT_SYSCALL. ] Signed-off-by: Ingo Molnar --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7995ef5868d8..ca7d84f438f1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask); cond_syscall(sys_ssetmask); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(sys_modify_ldt); cond_syscall(sys_ipc); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); -- cgit v1.2.3 From d74892c5b291c0010295d26d6b1e11cd70451722 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 29 Jul 2015 15:14:17 -0400 Subject: clockevents: Drop redundant cpumask check in tick_check_new_device() The same check is performed by tick_check_percpu(). Signed-off-by: Luiz Capitulino Link: http://lkml.kernel.org/r/20150729151417.069d1bb0@redhat.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f8bf47571dda..d11c55b6ab7d 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -304,9 +304,6 @@ void tick_check_new_device(struct clock_event_device *newdev) int cpu; cpu = smp_processor_id(); - if (!cpumask_test_cpu(cpu, newdev->cpumask)) - goto out_bc; - td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; -- cgit v1.2.3 From 985d3a4c11cd28251bcc7925aa2d7a9038910384 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 Jul 2015 06:11:51 +0800 Subject: sched/fair: Avoid pulling all tasks in idle balancing In idle balancing where a CPU going idle pulls tasks from another CPU, a livelock may happen if the CPU pulls all tasks from another, makes it idle, and this iterates. So just avoid this. Reported-by: Rabin Vincent Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Linus Torvalds Cc: Mike Galbraith Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150705221151.GF5197@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 587a2f67ceb1..8b384b8d2f1d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5863,6 +5863,13 @@ static int detach_tasks(struct lb_env *env) return 0; while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, + * which could at worst lead to a livelock crash. + */ + if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + break; + p = list_first_entry(tasks, struct task_struct, se.group_node); env->loop++; -- cgit v1.2.3 From 8fd373548eed815858d7b5534971bdc64b4d8d2c Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Sat, 4 Jul 2015 15:39:22 +0800 Subject: sched/rt: Remove a redundant condition from task_woken_rt() 'p' has been already queued at this point, so "!task_running(rq, p)" and "p->nr_cpus_allowed > 1" imply that "has_pushable_tasks(rq)" is true, so it can be removed. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435995563-3723-1-git-send-email-xlpang@126.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0d193a243e96..00816eeaa308 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2069,7 +2069,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - has_pushable_tasks(rq) && p->nr_cpus_allowed > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && (rq->curr->nr_cpus_allowed < 2 || -- cgit v1.2.3 From 3fe33bcdd358dd8c641cf4d92c9d2d9972ca94dd Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Sat, 4 Jul 2015 15:39:23 +0800 Subject: sched/deadline: Remove a redundant condition from task_woken_dl() 'p' has been already queued at this point, so "!task_running(rq, p)" and "p->nr_cpus_allowed > 1" imply that "has_pushable_dl_tasks(rq)" is true, so it can be removed. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435995563-3723-2-git-send-email-xlpang@126.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0a17af35670a..20772eea67f2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1657,7 +1657,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - has_pushable_dl_tasks(rq) && p->nr_cpus_allowed > 1 && dl_task(rq->curr) && (rq->curr->nr_cpus_allowed < 2 || -- cgit v1.2.3 From 781b0203423c228b100aaaf169c77b2b556f8a49 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 4 Jul 2015 09:06:32 +0200 Subject: sched, sysctl: Delete an unnecessary check before unregister_sysctl_table() The unregister_sysctl_table() function tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5597877E.3060503@users.sourceforge.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78b4bad10081..48be7dc3d497 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5311,8 +5311,7 @@ static void register_sched_domain_sysctl(void) /* may be called multiple times per register */ static void unregister_sched_domain_sysctl(void) { - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); + unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; if (sd_ctl_dir[0].child) sd_free_ctl_entry(&sd_ctl_dir[0].child); -- cgit v1.2.3 From 9d7fb04276481c59610983362d8e023d262b58ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Jun 2015 11:30:54 +0200 Subject: sched/cputime: Guarantee stime + utime == rtime While the current code guarantees monotonicity for stime and utime independently of one another, it does not guarantee that the sum of both is equal to the total time we started out with. This confuses things (and peoples) who look at this sum, like top, and will report >100% usage followed by a matching period of 0%. Rework the code to provide both individual monotonicity and a coherent sum. Suggested-by: Fredrik Markstrom Reported-by: Fredrik Markstrom Tested-by: Fredrik Markstrom Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: jason.low2@hp.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 7 ++-- kernel/sched/cputime.c | 101 +++++++++++++++++++++++++++++-------------------- 2 files changed, 64 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1bfefc6f96a4..6e8f807c5716 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1067,6 +1067,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; + atomic_set(&sig->count, 1); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); return 0; @@ -1128,6 +1129,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); seqlock_init(&sig->stats_lock); + prev_cputime_init(&sig->prev_cputime); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; @@ -1335,9 +1337,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - p->prev_cputime.utime = p->prev_cputime.stime = 0; -#endif + prev_cputime_init(&p->prev_cputime); + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqlock_init(&p->vtime_seqlock); p->vtime_snap = 0; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f5a64ffad176..8cbc3db671df 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -555,48 +555,43 @@ drop_precision: } /* - * Atomically advance counter to the new value. Interrupts, vcpu - * scheduling, and scaling inaccuracies can cause cputime_advance - * to be occasionally called with a new value smaller than counter. - * Let's enforce atomicity. + * Adjust tick based cputime random precision against scheduler runtime + * accounting. * - * Normally a caller will only go through this loop once, or not - * at all in case a previous caller updated counter the same jiffy. - */ -static void cputime_advance(cputime_t *counter, cputime_t new) -{ - cputime_t old; - - while (new > (old = READ_ONCE(*counter))) - cmpxchg_cputime(counter, old, new); -} - -/* - * Adjust tick based cputime random precision against scheduler - * runtime accounting. + * Tick based cputime accounting depend on random scheduling timeslices of a + * task to be interrupted or not by the timer. Depending on these + * circumstances, the number of these interrupts may be over or + * under-optimistic, matching the real user and system cputime with a variable + * precision. + * + * Fix this by scaling these tick based values against the total runtime + * accounted by the CFS scheduler. + * + * This code provides the following guarantees: + * + * stime + utime == rtime + * stime_i+1 >= stime_i, utime_i+1 >= utime_i + * + * Assuming that rtime_i+1 >= rtime_i. */ static void cputime_adjust(struct task_cputime *curr, - struct cputime *prev, + struct prev_cputime *prev, cputime_t *ut, cputime_t *st) { cputime_t rtime, stime, utime; + unsigned long flags; - /* - * Tick based cputime accounting depend on random scheduling - * timeslices of a task to be interrupted or not by the timer. - * Depending on these circumstances, the number of these interrupts - * may be over or under-optimistic, matching the real user and system - * cputime with a variable precision. - * - * Fix this by scaling these tick based values against the total - * runtime accounted by the CFS scheduler. - */ + /* Serialize concurrent callers such that we can honour our guarantees */ + raw_spin_lock_irqsave(&prev->lock, flags); rtime = nsecs_to_cputime(curr->sum_exec_runtime); /* - * Update userspace visible utime/stime values only if actual execution - * time is bigger than already exported. Note that can happen, that we - * provided bigger values due to scaling inaccuracy on big numbers. + * This is possible under two circumstances: + * - rtime isn't monotonic after all (a bug); + * - we got reordered by the lock. + * + * In both cases this acts as a filter such that the rest of the code + * can assume it is monotonic regardless of anything else. */ if (prev->stime + prev->utime >= rtime) goto out; @@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr, if (utime == 0) { stime = rtime; - } else if (stime == 0) { - utime = rtime; - } else { - cputime_t total = stime + utime; + goto update; + } - stime = scale_stime((__force u64)stime, - (__force u64)rtime, (__force u64)total); - utime = rtime - stime; + if (stime == 0) { + utime = rtime; + goto update; } - cputime_advance(&prev->stime, stime); - cputime_advance(&prev->utime, utime); + stime = scale_stime((__force u64)stime, (__force u64)rtime, + (__force u64)(stime + utime)); + + /* + * Make sure stime doesn't go backwards; this preserves monotonicity + * for utime because rtime is monotonic. + * + * utime_i+1 = rtime_i+1 - stime_i + * = rtime_i+1 - (rtime_i - utime_i) + * = (rtime_i+1 - rtime_i) + utime_i + * >= utime_i + */ + if (stime < prev->stime) + stime = prev->stime; + utime = rtime - stime; + + /* + * Make sure utime doesn't go backwards; this still preserves + * monotonicity for stime, analogous argument to above. + */ + if (utime < prev->utime) { + utime = prev->utime; + stime = rtime - utime; + } +update: + prev->stime = stime; + prev->utime = utime; out: *ut = prev->utime; *st = prev->stime; + raw_spin_unlock_irqrestore(&prev->lock, flags); } void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -- cgit v1.2.3 From fbd705a0c6184580d0e2fbcbd47a37b6e5822511 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Jun 2015 11:13:36 +0200 Subject: sched: Introduce the 'trace_sched_waking' tracepoint Mathieu reported that since 317f394160e9 ("sched: Move the second half of ttwu() to the remote cpu") trace_sched_wakeup() can happen out of context of the waker. This is a problem when you want to analyse wakeup paths because it is now very hard to correlate the wakeup event to whoever issued the wakeup. OTOH trace_sched_wakeup() is issued at the point where we set p->state = TASK_RUNNING, which is right were we hand the task off to the scheduler, so this is an important point when looking at scheduling behaviour, up to here its been the wakeup path everything hereafter is due to scheduler policy. To bridge this gap, introduce a second tracepoint: trace_sched_waking. It is guaranteed to be called in the waker context. Reported-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Francis Giraldeau Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150609091336.GQ3644@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++--- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 48be7dc3d497..fa5826cc612f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1654,9 +1654,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { check_preempt_curr(rq, p, wake_flags); - trace_sched_wakeup(p, true); - p->state = TASK_RUNNING; + trace_sched_wakeup(p); + #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -1874,6 +1874,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (!(p->state & state)) goto out; + trace_sched_waking(p); + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1949,6 +1951,8 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; + trace_sched_waking(p); + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); @@ -2307,7 +2311,7 @@ void wake_up_new_task(struct task_struct *p) rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; - trace_sched_wakeup_new(p, true); + trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 419ca37e72c9..f270088e9929 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -26,7 +26,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n } static void -probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) +probe_sched_wakeup(void *ignore, struct task_struct *wakee) { if (unlikely(!sched_ref)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9b33dd117f3f..12cbe77b4136 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -514,7 +514,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(void *ignore, struct task_struct *p, int success) +probe_wakeup(void *ignore, struct task_struct *p) { struct trace_array_cpu *data; int cpu = smp_processor_id(); -- cgit v1.2.3 From 63b0e9edceec10fa41ec33393a1515a5ff444277 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 14 Jul 2015 17:39:50 +0200 Subject: sched/fair: Beef up wake_wide() Josef Bacik reported that Facebook sees better performance with their 1:N load (1 dispatch/node, N workers/node) when carrying an old patch to try very hard to wake to an idle CPU. While looking at wake_wide(), I noticed that it doesn't pay attention to the wakeup of a many partner waker, returning 1 only when waking one of its many partners. Correct that, letting explicit domain flags override the heuristic. While at it, adjust task_struct bits, we don't need a 64-bit counter. Tested-by: Josef Bacik Signed-off-by: Mike Galbraith [ Tidy things up. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team Cc: morten.rasmussen@arm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1436888390.7983.49.camel@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 67 ++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8b384b8d2f1d..ea23f9f1b51b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4726,26 +4726,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * A waker of many should wake a different task than the one last awakened + * at a frequency roughly N times higher than one of its wakees. In order + * to determine whether we should let the load spread vs consolodating to + * shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of lls_size higher frequency in the other. With + * both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. Waker/wakee + * being client/server, worker/dispatcher, interrupt source or whatever is + * irrelevant, spread criteria is apparent partner count exceeds socket size. + */ static int wake_wide(struct task_struct *p) { + unsigned int master = current->wakee_flips; + unsigned int slave = p->wakee_flips; int factor = this_cpu_read(sd_llc_size); - /* - * Yeah, it's the switching-frequency, could means many wakee or - * rapidly switch, use factor here will just help to automatically - * adjust the loose-degree, so bigger node will lead to more pull. - */ - if (p->wakee_flips > factor) { - /* - * wakee is somewhat hot, it needs certain amount of cpu - * resource, so if waker is far more hot, prefer to leave - * it alone. - */ - if (current->wakee_flips > (factor * p->wakee_flips)) - return 1; - } - - return 0; + if (master < slave) + swap(master, slave); + if (slave < factor || master < slave * factor) + return 0; + return 1; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) @@ -4757,13 +4760,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; - /* - * If we wake multiple tasks be careful to not bounce - * ourselves around too much. - */ - if (wake_wide(p)) - return 0; - idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); @@ -5017,17 +5013,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int new_cpu = cpu; + int new_cpu = prev_cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) - continue; + break; /* * If both cpu and prev_cpu are part of this domain, @@ -5041,17 +5037,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (tmp->flags & sd_flag) sd = tmp; + else if (!want_affine) + break; } - if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; - - if (sd_flag & SD_BALANCE_WAKE) { - new_cpu = select_idle_sibling(p, prev_cpu); - goto unlock; + if (affine_sd) { + sd = NULL; /* Prefer wake_affine over balance flags */ + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + new_cpu = cpu; } - while (sd) { + if (!sd) { + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + new_cpu = select_idle_sibling(p, new_cpu); + + } else while (sd) { struct sched_group *group; int weight; @@ -5085,7 +5085,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ } -unlock: rcu_read_unlock(); return new_cpu; -- cgit v1.2.3 From fe32d3cd5e8eb0f82e459763374aa80797023403 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 15 Jul 2015 12:52:04 +0300 Subject: sched/preempt: Fix cond_resched_lock() and cond_resched_softirq() These functions check should_resched() before unlocking spinlock/bh-enable: preempt_count always non-zero => should_resched() always returns false. cond_resched_lock() worked iff spin_needbreak is set. This patch adds argument "preempt_offset" to should_resched(). preempt_count offset constants for that: PREEMPT_DISABLE_OFFSET - offset after preempt_disable() PREEMPT_LOCK_OFFSET - offset after spin_lock() SOFTIRQ_DISABLE_OFFSET - offset after local_bh_distable() SOFTIRQ_LOCK_OFFSET - offset after spin_lock_bh() Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Graf Cc: Boris Ostrovsky Cc: David Vrabel Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: bdb438065890 ("sched: Extract the basic add/sub preempt_count modifiers") Link: http://lkml.kernel.org/r/20150715095204.12246.98268.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa5826cc612f..f5fad2b12baf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4496,7 +4496,7 @@ SYSCALL_DEFINE0(sched_yield) int __sched _cond_resched(void) { - if (should_resched()) { + if (should_resched(0)) { preempt_schedule_common(); return 1; } @@ -4514,7 +4514,7 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { - int resched = should_resched(); + int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; lockdep_assert_held(lock); @@ -4536,7 +4536,7 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (should_resched()) { + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { local_bh_enable(); preempt_schedule_common(); local_bh_disable(); -- cgit v1.2.3 From 02cb7aa923ec553e6454ec766ded27b472326ebe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 30 Jun 2015 03:29:44 +0200 Subject: stop_machine: Move 'cpu_stopper_task' and 'stop_cpus_work' into 'struct cpu_stopper' Multpiple DEFINE_PER_CPU's do not make sense, move all the per-cpu variables into 'struct cpu_stopper'. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: der.herr@hofr.at Cc: paulmck@linux.vnet.ibm.com Cc: riel@redhat.com Cc: viro@ZenIV.linux.org.uk Link: http://lkml.kernel.org/r/20150630012944.GA23924@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index fd643d8c4b42..6e677b003164 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -35,13 +35,16 @@ struct cpu_stop_done { /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { + struct task_struct *thread; + spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ + + struct cpu_stop_work stop_work; /* for stop_cpus */ }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); -static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); static bool stop_machine_initialized = false; /* @@ -74,7 +77,6 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct task_struct *p = per_cpu(cpu_stopper_task, cpu); unsigned long flags; @@ -82,7 +84,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) if (stopper->enabled) { list_add_tail(&work->list, &stopper->works); - wake_up_process(p); + wake_up_process(stopper->thread); } else cpu_stop_signal_done(work->done, false); @@ -293,7 +295,6 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); -static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, @@ -304,7 +305,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, /* initialize works and done */ for_each_cpu(cpu, cpumask) { - work = &per_cpu(stop_cpus_work, cpu); + work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; work->arg = arg; work->done = done; @@ -317,7 +318,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, */ lg_global_lock(&stop_cpus_lock); for_each_cpu(cpu, cpumask) - cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); + cpu_stop_queue_work(cpu, &per_cpu(cpu_stopper.stop_work, cpu)); lg_global_unlock(&stop_cpus_lock); } @@ -458,7 +459,7 @@ extern void sched_set_stop_task(int cpu, struct task_struct *stop); static void cpu_stop_create(unsigned int cpu) { - sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); + sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); } static void cpu_stop_park(unsigned int cpu) @@ -485,7 +486,7 @@ static void cpu_stop_unpark(unsigned int cpu) } static struct smp_hotplug_thread cpu_stop_threads = { - .store = &cpu_stopper_task, + .store = &cpu_stopper.thread, .thread_should_run = cpu_stop_should_run, .thread_fn = cpu_stopper_thread, .thread_comm = "migration/%u", -- cgit v1.2.3 From b377c2a089d4538e6e62e51fa595c896c314d83d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 30 Jun 2015 03:29:48 +0200 Subject: stop_machine: Don't do for_each_cpu() twice in queue_stop_cpus_work() queue_stop_cpus_work() can do everything in one for_each_cpu() loop. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: der.herr@hofr.at Cc: paulmck@linux.vnet.ibm.com Cc: riel@redhat.com Cc: viro@ZenIV.linux.org.uk Link: http://lkml.kernel.org/r/20150630012948.GA23927@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6e677b003164..621220852df0 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -303,22 +303,19 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, struct cpu_stop_work *work; unsigned int cpu; - /* initialize works and done */ - for_each_cpu(cpu, cpumask) { - work = &per_cpu(cpu_stopper.stop_work, cpu); - work->fn = fn; - work->arg = arg; - work->done = done; - } - /* * Disable preemption while queueing to avoid getting * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ lg_global_lock(&stop_cpus_lock); - for_each_cpu(cpu, cpumask) - cpu_stop_queue_work(cpu, &per_cpu(cpu_stopper.stop_work, cpu)); + for_each_cpu(cpu, cpumask) { + work = &per_cpu(cpu_stopper.stop_work, cpu); + work->fn = fn; + work->arg = arg; + work->done = done; + cpu_stop_queue_work(cpu, work); + } lg_global_unlock(&stop_cpus_lock); } -- cgit v1.2.3 From 7eeb088e72048bf4660f64fc3824c8066cf17591 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 30 Jun 2015 03:29:51 +0200 Subject: stop_machine: Unexport __stop_machine() The only caller outside of stop_machine.c is _cpu_down(), it can use stop_machine(). get_online_cpus() is fine under cpu_hotplug_begin(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: der.herr@hofr.at Cc: paulmck@linux.vnet.ibm.com Cc: riel@redhat.com Cc: viro@ZenIV.linux.org.uk Link: http://lkml.kernel.org/r/20150630012951.GA23934@redhat.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 +- kernel/stop_machine.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9c9c9fab16cc..664ce5299334 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -395,7 +395,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * So now all preempt/rcu users must observe !cpu_active(). */ - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 621220852df0..b50910dbf030 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,7 +513,7 @@ early_initcall(cpu_stop_init); #ifdef CONFIG_STOP_MACHINE -int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +static int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, -- cgit v1.2.3 From 9a301f22faac7fc2207ee49c1855a6b4ba9c5a52 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 30 Jun 2015 03:29:55 +0200 Subject: stop_machine: Use 'cpu_stop_fn_t' where possible Cosmetic, but 'cpu_stop_fn_t' actually makes the code more readable and it doesn't break cscope. And most of the declarations already use it. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: der.herr@hofr.at Cc: paulmck@linux.vnet.ibm.com Cc: riel@redhat.com Cc: viro@ZenIV.linux.org.uk Link: http://lkml.kernel.org/r/20150630012955.GA23937@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b50910dbf030..9a70defe9f1f 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -141,7 +141,7 @@ enum multi_stop_state { }; struct multi_stop_data { - int (*fn)(void *); + cpu_stop_fn_t fn; void *data; /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ unsigned int num_threads; @@ -513,7 +513,7 @@ early_initcall(cpu_stop_init); #ifdef CONFIG_STOP_MACHINE -static int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, @@ -546,7 +546,7 @@ static int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *c return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); } -int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { int ret; @@ -580,7 +580,7 @@ EXPORT_SYMBOL_GPL(stop_machine); * 0 if all executions of @fn returned 0, any non zero return value if any * returned non zero. */ -int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, +int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, .data = data, -- cgit v1.2.3 From d308b9f1e4412bcf583c82c4ca15ef97cb8b0e6f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 30 Jun 2015 03:29:58 +0200 Subject: stop_machine: Remove cpu_stop_work's from list in cpu_stop_park() cpu_stop_park() does cpu_stop_signal_done() but leaves the work on stopper->works. The owner of this work can free/reuse this memory right after that and corrupt the list, so if this CPU becomes online again cpu_stopper_thread() will crash. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: der.herr@hofr.at Cc: paulmck@linux.vnet.ibm.com Cc: riel@redhat.com Cc: viro@ZenIV.linux.org.uk Link: http://lkml.kernel.org/r/20150630012958.GA23944@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 9a70defe9f1f..12484e5d5c88 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -462,13 +462,15 @@ static void cpu_stop_create(unsigned int cpu) static void cpu_stop_park(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct cpu_stop_work *work; + struct cpu_stop_work *work, *tmp; unsigned long flags; /* drain remaining works */ spin_lock_irqsave(&stopper->lock, flags); - list_for_each_entry(work, &stopper->works, list) + list_for_each_entry_safe(work, tmp, &stopper->works, list) { + list_del_init(&work->list); cpu_stop_signal_done(work->done, false); + } stopper->enabled = false; spin_unlock_irqrestore(&stopper->lock, flags); } -- cgit v1.2.3 From cd126afe838d7ea9b971cdea087fd498a7293c7f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:36 +0800 Subject: sched/fair: Remove rq's runnable avg The current rq->avg is not used at all since its merge into the kernel, and the code is in the scheduler's hot path, so remove it. Tested-by: Dietmar Eggemann Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-2-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 7 +------ kernel/sched/fair.c | 25 ++++--------------------- kernel/sched/sched.h | 2 -- 3 files changed, 5 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4222ec50ab88..363b7e82554b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) - if (!se) { - struct sched_avg *avg = &cpu_rq(cpu)->avg; - P(avg->runnable_avg_sum); - P(avg->avg_period); + if (!se) return; - } - PN(se->exec_start); PN(se->vruntime); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea23f9f1b51b..90292c672a3b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2724,19 +2724,12 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) } } -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, - runnable, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_task_entity_contrib(struct sched_entity *se) @@ -2940,7 +2933,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, */ void idle_enter_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 1); } /* @@ -2950,7 +2942,6 @@ void idle_enter_fair(struct rq *this_rq) */ void idle_exit_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 0); } static int idle_balance(struct rq *this_rq); @@ -2959,7 +2950,6 @@ static int idle_balance(struct rq *this_rq); static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) {} @@ -4258,10 +4248,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_entity_load_avg(se, 1); } - if (!se) { - update_rq_runnable_avg(rq, rq->nr_running); + if (!se) add_nr_running(rq, 1); - } + hrtick_update(rq); } @@ -4319,10 +4308,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_entity_load_avg(se, 1); } - if (!se) { + if (!se) sub_nr_running(rq, 1); - update_rq_runnable_avg(rq, 1); - } + hrtick_update(rq); } @@ -6005,9 +5993,6 @@ static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) */ if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) list_del_leaf_cfs_rq(cfs_rq); - } else { - struct rq *rq = rq_of(cfs_rq); - update_rq_runnable_avg(rq, rq->nr_running); } } @@ -7988,8 +7973,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (numabalancing_enabled) task_tick_numa(rq, curr); - - update_rq_runnable_avg(rq, 1); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 84d48790bb6d..e13210cce7e8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -595,8 +595,6 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; - - struct sched_avg avg; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* -- cgit v1.2.3 From 9d89c257dfb9c51a532d69397f6eed75e5168c35 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:37 +0800 Subject: sched/fair: Rewrite runnable load and utilization average tracking The idea of runnable load average (let runnable time contribute to weight) was proposed by Paul Turner and Ben Segall, and it is still followed by this rewrite. This rewrite aims to solve the following issues: 1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is updated at the granularity of an entity at a time, which results in the cfs_rq's load average is stale or partially updated: at any time, only one entity is up to date, all other entities are effectively lagging behind. This is undesirable. To illustrate, if we have n runnable entities in the cfs_rq, as time elapses, they certainly become outdated: t0: cfs_rq { e1_old, e2_old, ..., en_old } and when we update: t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old } t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old } ... We solve this by combining all runnable entities' load averages together in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based on the fact that if we regard the update as a function, then: w * update(e) = update(w * e) and update(e1) + update(e2) = update(e1 + e2), then w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2) therefore, by this rewrite, we have an entirely updated cfs_rq at the time we update it: t1: update cfs_rq { e1_new, e2_new, ..., en_new } t2: update cfs_rq { e1_new, e2_new, ..., en_new } ... 2. cfs_rq's load average is different between top rq->cfs_rq and other task_group's per CPU cfs_rqs in whether or not blocked_load_average contributes to the load. The basic idea behind runnable load average (the same for utilization) is that the blocked state is taken into account as opposed to only accounting for the currently runnable state. Therefore, the average should include both the runnable/running and blocked load averages. This rewrite does that. In addition, we also combine runnable/running and blocked averages of all entities into the cfs_rq's average, and update it together at once. This is based on the fact that: update(runnable) + update(blocked) = update(runnable + blocked) This significantly reduces the code as we don't need to separately maintain/update runnable/running load and blocked load. 3. How task_group entities' share is calculated is complex and imprecise. We reduce the complexity in this rewrite to allow a very simple rule: the task_group's load_avg is aggregated from its per CPU cfs_rqs's load_avgs. Then group entity's weight is simply proportional to its own cfs_rq's load_avg / task_group's load_avg. To illustrate, if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then, task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share To sum up, this rewrite in principle is equivalent to the current one, but fixes the issues described above. Turns out, it significantly reduces the code complexity and hence increases clarity and efficiency. In addition, the new averages are more smooth/continuous (no spurious spikes and valleys) and updated more consistently and quickly to reflect the load dynamics. As a result, we have less load tracking overhead, better performance, and especially better power efficiency due to more balanced load. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 - kernel/sched/debug.c | 41 ++-- kernel/sched/fair.c | 630 +++++++++++++++++---------------------------------- kernel/sched/sched.h | 28 +-- 4 files changed, 231 insertions(+), 471 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5fad2b12baf..3981526539c5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2020,9 +2020,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; -#ifdef CONFIG_SMP - p->se.avg.decay_count = 0; -#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 363b7e82554b..74f276f5568c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -88,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #endif P(se->load.weight); #ifdef CONFIG_SMP - P(se->avg.runnable_avg_sum); - P(se->avg.running_avg_sum); - P(se->avg.avg_period); - P(se->avg.load_avg_contrib); - P(se->avg.utilization_avg_contrib); - P(se->avg.decay_count); + P(se->avg.load_avg); + P(se->avg.util_avg); #endif #undef PN #undef P @@ -209,21 +205,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", - cfs_rq->runnable_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", - cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", - cfs_rq->utilization_load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "load_avg", + cfs_rq->avg.load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "util_avg", + cfs_rq->avg.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", + atomic_long_read(&cfs_rq->removed_load_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", + atomic_long_read(&cfs_rq->removed_util_avg)); #ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", - cfs_rq->tg_load_contrib); - SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", - cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", + cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); - SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", - atomic_read(&cfs_rq->tg->runnable_avg)); #endif #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -631,12 +625,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP - P(se.avg.runnable_avg_sum); - P(se.avg.running_avg_sum); - P(se.avg.avg_period); - P(se.avg.load_avg_contrib); - P(se.avg.utilization_avg_contrib); - P(se.avg.decay_count); + P(se.avg.load_sum); + P(se.avg.util_sum); + P(se.avg.load_avg); + P(se.avg.util_avg); + P(se.avg.last_update_time); #endif P(policy); P(prio); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 90292c672a3b..01ffa9509c23 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update); - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; - /* We should have no load, but we need to update last_decay. */ - update_cfs_rq_blocked_load(cfs_rq, 0); } } @@ -664,19 +659,31 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); -static inline void __update_task_entity_contrib(struct sched_entity *se); -static inline void __update_task_entity_utilization(struct sched_entity *se); +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ /* Give new task start runnable values to heavy its load in infant time */ void init_task_runnable_average(struct task_struct *p) { - u32 slice; + struct sched_avg *sa = &p->se.avg; - slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; - p->se.avg.avg_period = slice; - __update_task_entity_contrib(&p->se); - __update_task_entity_utilization(&p->se); + sa->last_update_time = 0; + /* + * sched_avg's period_contrib should be strictly less then 1024, so + * we give it 1023 to make sure it is almost a period (1024us), and + * will definitely be update (after enqueue). + */ + sa->period_contrib = 1023; + sa->load_avg = scale_load_down(p->se.load.weight); + sa->load_sum = sa->load_avg * LOAD_AVG_MAX; + sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_sum = LOAD_AVG_MAX; + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } #else void init_task_runnable_average(struct task_struct *p) @@ -1698,8 +1705,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; } else { - delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.avg_period; + delta = p->se.avg.load_sum / p->se.load.weight; + *period = LOAD_AVG_MAX; } p->last_sum_exec_runtime = runtime; @@ -2347,13 +2354,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) long tg_weight; /* - * Use this CPU's actual weight instead of the last load_contribution - * to gain a more accurate current total weight. See - * __update_cfs_rq_tg_load_contrib(). + * Use this CPU's real-time load instead of the last load contribution + * as the updating of the contribution is delayed, and we will use the + * the real-time load to calc the share. See update_tg_load_avg(). */ tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_contrib; - tg_weight += cfs_rq->load.weight; + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += cfs_rq->avg.load_avg; return tg_weight; } @@ -2363,7 +2370,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) long tg_weight, load, shares; tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + load = cfs_rq->avg.load_avg; shares = (tg->shares * load); if (tg_weight) @@ -2425,14 +2432,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables below are dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ - /* Precomputed fixed inverse multiplies for multiplication by y^n */ static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, @@ -2481,9 +2480,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) local_n %= LOAD_AVG_PERIOD; } - val *= runnable_avg_yN_inv[local_n]; - /* We don't use SRR here since we always want to round down. */ - return val >> 32; + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + return val; } /* @@ -2542,23 +2540,22 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, - struct sched_avg *sa, - int runnable, - int running) +static __always_inline int +__update_load_avg(u64 now, int cpu, struct sched_avg *sa, + unsigned long weight, int running) { u64 delta, periods; - u32 runnable_contrib; + u32 contrib; int delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); - delta = now - sa->last_runnable_update; + delta = now - sa->last_update_time; /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. */ if ((s64)delta < 0) { - sa->last_runnable_update = now; + sa->last_update_time = now; return 0; } @@ -2569,26 +2566,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, delta >>= 10; if (!delta) return 0; - sa->last_runnable_update = now; + sa->last_update_time = now; /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->avg_period % 1024; + delta_w = sa->period_contrib; if (delta + delta_w >= 1024) { - /* period roll-over */ decayed = 1; + /* how much left for next period will start over, we don't know yet */ + sa->period_contrib = 0; + /* * Now that we know we're crossing a period boundary, figure * out how much from delta we need to complete the current * period and accrue it. */ delta_w = 1024 - delta_w; - if (runnable) - sa->runnable_avg_sum += delta_w; + if (weight) + sa->load_sum += weight * delta_w; if (running) - sa->running_avg_sum += delta_w * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta_w; + sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; delta -= delta_w; @@ -2596,334 +2593,156 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, periods = delta / 1024; delta %= 1024; - sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, - periods + 1); - sa->running_avg_sum = decay_load(sa->running_avg_sum, - periods + 1); - sa->avg_period = decay_load(sa->avg_period, - periods + 1); + sa->load_sum = decay_load(sa->load_sum, periods + 1); + sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - runnable_contrib = __compute_runnable_contrib(periods); - if (runnable) - sa->runnable_avg_sum += runnable_contrib; + contrib = __compute_runnable_contrib(periods); + if (weight) + sa->load_sum += weight * contrib; if (running) - sa->running_avg_sum += runnable_contrib * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += runnable_contrib; + sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; } /* Remainder of delta accrued against u_0` */ - if (runnable) - sa->runnable_avg_sum += delta; + if (weight) + sa->load_sum += weight * delta; if (running) - sa->running_avg_sum += delta * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta; - - return decayed; -} - -/* Synchronize an entity's decay with its parenting cfs_rq.*/ -static inline u64 __synchronize_entity_decay(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 decays = atomic64_read(&cfs_rq->decay_counter); + sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; - decays -= se->avg.decay_count; - se->avg.decay_count = 0; - if (!decays) - return 0; + sa->period_contrib += delta; - se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.utilization_avg_contrib = - decay_load(se->avg.utilization_avg_contrib, decays); + if (decayed) { + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; + } - return decays; + return decayed; } #ifdef CONFIG_FAIR_GROUP_SCHED -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) -{ - struct task_group *tg = cfs_rq->tg; - long tg_contrib; - - tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; - tg_contrib -= cfs_rq->tg_load_contrib; - - if (!tg_contrib) - return; - - if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { - atomic_long_add(tg_contrib, &tg->load_avg); - cfs_rq->tg_load_contrib += tg_contrib; - } -} - /* - * Aggregate cfs_rq runnable averages into an equivalent task_group - * representation for computing load contributions. + * Updating tg's load_avg is necessary before update_cfs_share (which is done) + * and effective_load (which is not done because it is too costly). */ -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { - struct task_group *tg = cfs_rq->tg; - long contrib; - - /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->avg_period + 1); - contrib -= cfs_rq->tg_runnable_contrib; + long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; - if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { - atomic_add(contrib, &tg->runnable_avg); - cfs_rq->tg_runnable_contrib += contrib; - } -} - -static inline void __update_group_entity_contrib(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = group_cfs_rq(se); - struct task_group *tg = cfs_rq->tg; - int runnable_avg; - - u64 contrib; - - contrib = cfs_rq->tg_load_contrib * tg->shares; - se->avg.load_avg_contrib = div_u64(contrib, - atomic_long_read(&tg->load_avg) + 1); - - /* - * For group entities we need to compute a correction term in the case - * that they are consuming <1 cpu so that we would contribute the same - * load as a task of equal weight. - * - * Explicitly co-ordinating this measurement would be expensive, but - * fortunately the sum of each cpus contribution forms a usable - * lower-bound on the true value. - * - * Consider the aggregate of 2 contributions. Either they are disjoint - * (and the sum represents true value) or they are disjoint and we are - * understating by the aggregate of their overlap. - * - * Extending this to N cpus, for a given overlap, the maximum amount we - * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of - * cpus that overlap for this interval and w_i is the interval width. - * - * On a small machine; the first term is well-bounded which bounds the - * total error since w_i is a subset of the period. Whereas on a - * larger machine, while this first term can be larger, if w_i is the - * of consequential size guaranteed to see n_i*w_i quickly converge to - * our upper bound of 1-cpu. - */ - runnable_avg = atomic_read(&tg->runnable_avg); - if (runnable_avg < NICE_0_LOAD) { - se->avg.load_avg_contrib *= runnable_avg; - se->avg.load_avg_contrib >>= NICE_0_SHIFT; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) {} -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) {} -static inline void __update_group_entity_contrib(struct sched_entity *se) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_task_entity_contrib(struct sched_entity *se) -{ - u32 contrib; - - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.avg_period + 1); - se->avg.load_avg_contrib = scale_load(contrib); -} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -/* Compute the current contribution to load_avg by se, return any delta */ -static long __update_entity_load_avg_contrib(struct sched_entity *se) +/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - long old_contrib = se->avg.load_avg_contrib; + int decayed; + struct sched_avg *sa = &cfs_rq->avg; - if (entity_is_task(se)) { - __update_task_entity_contrib(se); - } else { - __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); - __update_group_entity_contrib(se); + if (atomic_long_read(&cfs_rq->removed_load_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + sa->load_avg = max_t(long, sa->load_avg - r, 0); + sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); } - return se->avg.load_avg_contrib - old_contrib; -} - - -static inline void __update_task_entity_utilization(struct sched_entity *se) -{ - u32 contrib; - - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); - contrib /= (se->avg.avg_period + 1); - se->avg.utilization_avg_contrib = scale_load(contrib); -} + if (atomic_long_read(&cfs_rq->removed_util_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); + sa->util_avg = max_t(long, sa->util_avg - r, 0); + sa->util_sum = max_t(s32, sa->util_sum - + ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0); + } -static long __update_entity_utilization_avg_contrib(struct sched_entity *se) -{ - long old_contrib = se->avg.utilization_avg_contrib; + decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL); - if (entity_is_task(se)) - __update_task_entity_utilization(se); - else - se->avg.utilization_avg_contrib = - group_cfs_rq(se)->utilization_load_avg; - - return se->avg.utilization_avg_contrib - old_contrib; -} +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->load_last_update_time_copy = sa->last_update_time; +#endif -static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, - long load_contrib) -{ - if (likely(load_contrib < cfs_rq->blocked_load_avg)) - cfs_rq->blocked_load_avg -= load_contrib; - else - cfs_rq->blocked_load_avg = 0; + return decayed; } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - -/* Update a sched_entity's runnable average */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta, utilization_delta; int cpu = cpu_of(rq_of(cfs_rq)); - u64 now; + u64 now = cfs_rq_clock_task(cfs_rq); /* - * For a group entity we need to use their owned cfs_rq_clock_task() in - * case they are the parent of a throttled hierarchy. + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration */ - if (entity_is_task(se)) - now = cfs_rq_clock_task(cfs_rq); - else - now = cfs_rq_clock_task(group_cfs_rq(se)); + __update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); - if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, - cfs_rq->curr == se)) - return; - - contrib_delta = __update_entity_load_avg_contrib(se); - utilization_delta = __update_entity_utilization_avg_contrib(se); - - if (!update_cfs_rq) - return; - - if (se->on_rq) { - cfs_rq->runnable_load_avg += contrib_delta; - cfs_rq->utilization_load_avg += utilization_delta; - } else { - subtract_blocked_load_contrib(cfs_rq, -contrib_delta); - } + if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + update_tg_load_avg(cfs_rq, 0); } -/* - * Decay the load contributed by all blocked children and account this so that - * their contribution may appropriately discounted when they wake up. - */ -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +/* Add the load generated by se into cfs_rq's load average */ +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 now = cfs_rq_clock_task(cfs_rq) >> 20; - u64 decays; - - decays = now - cfs_rq->last_decay; - if (!decays && !force_update) - return; + struct sched_avg *sa = &se->avg; + u64 now = cfs_rq_clock_task(cfs_rq); + int migrated = 0, decayed; - if (atomic_long_read(&cfs_rq->removed_load)) { - unsigned long removed_load; - removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); - subtract_blocked_load_contrib(cfs_rq, removed_load); + if (sa->last_update_time == 0) { + sa->last_update_time = now; + migrated = 1; } - - if (decays) { - cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, - decays); - atomic64_add(decays, &cfs_rq->decay_counter); - cfs_rq->last_decay = now; + else { + __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); } - __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); -} + decayed = update_cfs_rq_load_avg(now, cfs_rq); -/* Add the load generated by se into cfs_rq's child load-average */ -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) -{ - /* - * We track migrations using entity decay_count <= 0, on a wake-up - * migration we use a negative decay count to track the remote decays - * accumulated while sleeping. - * - * Newly forked tasks are enqueued with se->avg.decay_count == 0, they - * are seen by enqueue_entity_load_avg() as a migration with an already - * constructed load_avg_contrib. - */ - if (unlikely(se->avg.decay_count <= 0)) { - se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); - if (se->avg.decay_count) { - /* - * In a wake-up migration we have to approximate the - * time sleeping. This is because we can't synchronize - * clock_task between the two cpus, and it is not - * guaranteed to be read-safe. Instead, we can - * approximate this using our carried decays, which are - * explicitly atomically readable. - */ - se->avg.last_runnable_update -= (-se->avg.decay_count) - << 20; - update_entity_load_avg(se, 0); - /* Indicate that we're now synchronized and on-rq */ - se->avg.decay_count = 0; - } - wakeup = 0; - } else { - __synchronize_entity_decay(se); + if (migrated) { + cfs_rq->avg.load_avg += sa->load_avg; + cfs_rq->avg.load_sum += sa->load_sum; + cfs_rq->avg.util_avg += sa->util_avg; + cfs_rq->avg.util_sum += sa->util_sum; } - /* migrated tasks did not contribute to our blocked load */ - if (wakeup) { - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - update_entity_load_avg(se, 0); - } - - cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !wakeup); + if (decayed || migrated) + update_tg_load_avg(cfs_rq, 0); } /* - * Remove se's load from this cfs_rq child load-average, if the entity is - * transitioning to a blocked state we track its projected decay using - * blocked_load_avg. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) +void remove_entity_load_avg(struct sched_entity *se) { - update_entity_load_avg(se, 1); - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !sleep); + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + +#ifndef CONFIG_64BIT + u64 last_update_time_copy; - cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; - if (sleep) { - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - } /* migrations, e.g. sleep=0 leave decay_count == 0 */ + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); +#else + last_update_time = cfs_rq->avg.last_update_time; +#endif + + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } /* @@ -2948,16 +2767,10 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) {} -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) {} -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) {} -static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update) {} +static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline int idle_balance(struct rq *rq) { @@ -3089,7 +2902,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3164,7 +2977,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); + update_load_avg(se, 1); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -3254,7 +3067,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_entity_load_avg(se, 1); + update_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -3354,7 +3167,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_entity_load_avg(prev, 1); + update_load_avg(prev, 0); } cfs_rq->curr = NULL; } @@ -3370,8 +3183,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_entity_load_avg(curr, 1); - update_cfs_rq_blocked_load(cfs_rq, 1); + update_load_avg(curr, 1); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4244,8 +4056,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } if (!se) @@ -4304,8 +4116,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } if (!se) @@ -4444,7 +4256,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, static void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long load = this_rq->cfs.avg.load_avg; unsigned long pending_updates; /* @@ -4490,7 +4302,7 @@ void update_cpu_load_nohz(void) */ void update_cpu_load_active(struct rq *this_rq) { - unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long load = this_rq->cfs.avg.load_avg; /* * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ @@ -4501,7 +4313,7 @@ void update_cpu_load_active(struct rq *this_rq) /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->cfs.runnable_load_avg; + return cpu_rq(cpu)->cfs.avg.load_avg; } /* @@ -4551,7 +4363,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = rq->cfs.runnable_load_avg; + unsigned long load_avg = rq->cfs.avg.load_avg; if (nr_running) return load_avg / nr_running; @@ -4670,7 +4482,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * w = rw_i + @wl */ - w = se->my_q->load.weight + wl; + w = se->my_q->avg.load_avg + wl; /* * wl = S * s'_i; see (2) @@ -4691,7 +4503,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * wl = dw_i = S * (s'_i - s_i); see (3) */ - wl -= se->load.weight; + wl -= se->avg.load_avg; /* * Recursively apply this logic to all parent groups to compute @@ -4761,14 +4573,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ if (sync) { tg = task_group(current); - weight = current->se.load.weight; + weight = current->se.avg.load_avg; this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); - weight = p->se.load.weight; + weight = p->se.avg.load_avg; /* * In low-load situations, where prev_cpu is idle and this_cpu is idle @@ -4961,12 +4773,12 @@ done: * tasks. The unit of the return value must be the one of capacity so we can * compare the usage with the capacity of the CPU that is available for CFS * task (ie cpu_capacity). - * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * cfs.avg.util_avg is the sum of running time of runnable tasks on a * CPU. It represents the amount of utilization of a CPU in the range * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full * capacity of the CPU because it's about the running time on this CPU. - * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE - * because of unfortunate rounding in avg_period and running_load_avg or just + * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in util_avg or just * after migrating tasks until the average stabilizes with the new running * time. So we need to check that the usage stays into the range * [0..cpu_capacity_orig] and cap if necessary. @@ -4975,7 +4787,7 @@ done: */ static int get_cpu_usage(int cpu) { - unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); if (usage >= SCHED_LOAD_SCALE) @@ -5084,26 +4896,22 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f * previous cpu. However, the caller only guarantees p->pi_lock is held; no * other assumptions, including the state of rq->lock, should be made. */ -static void -migrate_task_rq_fair(struct task_struct *p, int next_cpu) +static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - /* - * Load tracking: accumulate removed load so that it can be processed - * when we next update owning cfs_rq under rq->lock. Tasks contribute - * to blocked load iff they have a positive decay-count. It can never - * be negative here since on-rq tasks have decay-count == 0. + * We are supposed to update the task to "current" time, then its up to date + * and ready to go to new CPU/cfs_rq. But we have difficulty in getting + * what current time is, so simply throw away the out-of-date time. This + * will result in the wakee task is less decayed, but giving the wakee more + * load sounds not bad. */ - if (se->avg.decay_count) { - se->avg.decay_count = -__synchronize_entity_decay(se); - atomic_long_add(se->avg.load_avg_contrib, - &cfs_rq->removed_load); - } + remove_entity_load_avg(&p->se); + + /* Tell new CPU we are migrated */ + p->se.avg.last_update_time = 0; /* We have migrated, no longer consider this task hot */ - se->exec_start = 0; + p->se.exec_start = 0; } #endif /* CONFIG_SMP */ @@ -5966,36 +5774,6 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * update tg->load_weight by folding this cpu's load_avg - */ -static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) -{ - struct sched_entity *se = tg->se[cpu]; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - return; - - update_cfs_rq_blocked_load(cfs_rq, 1); - - if (se) { - update_entity_load_avg(se, 1); - /* - * We pivot on our runnable average having decayed to zero for - * list removal. This generally implies that all our children - * have also been removed (modulo rounding error or bandwidth - * control); however, such cases are rare and we can fix these - * at enqueue. - * - * TODO: fix up out-of-order children on enqueue. - */ - if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) - list_del_leaf_cfs_rq(cfs_rq); - } -} - static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6004,19 +5782,19 @@ static void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); + /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* - * Note: We may want to consider periodically releasing - * rq->lock about these updates so that creating many task - * groups does not result in continually extending hold time. - */ - __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); - } + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + continue; + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + update_tg_load_avg(cfs_rq, 0); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6044,14 +5822,13 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = cfs_rq->runnable_load_avg; + cfs_rq->h_load = cfs_rq->avg.load_avg; cfs_rq->last_h_load_update = now; } while ((se = cfs_rq->h_load_next) != NULL) { load = cfs_rq->h_load; - load = div64_ul(load * se->avg.load_avg_contrib, - cfs_rq->runnable_load_avg + 1); + load = div64_ul(load * se->avg.load_avg, cfs_rq->avg.load_avg + 1); cfs_rq = group_cfs_rq(se); cfs_rq->h_load = load; cfs_rq->last_h_load_update = now; @@ -6063,8 +5840,8 @@ static unsigned long task_h_load(struct task_struct *p) struct cfs_rq *cfs_rq = task_cfs_rq(p); update_cfs_rq_h_load(cfs_rq); - return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, - cfs_rq->runnable_load_avg + 1); + return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, + cfs_rq->avg.load_avg + 1); } #else static inline void update_blocked_averages(int cpu) @@ -6073,7 +5850,7 @@ static inline void update_blocked_averages(int cpu) static unsigned long task_h_load(struct task_struct *p) { - return p->se.avg.load_avg_contrib; + return p->se.avg.load_avg; } #endif @@ -8071,15 +7848,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) } #ifdef CONFIG_SMP - /* - * Remove our load from contribution when we leave sched_fair - * and ensure we don't carry in an old decay_count if we - * switch back. - */ - if (se->avg.decay_count) { - __synchronize_entity_decay(se); - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - } + /* Catch up with the cfs_rq and remove our load when we leave */ + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); + + cfs_rq->avg.load_avg = + max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); + cfs_rq->avg.load_sum = + max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); + cfs_rq->avg.util_avg = + max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); + cfs_rq->avg.util_sum = + max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); #endif } @@ -8136,8 +7916,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP - atomic64_set(&cfs_rq->decay_counter, 1); - atomic_long_set(&cfs_rq->removed_load, 0); + atomic_long_set(&cfs_rq->removed_load_avg, 0); + atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif } @@ -8182,14 +7962,14 @@ static void task_move_group_fair(struct task_struct *p, int queued) if (!queued) { cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; + #ifdef CONFIG_SMP - /* - * migrate_task_rq_fair() will have removed our previous - * contribution, but we must synchronize for ongoing future - * decay. - */ - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + /* Virtually synchronize task with its new cfs_rq */ + p->se.avg.last_update_time = cfs_rq->avg.last_update_time; + cfs_rq->avg.load_avg += p->se.avg.load_avg; + cfs_rq->avg.load_sum += p->se.avg.load_sum; + cfs_rq->avg.util_avg += p->se.avg.util_avg; + cfs_rq->avg.util_sum += p->se.avg.util_sum; #endif } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e13210cce7e8..dcde941a585b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -245,7 +245,6 @@ struct task_group { #ifdef CONFIG_SMP atomic_long_t load_avg; - atomic_t runnable_avg; #endif #endif @@ -366,27 +365,18 @@ struct cfs_rq { #ifdef CONFIG_SMP /* - * CFS Load tracking - * Under CFS, load is tracked on a per-entity basis and aggregated up. - * This allows for the description of both thread and group usage (in - * the FAIR_GROUP_SCHED case). - * runnable_load_avg is the sum of the load_avg_contrib of the - * sched_entities on the rq. - * blocked_load_avg is similar to runnable_load_avg except that its - * the blocked sched_entities on the rq. - * utilization_load_avg is the sum of the average running time of the - * sched_entities on the rq. + * CFS load tracking */ - unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; - atomic64_t decay_counter; - u64 last_decay; - atomic_long_t removed_load; - + struct sched_avg avg; #ifdef CONFIG_FAIR_GROUP_SCHED - /* Required to track per-cpu representation of a task_group */ - u32 tg_runnable_contrib; - unsigned long tg_load_contrib; + unsigned long tg_load_avg_contrib; +#endif + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED /* * h_load = weight * f(tg) * -- cgit v1.2.3 From 6c1d47c0827304949e0eb9479f4d587f226fac8b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 15 Jul 2015 08:04:38 +0800 Subject: sched/fair: Implement update_blocked_averages() for CONFIG_FAIR_GROUP_SCHED=n The load and the utilization of idle CPUs must be updated periodically in order to decay the blocked part. If CONFIG_FAIR_GROUP_SCHED is not set, the load and util of idle cpus are not decayed and stay at the values set before becoming idle. Signed-off-by: Vincent Guittot Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/1436918682-4971-4-git-send-email-yuyang.du@intel.com [ Fixed up the SOB chain. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 01ffa9509c23..e4b80c63633a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5846,6 +5846,14 @@ static unsigned long task_h_load(struct task_struct *p) #else static inline void update_blocked_averages(int cpu) { + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq = &rq->cfs; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static unsigned long task_h_load(struct task_struct *p) -- cgit v1.2.3 From 540247fb5ddf6d2364f90387fa1f8f428d15e683 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:39 +0800 Subject: sched/fair: Init cfs_rq's sched_entity load average The runnable load and utilization averages of cfs_rq's sched_entity were not initiated. Like done to a task, give new cfs_rq' sched_entity start values to heavy its load in infant time. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-5-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 11 ++++++----- kernel/sched/sched.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3981526539c5..5ca9ae0b7e31 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2304,7 +2304,7 @@ void wake_up_new_task(struct task_struct *p) #endif /* Initialize new task's runnable average */ - init_task_runnable_average(p); + init_entity_runnable_average(&p->se); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e4b80c63633a..f636db0e086c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -667,10 +667,10 @@ static unsigned long task_h_load(struct task_struct *p); #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ -/* Give new task start runnable values to heavy its load in infant time */ -void init_task_runnable_average(struct task_struct *p) +/* Give new sched_entity start runnable values to heavy its load in infant time */ +void init_entity_runnable_average(struct sched_entity *se) { - struct sched_avg *sa = &p->se.avg; + struct sched_avg *sa = &se->avg; sa->last_update_time = 0; /* @@ -679,14 +679,14 @@ void init_task_runnable_average(struct task_struct *p) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(p->se.load.weight); + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); sa->util_sum = LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } #else -void init_task_runnable_average(struct task_struct *p) +void init_entity_runnable_average(struct sched_entity *se) { } #endif @@ -8029,6 +8029,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + init_entity_runnable_average(se); } return 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dcde941a585b..4d139e0bc206 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1307,7 +1307,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void init_task_runnable_average(struct task_struct *p); +extern void init_entity_runnable_average(struct sched_entity *se); static inline void add_nr_running(struct rq *rq, unsigned count) { -- cgit v1.2.3 From 1269557889b477e3e43ab99a21035ddf8f7cea4d Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:40 +0800 Subject: sched/fair: Remove task and group entity load when they are dead When task exits or group is destroyed, the entity's load should be removed from its parent cfs_rq's load. Otherwise, it will take time for the parent cfs_rq to decay the dead entity's load to 0, which is not desired. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-6-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f636db0e086c..5532bf38e844 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4913,6 +4913,11 @@ static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) /* We have migrated, no longer consider this task hot */ p->se.exec_start = 0; } + +static void task_dead_fair(struct task_struct *p) +{ + remove_entity_load_avg(&p->se); +} #endif /* CONFIG_SMP */ static unsigned long @@ -7991,8 +7996,11 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) + if (tg->se) { + if (tg->se[i]) + remove_entity_load_avg(tg->se[i]); kfree(tg->se[i]); + } } kfree(tg->cfs_rq); @@ -8179,6 +8187,7 @@ const struct sched_class fair_sched_class = { .rq_offline = rq_offline_fair, .task_waking = task_waking_fair, + .task_dead = task_dead_fair, #endif .set_curr_task = set_curr_task_fair, -- cgit v1.2.3 From 139622343ef31941effc6de6a5a9320371a00e62 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:41 +0800 Subject: sched/fair: Provide runnable_load_avg back to cfs_rq The cfs_rq's load_avg is composed of runnable_load_avg and blocked_load_avg. Before this series, sometimes the runnable_load_avg is used, and sometimes the load_avg is used. Completely replacing all uses of runnable_load_avg with load_avg may be too big a leap, i.e., the blocked_load_avg is concerned to result in overrated load. Therefore, we get runnable_load_avg back. The new cfs_rq's runnable_load_avg is improved to be updated with all of the runnable sched_eneities at the same time, so the one sched_entity updated and the others stale problem is solved. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-7-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 ++ kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 2 ++ 3 files changed, 49 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 74f276f5568c..641511771ae6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -207,6 +207,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5532bf38e844..1a878d59f0f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2542,7 +2542,7 @@ static u32 __compute_runnable_contrib(u64 n) */ static __always_inline int __update_load_avg(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running) + unsigned long weight, int running, struct cfs_rq *cfs_rq) { u64 delta, periods; u32 contrib; @@ -2582,8 +2582,11 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, * period and accrue it. */ delta_w = 1024 - delta_w; - if (weight) + if (weight) { sa->load_sum += weight * delta_w; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * delta_w; + } if (running) sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; @@ -2594,19 +2597,29 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, delta %= 1024; sa->load_sum = decay_load(sa->load_sum, periods + 1); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods + 1); + } sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ contrib = __compute_runnable_contrib(periods); - if (weight) + if (weight) { sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } if (running) sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; } /* Remainder of delta accrued against u_0` */ - if (weight) + if (weight) { sa->load_sum += weight * delta; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * delta; + } if (running) sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; @@ -2614,6 +2627,10 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, if (decayed) { sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + } sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; } @@ -2661,7 +2678,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL); + scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -2683,7 +2700,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) * track group sched_entity load average for task_h_load calc in migration */ __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); @@ -2703,11 +2720,15 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) } else { __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); } decayed = update_cfs_rq_load_avg(now, cfs_rq); + cfs_rq->runnable_load_avg += sa->load_avg; + cfs_rq->runnable_load_sum += sa->load_sum; + if (migrated) { cfs_rq->avg.load_avg += sa->load_avg; cfs_rq->avg.load_sum += sa->load_sum; @@ -2719,6 +2740,18 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) update_tg_load_avg(cfs_rq, 0); } +/* Remove the runnable load generated by se from cfs_rq's runnable load average */ +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_avg(se, 1); + + cfs_rq->runnable_load_avg = + max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); + cfs_rq->runnable_load_sum = + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); +} + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). @@ -2740,7 +2773,7 @@ void remove_entity_load_avg(struct sched_entity *se) last_update_time = cfs_rq->avg.last_update_time; #endif - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -2770,6 +2803,8 @@ static int idle_balance(struct rq *this_rq); static inline void update_load_avg(struct sched_entity *se, int update_tg) {} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline int idle_balance(struct rq *rq) @@ -2977,7 +3012,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_load_avg(se, 1); + dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -7863,7 +7898,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP /* Catch up with the cfs_rq and remove our load when we leave */ __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4d139e0bc206..ab0b05cc3f37 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -368,6 +368,8 @@ struct cfs_rq { * CFS load tracking */ struct sched_avg avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; #endif -- cgit v1.2.3 From 7ea241afbf4924c58d41078599f7a32ba49fb985 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:42 +0800 Subject: sched/fair: Clean up load average references For cfs_rq, we have load.weight, runnable_load_avg, and load_avg. Clean up how they are used: - First, as group sched_entity already largely uses load_avg, we now expand to use load_avg in all cases. - Second, for CPU-wide load balancing, we choose to use runnable_load_avg in all cases, which is the same as before this series. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-8-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1a878d59f0f3..858b94ab1bd2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -685,6 +685,9 @@ void init_entity_runnable_average(struct sched_entity *se) sa->util_sum = LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { @@ -2360,7 +2363,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) */ tg_weight = atomic_long_read(&tg->load_avg); tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq->avg.load_avg; + tg_weight += cfs_rq_load_avg(cfs_rq); return tg_weight; } @@ -2370,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) long tg_weight, load, shares; tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->avg.load_avg; + load = cfs_rq_load_avg(cfs_rq); shares = (tg->shares * load); if (tg_weight) @@ -2796,6 +2799,16 @@ void idle_exit_fair(struct rq *this_rq) { } +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->runnable_load_avg; +} + +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.load_avg; +} + static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ @@ -4270,6 +4283,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); +} + #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the @@ -4291,7 +4310,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, static void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = this_rq->cfs.avg.load_avg; + unsigned long load = weighted_cpuload(cpu_of(this_rq)); unsigned long pending_updates; /* @@ -4337,7 +4356,7 @@ void update_cpu_load_nohz(void) */ void update_cpu_load_active(struct rq *this_rq) { - unsigned long load = this_rq->cfs.avg.load_avg; + unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ @@ -4345,12 +4364,6 @@ void update_cpu_load_active(struct rq *this_rq) __update_cpu_load(this_rq, load, 1); } -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->cfs.avg.load_avg; -} - /* * Return a low guess at the load of a migration-source cpu weighted * according to the scheduling class and "nice" value. @@ -4398,7 +4411,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = rq->cfs.avg.load_avg; + unsigned long load_avg = weighted_cpuload(cpu); if (nr_running) return load_avg / nr_running; @@ -4517,7 +4530,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * w = rw_i + @wl */ - w = se->my_q->avg.load_avg + wl; + w = cfs_rq_load_avg(se->my_q) + wl; /* * wl = S * s'_i; see (2) @@ -5862,13 +5875,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = cfs_rq->avg.load_avg; + cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); cfs_rq->last_h_load_update = now; } while ((se = cfs_rq->h_load_next) != NULL) { load = cfs_rq->h_load; - load = div64_ul(load * se->avg.load_avg, cfs_rq->avg.load_avg + 1); + load = div64_ul(load * se->avg.load_avg, + cfs_rq_load_avg(cfs_rq) + 1); cfs_rq = group_cfs_rq(se); cfs_rq->h_load = load; cfs_rq->last_h_load_update = now; @@ -5881,7 +5895,7 @@ static unsigned long task_h_load(struct task_struct *p) update_cfs_rq_h_load(cfs_rq); return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, - cfs_rq->avg.load_avg + 1); + cfs_rq_load_avg(cfs_rq) + 1); } #else static inline void update_blocked_averages(int cpu) -- cgit v1.2.3 From 3c8e4793556981a7f532599959aa3303968056f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Jul 2015 17:31:47 +0200 Subject: sched: Remove finish_arch_switch() One less arch hook.. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - kernel/sched/sched.h | 3 --- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ca9ae0b7e31..b11f6240709b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2470,7 +2470,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) */ prev_state = prev->state; vtime_task_switch(prev); - finish_arch_switch(prev); perf_event_task_sched_in(prev, current); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ab0b05cc3f37..22ccc5556c42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1055,9 +1055,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif #ifndef finish_arch_post_lock_switch # define finish_arch_post_lock_switch() do { } while (0) #endif -- cgit v1.2.3 From e5779e8e12299f77c2421a707855d8d124171d85 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 30 Jul 2015 20:32:40 -0700 Subject: perf/x86/hw_breakpoints: Disallow kernel breakpoints unless kprobe-safe Code on the kprobe blacklist doesn't want unexpected int3 exceptions. It probably doesn't want unexpected debug exceptions either. Be safe: disallow breakpoints in nokprobes code. On non-CONFIG_KPROBES kernels, there is no kprobe blacklist. In that case, disallow kernel breakpoints entirely. It will be particularly important to keep hw breakpoints out of the entry and NMI code once we move debug exceptions off the IST stack. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Brian Gerst Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e14b152af99640448d895e3c2a8c2d5ee19a1325.1438312874.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c90e417bb963..d10ab6b9b5e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static bool within_kprobe_blacklist(unsigned long addr) +bool within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; -- cgit v1.2.3 From 9a6694cfa2390181dec936a17c0d9d21ef7b08d9 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Jul 2015 16:48:24 +0300 Subject: perf/x86/intel/pt: Do not force sync packets on every schedule-in Currently, the PT driver zeroes out the status register every time before starting the event. However, all the writable bits are already taken care of in pt_handle_status() function, except the new PacketByteCnt field, which in new versions of PT contains the number of packet bytes written since the last sync (PSB) packet. Zeroing it out before enabling PT forces a sync packet to be written. This means that, with the existing code, a sync packet (PSB and PSBEND, 18 bytes in total) will be generated every time a PT event is scheduled in. To avoid these unnecessary syncs and save a WRMSR in the fast path, this patch changes the default behavior to not clear PacketByteCnt field, so that the sync packets will be generated with the period specified as "psb_period" attribute config field. This has little impact on the trace data as the other packets that are normally sent within PSB+ (between PSB and PSBEND) have their own generation scenarios which do not depend on the sync packets. One exception where we do need to force PSB like this when tracing starts, so that the decoder has a clear sync point in the trace. For this purpose we aready have hw::itrace_started flag, which we are currently using to output PERF_RECORD_ITRACE_START. This patch moves setting itrace_started from perf core to the pmu::start, where it should still be 0 on the very first run. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1438264104-16189-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a9796c8ff7e0..bdea12924b11 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6139,8 +6139,6 @@ static void perf_log_itrace_start(struct perf_event *event) event->hw.itrace_started) return; - event->hw.itrace_started = 1; - rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); -- cgit v1.2.3 From af859beaaba4d57883b08f4acbcb3974bc1f975e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Jul 2015 15:13:40 -0700 Subject: rcu: Silence lockdep false positive for expedited grace periods In a CONFIG_PREEMPT=y kernel, synchronize_rcu_expedited() acquires the ->exp_funnel_mutex in rcu_preempt_state, then invokes synchronize_sched_expedited, which acquires the ->exp_funnel_mutex in rcu_sched_state. There can be no deadlock because rcu_preempt_state ->exp_funnel_mutex acquisition always precedes that of rcu_sched_state. But lockdep does not know that, so it gives false-positive splats. This commit therefore associates a separate lock_class_key structure with the rcu_sched_state structure's ->exp_funnel_mutex, allowing lockdep to see the lock ordering, avoiding the false positives. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 ++++++++++-- kernel/rcu/tree.h | 8 ++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3af0dee2d045..439112e9d1b3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -71,6 +71,7 @@ MODULE_ALIAS("rcutree"); static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS]; /* * In order to export the rcu_state name to the tracing tools, it @@ -4049,6 +4050,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; + static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4108,8 +4110,14 @@ static void __init rcu_init_one(struct rcu_state *rsp, INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + if (rsp == &rcu_sched_state) + lockdep_set_class_and_name( + &rnp->exp_funnel_mutex, + &rcu_exp_sched_class[i], exp_sched[i]); + else + lockdep_set_class_and_name( + &rnp->exp_funnel_mutex, + &rcu_exp_class[i], exp[i]); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 80d974df0ea0..0412030ca882 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,6 +70,8 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -79,6 +81,8 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -89,6 +93,8 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -100,6 +106,8 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -- cgit v1.2.3 From 12d560f4ea87030667438a169912380be00cea4b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Jul 2015 18:35:23 -0700 Subject: rcu,locking: Privatize smp_mb__after_unlock_lock() RCU is the only thing that uses smp_mb__after_unlock_lock(), and is likely the only thing that ever will use it, so this commit makes this macro private to RCU. Signed-off-by: Paul E. McKenney Cc: Will Deacon Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: "linux-arch@vger.kernel.org" --- kernel/rcu/tree.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0412030ca882..2e991f8361e4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -653,3 +653,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ } #endif /* #ifdef CONFIG_RCU_TRACE */ + +/* + * Place this after a lock-acquisition primitive to guarantee that + * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies + * if the UNLOCK and LOCK are executed by the same CPU or if the + * UNLOCK and LOCK operate on the same lock variable. + */ +#ifdef CONFIG_PPC +#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ +#else /* #ifdef CONFIG_PPC */ +#define smp_mb__after_unlock_lock() do { } while (0) +#endif /* #else #ifdef CONFIG_PPC */ -- cgit v1.2.3 From 89af7ba5740b5937d10c8de93e79e71e5a933041 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 5 Aug 2015 00:52:46 -0700 Subject: cpu-hotplug: convert cpu_hotplug_disabled to a counter As a prerequisite to exporting cpu_hotplug_enable/cpu_hotplug_disable functions to modules we need to convert cpu_hotplug_disabled to a counter to properly support disable -> disable -> enable call sequences. E.g. after Hyper-V vmbus module (which is supposed to be the first user of exported cpu_hotplug_enable/cpu_hotplug_disable) did cpu_hotplug_disable() hibernate path calls disable_nonboot_cpus() and if we hit an error in _cpu_down() enable_nonboot_cpus() will be called on the failure path (thus making cpu_hotplug_disabled = 0 and leaving cpu hotplug in 'enabled' state). Same problem is possible if more than 1 module use cpu_hotplug_disable/cpu_hotplug_enable on their load/unload paths. When one of these modules is been unloaded it is logical to leave cpu hotplug in 'disabled' state. To support the change we need to increse cpu_hotplug_disabled counter in disable_nonboot_cpus() unconditionally as all users of disable_nonboot_cpus() are supposed to do enable_nonboot_cpus() in case an error was returned. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Thomas Gleixner Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- kernel/cpu.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5644ec5582b9..0fca2ba96138 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -191,14 +191,14 @@ void cpu_hotplug_done(void) void cpu_hotplug_disable(void) { cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; + cpu_hotplug_disabled++; cpu_maps_update_done(); } void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; + WARN_ON(--cpu_hotplug_disabled < 0); cpu_maps_update_done(); } @@ -608,13 +608,18 @@ int disable_nonboot_cpus(void) } } - if (!error) { + if (!error) BUG_ON(num_online_cpus() > 1); - /* Make sure the CPUs won't be enabled by someone else */ - cpu_hotplug_disabled = 1; - } else { + else pr_err("Non-boot CPUs are not disabled\n"); - } + + /* + * Make sure the CPUs won't be enabled by someone else. We need to do + * this even in case of failure as all disable_nonboot_cpus() users are + * supposed to do enable_nonboot_cpus() on the failure path. + */ + cpu_hotplug_disabled++; + cpu_maps_update_done(); return error; } @@ -633,7 +638,7 @@ void __ref enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; + WARN_ON(--cpu_hotplug_disabled < 0); if (cpumask_empty(frozen_cpus)) goto out; -- cgit v1.2.3 From 32145c4677d2c46b9d877a33ae82c6fcacd002f9 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 5 Aug 2015 00:52:47 -0700 Subject: cpu-hotplug: export cpu_hotplug_enable/cpu_hotplug_disable Hyper-V module needs to disable cpu hotplug (offlining) as there is no support from hypervisor side to reassign already opened event channels to a different CPU. Currently it is been done by altering smp_ops.cpu_disable but it is hackish. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Thomas Gleixner Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- kernel/cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 0fca2ba96138..718ea76b78ba 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -194,6 +194,7 @@ void cpu_hotplug_disable(void) cpu_hotplug_disabled++; cpu_maps_update_done(); } +EXPORT_SYMBOL_GPL(cpu_hotplug_disable); void cpu_hotplug_enable(void) { @@ -201,7 +202,7 @@ void cpu_hotplug_enable(void) WARN_ON(--cpu_hotplug_disabled < 0); cpu_maps_update_done(); } - +EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ -- cgit v1.2.3 From 71cf5aeeb8e2154efda5f40be50c925f15057755 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 19 Jul 2015 20:06:22 +0200 Subject: kernel, cpu: Remove bogus __ref annotations cpu_chain lost its __cpuinitdata annotation long ago in commit 5c113fbeed7a ("fix cpu_chain section mismatch..."). This and the global __cpuinit annotation drop in v3.11 vanished the need to mark all users, including transitive ones, with the __ref annotation. Just get rid of it to not wrongly hide section mismatches. Signed-off-by: Mathias Krause Cc: Paul Gortmaker Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/cpu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5644ec5582b9..a5f2165b2dc4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -205,7 +205,7 @@ void cpu_hotplug_enable(void) #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ -int __ref register_cpu_notifier(struct notifier_block *nb) +int register_cpu_notifier(struct notifier_block *nb) { int ret; cpu_maps_update_begin(); @@ -214,7 +214,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } -int __ref __register_cpu_notifier(struct notifier_block *nb) +int __register_cpu_notifier(struct notifier_block *nb) { return raw_notifier_chain_register(&cpu_chain, nb); } @@ -244,7 +244,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); -void __ref unregister_cpu_notifier(struct notifier_block *nb) +void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); raw_notifier_chain_unregister(&cpu_chain, nb); @@ -252,7 +252,7 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); -void __ref __unregister_cpu_notifier(struct notifier_block *nb) +void __unregister_cpu_notifier(struct notifier_block *nb) { raw_notifier_chain_unregister(&cpu_chain, nb); } @@ -329,7 +329,7 @@ struct take_cpu_down_param { }; /* Take this CPU down. */ -static int __ref take_cpu_down(void *_param) +static int take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; int err; @@ -348,7 +348,7 @@ static int __ref take_cpu_down(void *_param) } /* Requires cpu_add_remove_lock to be held */ -static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +static int _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; void *hcpu = (void *)(long)cpu; @@ -442,7 +442,7 @@ out_release: return err; } -int __ref cpu_down(unsigned int cpu) +int cpu_down(unsigned int cpu) { int err; @@ -627,7 +627,7 @@ void __weak arch_enable_nonboot_cpus_end(void) { } -void __ref enable_nonboot_cpus(void) +void enable_nonboot_cpus(void) { int cpu, error; -- cgit v1.2.3 From 04a22fae4cbc1f7d3f7471e9b36359f98bd3f043 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:50 +0000 Subject: tracing, perf: Implement BPF programs attached to uprobes By copying BPF related operation to uprobe processing path, this patch allow users attach BPF programs to uprobes like what they are already doing on kprobes. After this patch, users are allowed to use PERF_EVENT_IOC_SET_BPF on a uprobe perf event. Which make it possible to profile user space programs and kernel events together using BPF. Because of this patch, CONFIG_BPF_EVENTS should be selected by CONFIG_UPROBE_EVENT to ensure trace_call_bpf() is compiled even if KPROBE_EVENT is not set. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 4 ++-- kernel/trace/Kconfig | 2 +- kernel/trace/trace_uprobe.c | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index bdea12924b11..77f9e5d0e2d1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6846,8 +6846,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) - /* bpf programs can only be attached to kprobes */ + if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) + /* bpf programs can only be attached to u/kprobes */ return -EINVAL; prog = bpf_prog_get(prog_fd); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3b9a48ae153a..1153c43428f3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT + depends on KPROBE_EVENT || UPROBE_EVENT bool default y help diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index aa1ea7b36fa8..f97479f1ce35 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1095,11 +1095,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; + struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; @@ -1289,6 +1293,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } + call->flags = TRACE_EVENT_FL_UPROBE; call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); -- cgit v1.2.3 From ecbebcb868bed598354d3b6d94cfeb10e440e4ca Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 16 Jul 2015 16:56:35 +0530 Subject: kernel: broadcast-hrtimer: Migrate to new 'set-state' interface Migrate broadcast-hrtimer driver to the new 'set-state' interface provided by clockevents core, the earlier 'set-mode' interface is marked obsolete now. Cc: Thomas Gleixner Signed-off-by: Viresh Kumar Signed-off-by: Daniel Lezcano --- kernel/time/tick-broadcast-hrtimer.c | 49 +++++++++++++++--------------------- 1 file changed, 20 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 3e7db49a2381..53d7184da0be 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -18,30 +18,23 @@ static struct hrtimer bctimer; -static void bc_set_mode(enum clock_event_mode mode, - struct clock_event_device *bc) +static int bc_shutdown(struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - /* - * Note, we cannot cancel the timer here as we might - * run into the following live lock scenario: - * - * cpu 0 cpu1 - * lock(broadcast_lock); - * hrtimer_interrupt() - * bc_handler() - * tick_handle_oneshot_broadcast(); - * lock(broadcast_lock); - * hrtimer_cancel() - * wait_for_callback() - */ - hrtimer_try_to_cancel(&bctimer); - break; - default: - break; - } + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + return 0; } /* @@ -82,7 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) } static struct clock_event_device ce_broadcast_hrtimer = { - .set_mode = bc_set_mode, + .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME | @@ -102,13 +95,11 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - switch (ce_broadcast_hrtimer.mode) { - case CLOCK_EVT_MODE_ONESHOT: + if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) return HRTIMER_RESTART; - default: - return HRTIMER_NORESTART; - } + + return HRTIMER_NORESTART; } void tick_setup_hrtimer_broadcast(void) -- cgit v1.2.3 From c2ad6b51efc5f27d70ce952decd2a15679b83600 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 28 Jul 2015 09:00:04 +0300 Subject: perf/ring-buffer: Clarify the use of page::private for high-order AUX allocations A question [1] was raised about the use of page::private in AUX buffer allocations, so let's add a clarification about its intended use. The private field and flag are used by perf's rb_alloc_aux() path to tell the pmu driver the size of each high-order allocation, so that the driver can program those appropriately into its hardware. This only matters for PMUs that don't support hardware scatter tables. Otherwise, every page in the buffer is just a page. This patch adds a comment about the private field to the AUX buffer allocation path. [1] http://marc.info/?l=linux-kernel&m=143803696607968 Reported-by: Mathieu Poirier Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438063204-665-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c8aa3f75bc4d..182bc30899d5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -437,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order) if (page && order) { /* - * Communicate the allocation size to the driver + * Communicate the allocation size to the driver: + * if we managed to secure a high-order allocation, + * set its first page's private to this order; + * !PagePrivate(page) means it's just a normal page. */ split_page(page, order); SetPagePrivate(page); -- cgit v1.2.3 From e237882b8f83dd1a0eece1608bcb689d4f4b221b Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 10 Aug 2015 20:20:48 -0500 Subject: sched/numa: Fix NUMA_DIRECT topology identification Systems which have all nodes at a distance of at most 1 hop should be identified as 'NUMA_DIRECT'. However, the scheduler incorrectly identifies it as 'NUMA_BACKPLANE'. This is because 'n' is assigned to sched_max_numa_distance but the code (mis)interprets it to mean 'number of hops'. Rik had actually used sched_domains_numa_levels for detecting a 'NUMA_DIRECT' topology: http://marc.info/?l=linux-kernel&m=141279712429834&w=2 But that was changed when he removed the hops table in the subsequent version: http://marc.info/?l=linux-kernel&m=141353106106771&w=2 Fixing the issue here. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439256048-3748-1-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b11f6240709b..ea6d74345e60 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6444,8 +6444,10 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (n <= 1) + if (sched_domains_numa_levels <= 1) { sched_numa_topology_type = NUMA_DIRECT; + return; + } for_each_online_node(a) { for_each_online_node(b) { -- cgit v1.2.3 From 7855a35ac07a350e2cd26f09568a6d8e372be358 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 10 Aug 2015 18:02:55 +0900 Subject: sched: Ensure a task has a non-normalized vruntime when returning back to CFS Current code ensures that a task has a normalized vruntime when switching away from the fair class, but it does not ensure the task has a non-normalized vruntime when switching back to the fair class. This is an example breaking this consistency: 1. a task is in fair class and !queued 2. changes its class to RT class (still !queued) 3. changes its class to fair class again (still !queued) Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439197375-27927-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 858b94ab1bd2..f0950fde1f5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7930,16 +7930,31 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { -#ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity *se = &p->se; + +#ifdef CONFIG_FAIR_GROUP_SCHED /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!task_on_rq_queued(p)) + + if (!task_on_rq_queued(p)) { + + /* + * Ensure the task has a non-normalized vruntime when it is switched + * back to the fair class with !queued, so that enqueue_entity() at + * wake-up time will do the right thing. + * + * If it's queued, then the enqueue_entity(.flags=0) makes the task + * has non-normalized vruntime, if it's !queued, then it still has + * normalized vruntime. + */ + if (p->state != TASK_RUNNING) + se->vruntime += cfs_rq_of(se)->min_vruntime; return; + } /* * We were most likely switched from sched_rt, so -- cgit v1.2.3 From 25834c73f93af7f0712c98ca4593691592e6b360 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2015 17:43:34 +0200 Subject: sched: Fix a race between __kthread_bind() and sched_setaffinity() Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY without locks, a caller might observe an old value and race with the set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo it: __kthread_bind() do_set_cpus_allowed() sched_setaffinity() if (p->flags & PF_NO_SETAFFINITIY) set_cpus_allowed_ptr() p->flags |= PF_NO_SETAFFINITY Fix the bug by putting everything under the regular scheduler locks. This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org Signed-off-by: Ingo Molnar --- kernel/kthread.c | 20 +++++++++++++++++--- kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++++---- kernel/workqueue.c | 6 ++---- 3 files changed, 51 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 10e489c448fe..7c40a189becc 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) { - /* Must have done schedule() in kthread() before we set_task_cpu */ + unsigned long flags; + if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); + raw_spin_lock_irqsave(&p->pi_lock, flags); + do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +{ + __kthread_bind_mask(p, cpumask_of(cpu), state); +} + +void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) +{ + __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ea6d74345e60..2e3b983da836 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1153,6 +1153,8 @@ static int migration_cpu_stop(void *data) void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { + lockdep_assert_held(&p->pi_lock); + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); @@ -1169,7 +1171,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) { unsigned long flags; struct rq *rq; @@ -1178,6 +1181,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) rq = task_rq_lock(p, &flags); + /* + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ + if (check && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; @@ -1214,6 +1226,11 @@ out: return ret; } + +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + return __set_cpus_allowed_ptr(p, new_mask, false); +} EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); void set_task_cpu(struct task_struct *p, unsigned int new_cpu) @@ -1595,6 +1612,15 @@ static void update_avg(u64 *avg, u64 sample) s64 diff = sample - *avg; *avg += diff >> 3; } + +#else + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + return set_cpus_allowed_ptr(p, new_mask); +} + #endif /* CONFIG_SMP */ static void @@ -4340,7 +4366,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = set_cpus_allowed_ptr(p, new_mask); + retval = __set_cpus_allowed_ptr(p, new_mask, true); if (!retval) { cpuset_cpus_allowed(p, cpus_allowed); @@ -4865,7 +4891,8 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); __sched_fork(0, idle); idle->state = TASK_RUNNING; @@ -4891,7 +4918,8 @@ void init_idle(struct task_struct *idle, int cpu) #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4c4f06176f74..f5782d5fd196 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool) goto fail; set_user_nice(worker->task, pool->attrs->nice); - - /* prevent userland from meddling with cpumask of workqueue workers */ - worker->task->flags |= PF_NO_SETAFFINITY; + kthread_bind_mask(worker->task, pool->attrs->cpumask); /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); @@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } wq->rescuer = rescuer; - rescuer->task->flags |= PF_NO_SETAFFINITY; + kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); } -- cgit v1.2.3 From c5b2803840817115e9b568d5054e5007ae36176b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2015 17:43:35 +0200 Subject: sched: Make sched_class::set_cpus_allowed() unconditional Give every class a set_cpus_allowed() method, this enables some small optimization in the RT,DL implementation by avoiding a double cpumask_weight() call. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.614517487@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 17 +++++++++++------ kernel/sched/deadline.c | 20 ++++++++++++-------- kernel/sched/fair.c | 1 + kernel/sched/idle_task.c | 1 + kernel/sched/rt.c | 12 ++++++++---- kernel/sched/sched.h | 2 ++ kernel/sched/stop_task.c | 1 + 7 files changed, 36 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e3b983da836..740f90bdc67b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1151,17 +1151,22 @@ static int migration_cpu_stop(void *data) return 0; } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +/* + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. + */ +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { - lockdep_assert_held(&p->pi_lock); - - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - cpumask_copy(&p->cpus_allowed, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + lockdep_assert_held(&p->pi_lock); + p->sched_class->set_cpus_allowed(p, new_mask); +} + /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 20772eea67f2..dc357fa572b0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1696,13 +1696,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - /* - * Update only if the task is actually running (i.e., - * it is on the rq AND it is not throttled). - */ - if (!on_dl_rq(&p->dl)) - return; - weight = cpumask_weight(new_mask); /* @@ -1710,7 +1703,14 @@ static void set_cpus_allowed_dl(struct task_struct *p, * can migrate or not. */ if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; + goto done; + + /* + * Update only if the task is actually running (i.e., + * it is on the rq AND it is not throttled). + */ + if (!on_dl_rq(&p->dl)) + goto done; /* * The process used to be able to migrate OR it can now migrate @@ -1727,6 +1727,10 @@ static void set_cpus_allowed_dl(struct task_struct *p, } update_dl_migration(&rq->dl); + +done: + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = weight; } /* Assumes rq->lock is held */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f0950fde1f5b..6e2e3483b1ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8252,6 +8252,7 @@ const struct sched_class fair_sched_class = { .task_waking = task_waking_fair, .task_dead = task_dead_fair, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_fair, diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c65dac8c97cd..c4ae0f1fdf9b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 00816eeaa308..63692efeca82 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2084,9 +2084,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, BUG_ON(!rt_task(p)); - if (!task_on_rq_queued(p)) - return; - weight = cpumask_weight(new_mask); /* @@ -2094,7 +2091,10 @@ static void set_cpus_allowed_rt(struct task_struct *p, * can migrate or not. */ if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; + goto done; + + if (!task_on_rq_queued(p)) + goto done; rq = task_rq(p); @@ -2113,6 +2113,10 @@ static void set_cpus_allowed_rt(struct task_struct *p, } update_rt_migration(&rq->rt); + +done: + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = weight; } /* Assumes rq->lock is held */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 22ccc5556c42..68cda117574c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1255,6 +1255,8 @@ extern void trigger_load_balance(struct rq *rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); + #else static inline void idle_enter_fair(struct rq *rq) { } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 79ffec45a6ac..cbc67da10954 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_stop, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_stop, -- cgit v1.2.3 From 6c37067e27867db172b988cc11b9ff921175dee5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2015 17:43:36 +0200 Subject: sched: Change the sched_class::set_cpus_allowed() calling context Change the calling context of sched_class::set_cpus_allowed() such that we can assume the task is inactive. This allows us to easily make changes that affect accounting done by enqueue/dequeue. This does in fact completely remove set_cpus_allowed_rt() and greatly reduces set_cpus_allowed_dl(). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.667516139@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 23 +++++++++++++++++++++++ kernel/sched/deadline.c | 39 ++------------------------------------- kernel/sched/rt.c | 45 +-------------------------------------------- 3 files changed, 26 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 740f90bdc67b..9917c962be99 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1163,8 +1163,31 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { + struct rq *rq = task_rq(p); + bool queued, running; + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + dequeue_task(rq, p, 0); + } + if (running) + put_prev_task(rq, p); + p->sched_class->set_cpus_allowed(p, new_mask); + + if (running) + p->sched_class->set_curr_task(rq); + if (queued) + enqueue_task(rq, p, 0); } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index dc357fa572b0..b4730565a45d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1668,9 +1668,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { - struct rq *rq; struct root_domain *src_rd; - int weight; + struct rq *rq; BUG_ON(!dl_task(p)); @@ -1696,41 +1695,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. - */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - goto done; - - /* - * Update only if the task is actually running (i.e., - * it is on the rq AND it is not throttled). - */ - if (!on_dl_rq(&p->dl)) - goto done; - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_dl_task(rq, p); - BUG_ON(!rq->dl.dl_nr_migratory); - rq->dl.dl_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_dl_task(rq, p); - rq->dl.dl_nr_migratory++; - } - - update_dl_migration(&rq->dl); - -done: - cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = weight; + set_cpus_allowed_common(p, new_mask); } /* Assumes rq->lock is held */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 63692efeca82..d2ea59364a1c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2076,49 +2076,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) push_rt_tasks(rq); } -static void set_cpus_allowed_rt(struct task_struct *p, - const struct cpumask *new_mask) -{ - struct rq *rq; - int weight; - - BUG_ON(!rt_task(p)); - - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. - */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - goto done; - - if (!task_on_rq_queued(p)) - goto done; - - rq = task_rq(p); - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_task(rq, p); - BUG_ON(!rq->rt.rt_nr_migratory); - rq->rt.rt_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_task(rq, p); - rq->rt.rt_nr_migratory++; - } - - update_rt_migration(&rq->rt); - -done: - cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = weight; -} - /* Assumes rq->lock is held */ static void rq_online_rt(struct rq *rq) { @@ -2327,7 +2284,7 @@ const struct sched_class rt_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, - .set_cpus_allowed = set_cpus_allowed_rt, + .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .task_woken = task_woken_rt, -- cgit v1.2.3 From 4ffa08ed4cc4c5d47d197d749aae6f79af91eb73 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Wed, 5 Aug 2015 15:56:18 +0200 Subject: sched/deadline: Fix comment in push_dl_tasks() The comment is "misleading"; fix it by adapting a comment from push_rt_tasks(). Signed-off-by: Andrea Parri Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438782979-9057-1-git-send-email-parri.andrea@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b4730565a45d..82c0dd05d5d8 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1563,7 +1563,7 @@ out: static void push_dl_tasks(struct rq *rq) { - /* Terminates as it moves a -deadline task */ + /* push_dl_task() will return true if it moved a -deadline task */ while (push_dl_task(rq)) ; } -- cgit v1.2.3 From ff277d4250fe715b6666219b1a3423b863418794 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Wed, 5 Aug 2015 15:56:19 +0200 Subject: sched/deadline: Fix comment in enqueue_task_dl() The "dl_boosted" flag is set by comparing *absolute* deadlines (c.f., rt_mutex_setprio()). Signed-off-by: Andrea Parri Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438782979-9057-2-git-send-email-parri.andrea@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 82c0dd05d5d8..fc8f01083527 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -953,7 +953,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) /* * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (relative) deadline is + * task if we have one and its (absolute) deadline is * smaller than our one... OTW we keep our runtime and * deadline. */ -- cgit v1.2.3 From 38bf985b05625df3fbbc1dbf543bdd2da447c2af Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 27 May 2015 16:44:48 -0700 Subject: timer_list: Add the base offset so remaining nsecs are accurate for non monotonic timers I noticed for non-monotonic timers in timer_list, some of the output looked a little confusing. For example: #1: <0000000000000000>, posix_timer_fn, S:01, hrtimer_start_range_ns, leap-a-day/2360 # expires at 1434412800000000000-1434412800000000000 nsecs [in 1434410725062375469 to 1434410725062375469 nsecs] You'll note the relative time till the expiration "[in xxx to yyy nsecs]" is incorrect. This is because its printing the delta between CLOCK_MONOTONIC time to the CLOCK_REALTIME expiration. This patch fixes this issue by adding the clock offset to the "now" time which we use to calculate the delta. Cc: Prarit Bhargava Cc: Daniel Bristot de Oliveira Cc: Richard Cochran Cc: Jan Kara Cc: Jiri Bohac Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Shuah Khan Signed-off-by: John Stultz --- kernel/time/timer_list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a4536e1e3e2a..129c96033e46 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -137,7 +137,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) (unsigned long long) ktime_to_ns(base->offset)); #endif SEQ_printf(m, "active timers:\n"); - print_active_timers(m, base, now); + print_active_timers(m, base, now + ktime_to_ns(base->offset)); } static void print_cpu(struct seq_file *m, int cpu, u64 now) -- cgit v1.2.3 From de4a95faf173be1e798c37ca486dfcb234a0941b Mon Sep 17 00:00:00 2001 From: Karsten Blees Date: Thu, 25 Jun 2015 14:13:55 +0200 Subject: time: Fix nanosecond file time rounding in timespec_trunc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit timespec_trunc() avoids rounding if granularity <= nanoseconds-per-jiffie (or TICK_NSEC). This optimization assumes that: 1. current_kernel_time().tv_nsec is already rounded to TICK_NSEC (i.e. with HZ=1000 you'd get 1000000, 2000000, 3000000... but never 1000001). This is no longer true (probably since hrtimers introduced in 2.6.16). 2. TICK_NSEC is evenly divisible by all possible granularities. This may be true for HZ=100, 250, 1000, but obviously not for HZ=300 / TICK_NSEC=3333333 (introduced in 2.6.20). Thus, sub-second portions of in-core file times are not rounded to on-disk granularity. I.e. file times may change when the inode is re-read from disk or when the file system is remounted. This affects all file systems with file time granularities > 1 ns and < 1s, e.g. CEPH (1000 ns), UDF (1000 ns), CIFS (100 ns), NTFS (100 ns) and FUSE (configurable from user mode via struct fuse_init_out.time_gran). Steps to reproduce with e.g. UDF: $ dd if=/dev/zero of=udfdisk count=10000 && mkudffs udfdisk $ mkdir udf && mount udfdisk udf $ touch udf/test && stat -c %y udf/test 2015-06-09 10:22:56.130006767 +0200 $ umount udf && mount udfdisk udf $ stat -c %y udf/test 2015-06-09 10:22:56.130006000 +0200 Remounting truncates the mtime to 1 µs. Fix the rounding in timespec_trunc() and update the documentation. timespec_trunc() is exclusively used to calculate inode's [acm]time (mostly via current_fs_time()), and always with super_block.s_time_gran as second argument. So this can safely be changed without side effects. Note: This does _not_ fix the issue for FAT's 2 second mtime resolution, as super_block.s_time_gran isn't prepared to handle different ctime / mtime / atime resolutions nor resolutions > 1 second. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Karsten Blees Signed-off-by: John Stultz --- kernel/time/time.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 85d5bb1d67eb..34dbd4209e4a 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -287,26 +287,20 @@ EXPORT_SYMBOL(jiffies_to_usecs); * @t: Timespec * @gran: Granularity in ns. * - * Truncate a timespec to a granularity. gran must be smaller than a second. - * Always rounds down. - * - * This function should be only used for timestamps returned by - * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because - * it doesn't handle the better resolution of the latter. + * Truncate a timespec to a granularity. Always rounds down. gran must + * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). */ struct timespec timespec_trunc(struct timespec t, unsigned gran) { - /* - * Division is pretty slow so avoid it for common cases. - * Currently current_kernel_time() never returns better than - * jiffies resolution. Exploit that. - */ - if (gran <= jiffies_to_usecs(1) * 1000) { + /* Avoid division in the common cases 1 ns and 1 s. */ + if (gran == 1) { /* nothing */ - } else if (gran == 1000000000) { + } else if (gran == NSEC_PER_SEC) { t.tv_nsec = 0; - } else { + } else if (gran > 1 && gran < NSEC_PER_SEC) { t.tv_nsec -= t.tv_nsec % gran; + } else { + WARN(1, "illegal file time granularity: %u", gran); } return t; } -- cgit v1.2.3 From e1d7ba8735551ed79c7a0463a042353574b96da3 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Tue, 23 Jun 2015 18:38:54 +0800 Subject: time: Always make sure wall_to_monotonic isn't positive Two issues were found on an IMX6 development board without an enabled RTC device(resulting in the boot time and monotonic time being initialized to 0). Issue 1:exportfs -a generate: "exportfs: /opt/nfs/arm does not support NFS export" Issue 2:cat /proc/stat: "btime 4294967236" The same issues can be reproduced on x86 after running the following code: int main(void) { struct timeval val; int ret; val.tv_sec = 0; val.tv_usec = 0; ret = settimeofday(&val, NULL); return 0; } Two issues are different symptoms of same problem: The reason is a positive wall_to_monotonic pushes boot time back to the time before Epoch, and getboottime will return negative value. In symptom 1: negative boot time cause get_expiry() to overflow time_t when input expire time is 2147483647, then cache_flush() always clears entries just added in ip_map_parse. In symptom 2: show_stat() uses "unsigned long" to print negative btime value returned by getboottime. This patch fix the problem by prohibiting time from being set to a value which would cause a negative boot time. As a result one can't set the CLOCK_REALTIME time prior to (1970 + system uptime). Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Wang YanQing [jstultz: reworded commit message] Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bca3667a2de1..4cdb771913c8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -911,6 +911,7 @@ int do_settimeofday64(const struct timespec64 *ts) struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts_delta, xt; unsigned long flags; + int ret = 0; if (!timespec64_valid_strict(ts)) return -EINVAL; @@ -924,10 +925,15 @@ int do_settimeofday64(const struct timespec64 *ts) ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; + if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { + ret = -EINVAL; + goto out; + } + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); tk_set_xtime(tk, ts); - +out: timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); @@ -936,7 +942,7 @@ int do_settimeofday64(const struct timespec64 *ts) /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(do_settimeofday64); @@ -965,7 +971,8 @@ int timekeeping_inject_offset(struct timespec *ts) /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tk), ts64); - if (!timespec64_valid_strict(&tmp)) { + if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 || + !timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } -- cgit v1.2.3 From 7494e9eedee2121305a48af4fbbcedb69a2c2b93 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Sun, 26 Jul 2015 18:45:39 +0800 Subject: time: Add the common weak version of update_persistent_clock() The weak update_persistent_clock64() calls update_persistent_clock(), if the architecture defines an update_persistent_clock64() to replace and remove its update_persistent_clock() version, when building the kernel the linker will throw an undefined symbol error, that is, any arch that switches to update_persistent_clock64() will have this issue. To solve the issue, we add the common weak update_persistent_clock(). Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Xunlei Pang Signed-off-by: John Stultz --- kernel/time/ntp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index fb4d98c7fd43..df68cb875248 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -487,6 +487,11 @@ out: } #ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock(struct timespec now) +{ + return -ENODEV; +} + int __weak update_persistent_clock64(struct timespec64 now64) { struct timespec now; -- cgit v1.2.3 From 8758a240e2d74c5932ab51a73377e6507b7fd441 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 29 Jul 2015 20:09:43 +0800 Subject: time: Introduce current_kernel_time64() The current_kernel_time() is not year 2038 safe on 32bit systems since it returns a timespec value. Introduce current_kernel_time64() which returns a timespec64 value. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Baolin Wang Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4cdb771913c8..f6ee2e6b6f5d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1881,7 +1881,7 @@ struct timespec __current_kernel_time(void) return timespec64_to_timespec(tk_xtime(tk)); } -struct timespec current_kernel_time(void) +struct timespec64 current_kernel_time64(void) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now; @@ -1893,9 +1893,9 @@ struct timespec current_kernel_time(void) now = tk_xtime(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); - return timespec64_to_timespec(now); + return now; } -EXPORT_SYMBOL(current_kernel_time); +EXPORT_SYMBOL(current_kernel_time64); struct timespec64 get_monotonic_coarse64(void) { -- cgit v1.2.3 From 9ca308506062fc4a4ee8ca7ad2f71033c831c2fb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 29 Jul 2015 20:18:31 +0800 Subject: time: Introduce timespec64_to_jiffies()/jiffies_to_timespec64() The conversion between struct timespec and jiffies is not year 2038 safe on 32bit systems. Introduce timespec64_to_jiffies() and jiffies_to_timespec64() functions which use struct timespec64 to make it ready for 2038 issue. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Baolin Wang Signed-off-by: John Stultz --- kernel/time/time.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 34dbd4209e4a..f18ab105ed87 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -540,7 +540,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies); * value to a scaled second value. */ static unsigned long -__timespec_to_jiffies(unsigned long sec, long nsec) +__timespec64_to_jiffies(u64 sec, long nsec) { nsec = nsec + TICK_NSEC - 1; @@ -548,22 +548,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec) sec = MAX_SEC_IN_JIFFIES; nsec = 0; } - return (((u64)sec * SEC_CONVERSION) + + return ((sec * SEC_CONVERSION) + (((u64)nsec * NSEC_CONVERSION) >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } -unsigned long -timespec_to_jiffies(const struct timespec *value) +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) { - return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); + return __timespec64_to_jiffies((u64)sec, nsec); } -EXPORT_SYMBOL(timespec_to_jiffies); +unsigned long +timespec64_to_jiffies(const struct timespec64 *value) +{ + return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec); +} +EXPORT_SYMBOL(timespec64_to_jiffies); void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { /* * Convert jiffies to nanoseconds and separate with @@ -574,7 +579,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) NSEC_PER_SEC, &rem); value->tv_nsec = rem; } -EXPORT_SYMBOL(jiffies_to_timespec); +EXPORT_SYMBOL(jiffies_to_timespec64); /* * We could use a similar algorithm to timespec_to_jiffies (with a -- cgit v1.2.3 From 75e3b37d059856a972a5bf2bdfeac0f0f2db9ea3 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Tue, 11 Aug 2015 16:40:43 -0400 Subject: hrtimer: Drop return code of hrtimer_switch_to_hres() It's not checked by the caller. Signed-off-by: Luiz Capitulino Link: http://lkml.kernel.org/r/20150811164043.538241ef@redhat.com Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5c7ae4b641c4..55575d4f253c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -679,14 +679,13 @@ static void retrigger_next_event(void *arg) /* * Switch to high resolution mode */ -static int hrtimer_switch_to_hres(void) +static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { printk(KERN_WARNING "Could not switch to high resolution " "mode on CPU %d\n", base->cpu); - return 0; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; @@ -694,7 +693,6 @@ static int hrtimer_switch_to_hres(void) tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); - return 1; } static void clock_was_set_work(struct work_struct *work) @@ -718,7 +716,7 @@ void clock_was_set_delayed(void) static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline void hrtimer_switch_to_hres(void) { } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_reprogram(struct hrtimer *timer, -- cgit v1.2.3 From d0023a1448abdcc892b8bca631e74bb1888efd02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Aug 2015 10:18:48 -0700 Subject: timer: Write timer->flags atomically lock_timer_base() cannot prevent the following : CPU1 ( in __mod_timer() timer->flags |= TIMER_MIGRATING; spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); // The next line clears TIMER_MIGRATING timer->flags &= ~TIMER_BASEMASK; CPU2 (in lock_timer_base()) see timer base is cpu0 base spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; // oops, wrong base timer->flags |= base->cpu // too late We must write timer->flags in one go, otherwise we can fool other cpus. Fixes: bc7a34b8b9eb ("timer: Reduce timer migration overhead if disabled") Signed-off-by: Eric Dumazet Cc: Jon Christopherson Cc: David Miller Cc: xen-devel@lists.xen.org Cc: david.vrabel@citrix.com Cc: Sander Eikelenboom Link: http://lkml.kernel.org/r/1439831928.32680.11.camel@edumazet-glaptop2.roam.corp.google.com Signed-off-by: Thomas Gleixner Cc: Thomas Gleixner --- kernel/time/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 5e097fa9faf7..84190f02b521 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -807,8 +807,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer->flags &= ~TIMER_BASEMASK; - timer->flags |= base->cpu; + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | base->cpu); } } -- cgit v1.2.3 From 662b3e194656cc713d51d52780fb71f499c46619 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 18 Aug 2015 16:18:28 +0200 Subject: hrtimer: Simplify get_target_base() by returning current base Instead of fetching again the current cpu base, just take it from the parameter. Signed-off-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/1439907509-9553-2-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 55575d4f253c..f9eb21ba3af6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -183,7 +183,7 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { if (pinned || !base->migration_enabled) - return this_cpu_ptr(&hrtimer_bases); + return base; return &per_cpu(hrtimer_bases, get_nohz_timer_target()); } #else @@ -191,7 +191,7 @@ static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { - return this_cpu_ptr(&hrtimer_bases); + return base; } #endif -- cgit v1.2.3 From b48362d8aaf32aeb4a75f5c556c652ffeeb1be5d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 18 Aug 2015 16:18:29 +0200 Subject: hrtimer: Unconfuse switch_hrtimer_base() a bit The variable called "this_base" is confusing because its name suggests it's of "struct hrtimer_clock_base" type, along with "base" and "new_base" which doesn't help understanding this complicated function. Make its name clearer and fix the misleading comment while at it. [ tglx: Fixed the comment for real ] Signed-off-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/1439907509-9553-3-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f9eb21ba3af6..5c4fe50e47d3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -196,18 +196,27 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, #endif /* - * Switch the timer base to the current CPU when possible. + * We switch the timer base to a power-optimized selected CPU target, + * if: + * - NO_HZ_COMMON is enabled + * - timer migration is enabled + * - the timer callback is not running + * - the timer is not the first expiring timer on the new target + * + * If one of the above requirements is not fulfilled we move the timer + * to the current CPU or leave it on the previously assigned CPU if + * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { - struct hrtimer_cpu_base *new_cpu_base, *this_base; + struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; - this_base = this_cpu_ptr(&hrtimer_bases); - new_cpu_base = get_target_base(this_base, pinned); + this_cpu_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; @@ -229,19 +238,19 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_base && + if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); - new_cpu_base = this_base; + new_cpu_base = this_cpu_base; timer->base = base; goto again; } timer->base = new_base; } else { - if (new_cpu_base != this_base && + if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { - new_cpu_base = this_base; + new_cpu_base = this_cpu_base; goto again; } } -- cgit v1.2.3 From 6d4affea7d5aa5ca5ff4c3e5fbf3ee16801cc527 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 14 Aug 2015 15:20:25 +0300 Subject: genirq: Don't return ENOSYS in irq_chip_retrigger_hierarchy irq_chip_retrigger_hierarchy() returns -ENOSYS if it was not able to find at least one .irq_retrigger() callback implemented in the IRQ domain hierarchy. That's wrong, because check_irq_resend() expects a 0 return value from the callback in case that the hardware assisted resend was not possible. If the return value is non zero the core code assumes hardware resend success and the software resend is not invoked. This results in lost interrupts on platforms where none of the parent irq chips in the hierarchy implements the retrigger callback. This is observable on TI OMAP, where the hierarchy is: ARM GIC <- OMAP wakeupgen <- TI Crossbar Return 0 instead so the software resend mechanism gets invoked. [ tglx: Massaged changelog ] Fixes: 85f08c17de26 ('genirq: Introduce helper functions...') Signed-off-by: Grygorii Strashko Reviewed-by: Marc Zyngier Reviewed-by: Jiang Liu Cc: Sudeep Holla Cc: Cc: Cc: Cc: Cc: Cc: Cc: stable@vger.kernel.org # 4.1 Link: http://lkml.kernel.org/r/1439554830-19502-2-git-send-email-grygorii.strashko@ti.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 27f4332c7f84..6de638bccba7 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -997,7 +997,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) if (data->chip && data->chip->irq_retrigger) return data->chip->irq_retrigger(data); - return -ENOSYS; + return 0; } /** -- cgit v1.2.3 From b7560de198222994374c1340a389f12d5efb244a Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 14 Aug 2015 15:20:26 +0300 Subject: genirq: Introduce irq_chip_set_type_parent() helper This helper is required for irq chips which do not implement a irq_set_type callback and need to call down the irq domain hierarchy for the actual trigger type change. This helper is required to fix further wreckage caused by the conversion of TI OMAP to hierarchical irq domains and therefor tagged for stable. [ tglx: Massaged changelog ] Signed-off-by: Grygorii Strashko Cc: Sudeep Holla Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: stable@vger.kernel.org # 4.1 Link: http://lkml.kernel.org/r/1439554830-19502-3-git-send-email-grygorii.strashko@ti.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6de638bccba7..ae216824e8ca 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -984,6 +984,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data, return -ENOSYS; } +/** + * irq_chip_set_type_parent - Set IRQ type on the parent interrupt + * @data: Pointer to interrupt specific data + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h + * + * Conditional, as the underlying parent chip might not implement it. + */ +int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) +{ + data = data->parent_data; + + if (data->chip->irq_set_type) + return data->chip->irq_set_type(data, type); + + return -ENOSYS; +} + /** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware * @data: Pointer to interrupt specific data -- cgit v1.2.3 From 85e1cd6e769dfc84995270d0a4838021fcb8602d Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 22 Aug 2015 01:10:47 -0700 Subject: hrtimer: Handle failure of tick_init_highres() gracefully Commit 75e3b37d0598 ("hrtimer: Drop return code of hrtimer_switch_to_hres()") drops the return code of hrtimer_switch_to_hres(). While doing so, it also drops the return statement itself on failure. This may cause a system hang. Seen when running arm:multi_v7_defconfig in qemu with devicetree file vexpress-v2p-ca9. Fixes: 75e3b37d0598 ("hrtimer: Drop return code of hrtimer_switch_to_hres()") Cc: Luiz Capitulino Signed-off-by: Guenter Roeck Link: http://lkml.kernel.org/r/1440231047-16256-1-git-send-email-linux@roeck-us.net Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5c4fe50e47d3..457a373e2181 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -695,6 +695,7 @@ static void hrtimer_switch_to_hres(void) if (tick_init_highres()) { printk(KERN_WARNING "Could not switch to high resolution " "mode on CPU %d\n", base->cpu); + return; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; -- cgit v1.2.3 From dd9d3843755da95f63dd3a376f62b3e45c011210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Wed, 12 Aug 2015 21:35:56 +0200 Subject: sched: Fix cpu_active_mask/cpu_online_mask race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a race condition in SMP bootup code, which may result in WARNING: CPU: 0 PID: 1 at kernel/workqueue.c:4418 workqueue_cpu_up_callback() or kernel BUG at kernel/smpboot.c:135! It can be triggered with a bit of luck in Linux guests running on busy hosts. CPU0 CPUn ==== ==== _cpu_up() __cpu_up() start_secondary() set_cpu_online() cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); cpu_notify(CPU_ONLINE) cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); During the various CPU_ONLINE callbacks CPUn is online but not active. Several things can go wrong at that point, depending on the scheduling of tasks on CPU0. Variant 1: cpu_notify(CPU_ONLINE) workqueue_cpu_up_callback() rebind_workers() set_cpus_allowed_ptr() This call fails because it requires an active CPU; rebind_workers() ends with a warning: WARNING: CPU: 0 PID: 1 at kernel/workqueue.c:4418 workqueue_cpu_up_callback() Variant 2: cpu_notify(CPU_ONLINE) smpboot_thread_call() smpboot_unpark_threads() .. __kthread_unpark() __kthread_bind() wake_up_state() .. select_task_rq() select_fallback_rq() The ->wake_cpu of the unparked thread is not allowed, making a call to select_fallback_rq() necessary. Then, select_fallback_rq() cannot find an allowed, active CPU and promptly resets the allowed CPUs, so that the task in question ends up on CPU0. When those unparked tasks are eventually executed, they run immediately into a BUG: kernel BUG at kernel/smpboot.c:135! Just changing the order in which the online/active bits are set (and adding some memory barriers), would solve the two issues above. However, it would change the order of operations back to the one before commit 6acbfb96976f ("sched: Fix hotplug vs. set_cpus_allowed_ptr()"), thus, reintroducing that particular problem. Going further back into history, we have at least the following commits touching this topic: - commit 2baab4e90495 ("sched: Fix select_fallback_rq() vs cpu_active/cpu_online") - commit 5fbd036b552f ("sched: Cleanup cpu_active madness") Together, these give us the following non-working solutions: - secondary CPU sets active before online, because active is assumed to be a subset of online; - secondary CPU sets online before active, because the primary CPU assumes that an online CPU is also active; - secondary CPU sets online and waits for primary CPU to set active, because it might deadlock. Commit 875ebe940d77 ("powerpc/smp: Wait until secondaries are active & online") introduces an arch-specific solution to this arch-independent problem. Now, go for a more general solution without explicit waiting and simply set active twice: once on the secondary CPU after online was set and once on the primary CPU after online was seen. set_cpus_allowed_ptr()") Signed-off-by: Jan H. Schönherr Acked-by: Peter Zijlstra Cc: Cc: Anton Blanchard Cc: Borislav Petkov Cc: Joerg Roedel Cc: Linus Torvalds Cc: Matt Wilson Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 6acbfb96976f ("sched: Fix hotplug vs. set_cpus_allowed_ptr()") Link: http://lkml.kernel.org/r/1439408156-18840-1-git-send-email-jschoenh@amazon.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78b4bad10081..e9673433cc01 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5433,6 +5433,14 @@ static int sched_cpu_active(struct notifier_block *nfb, case CPU_STARTING: set_cpu_rq_start_time(); return NOTIFY_OK; + case CPU_ONLINE: + /* + * At this point a starting CPU has marked itself as online via + * set_cpu_online(). But it might not yet have marked itself + * as active, which is essential from here on. + * + * Thus, fall-through and help the starting CPU along. + */ case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; -- cgit v1.2.3 From a2fb3382edbea83c6f2bf6ac15e3673b2e254aad Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 26 Aug 2015 10:57:46 +0000 Subject: tracing/uprobes: Do not print '0x (null)' when offset is 0 When manually added uprobe point with zero address, 'uprobe_events' output '(null)' instead of 0x00000000: # echo p:probe_libc/abs_0 /path/to/lib.bin:0x0 arg1=%ax > \ /sys/kernel/debug/tracing/uprobe_events # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x (null) arg1=%ax This patch fixes this behavior: # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x0000000000000000 Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440586666-235233-8-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_uprobe.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f97479f1ce35..d2f6d0be3503 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + seq_printf(m, " %s:", tu->filename); + + /* Don't print "0x (null)" when offset is 0 */ + if (tu->offset) { + seq_printf(m, "0x%p", (void *)tu->offset); + } else { + switch (sizeof(void *)) { + case 4: + seq_printf(m, "0x00000000"); + break; + case 8: + default: + seq_printf(m, "0x0000000000000000"); + break; + } + } for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); -- cgit v1.2.3