From 6cc98f03adbf517986a90c43af0cbc9a732b8435 Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Fri, 8 Jul 2016 10:59:08 -0700
Subject: timer: Ensure timers are not running before migrating

This is needed to support migration of timers during cpu isolation. A
timer might be running on the CPU that we want to isolate, so we cannot
migrate the timers at that point. Add a spin-loop that waits for any
running timer to finish before migrating the timers.

Change-Id: I24d6e91b6dff468c640c2fe3a37a7f31b6f0c79a
Signed-off-by: Olav Haugan
---
 kernel/time/timer.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 51896272fcde..be750f6b2a68 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1635,7 +1635,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *he
 	}
 }
 
-static void migrate_timers(int cpu)
+static void migrate_timers(int cpu, bool wait)
 {
 	struct tvec_base *old_base;
 	struct tvec_base *new_base;
@@ -1651,7 +1651,18 @@ static void migrate_timers(int cpu)
 	spin_lock_irq(&new_base->lock);
 	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
-	BUG_ON(old_base->running_timer);
+	if (wait) {
+		/* Ensure timers are done running before continuing */
+		while (old_base->running_timer) {
+			spin_unlock(&old_base->lock);
+			spin_unlock_irq(&new_base->lock);
+			cpu_relax();
+			spin_lock_irq(&new_base->lock);
+			spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+		}
+	} else {
+		BUG_ON(old_base->running_timer);
+	}
 
 	for (i = 0; i < TVR_SIZE; i++)
 		migrate_timer_list(new_base, old_base->tv1.vec + i);
@@ -1676,7 +1687,7 @@ static int timer_cpu_notify(struct notifier_block *self,
 	switch (action) {
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		migrate_timers((long)hcpu);
+		migrate_timers((long)hcpu, false);
 		break;
 	default:
 		break;
--
cgit v1.2.3


From a7dffd7ffbe6aafe8da10f82a922c00e1c65acdc Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Wed, 25 Mar 2015 11:47:53 +0530
Subject: timer: create timer_quiesce_cpu() to isolate CPU from timers

To isolate CPUs (isolate from timers) from sysfs using cpusets, we need
some support from the timer core, i.e. a routine timer_quiesce_cpu()
which would migrate away all the unpinned timers but shouldn't touch
the pinned ones. This patch creates this routine.

Change-Id: I8624e0659b86b7b8fa425a3fafdb0784fe005124
Signed-off-by: Viresh Kumar
[forward port to 3.18]
Signed-off-by: Santosh Shukla
[ohaugan@codeaurora.org: Port to 4.4. Fixes for compilation error]
Git-commit: 313910b70ea0c73f8789d9189c11e1f339080646
Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git
Signed-off-by: Olav Haugan
---
 kernel/time/timer.c | 56 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index be750f6b2a68..067174a4dde3 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1620,44 +1620,49 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 }
 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
-#ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
+#if defined(CONFIG_HOTPLUG_CPU)
+static void migrate_timer_list(struct tvec_base *new_base,
+			       struct hlist_head *head, bool remove_pinned)
 {
 	struct timer_list *timer;
 	int cpu = new_base->cpu;
+	struct hlist_node *n;
+	int is_pinned;
 
-	while (!hlist_empty(head)) {
-		timer = hlist_entry(head->first, struct timer_list, entry);
-		/* We ignore the accounting on the dying cpu */
-		detach_timer(timer, false);
+	hlist_for_each_entry_safe(timer, n, head, entry) {
+		is_pinned = timer->flags & TIMER_PINNED_ON_CPU;
+		if (!remove_pinned && is_pinned)
+			continue;
+
+		detach_if_pending(timer, get_timer_base(timer->flags), false);
 		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 		internal_add_timer(new_base, timer);
 	}
 }
 
-static void migrate_timers(int cpu, bool wait)
+static void __migrate_timers(int cpu, bool wait, bool remove_pinned)
 {
 	struct tvec_base *old_base;
 	struct tvec_base *new_base;
+	unsigned long flags;
 	int i;
 
-	BUG_ON(cpu_online(cpu));
 	old_base = per_cpu_ptr(&tvec_bases, cpu);
 	new_base = get_cpu_ptr(&tvec_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
 	 */
-	spin_lock_irq(&new_base->lock);
+	spin_lock_irqsave(&new_base->lock, flags);
 	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	if (wait) {
 		/* Ensure timers are done running before continuing */
 		while (old_base->running_timer) {
 			spin_unlock(&old_base->lock);
-			spin_unlock_irq(&new_base->lock);
+			spin_unlock_irqrestore(&new_base->lock, flags);
 			cpu_relax();
-			spin_lock_irq(&new_base->lock);
+			spin_lock_irqsave(&new_base->lock, flags);
 			spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 		}
 	} else {
@@ -1665,29 +1670,38 @@ static void migrate_timers(int cpu, bool wait)
 	}
 
 	for (i = 0; i < TVR_SIZE; i++)
-		migrate_timer_list(new_base, old_base->tv1.vec + i);
+		migrate_timer_list(new_base, old_base->tv1.vec + i,
+				   remove_pinned);
 	for (i = 0; i < TVN_SIZE; i++) {
-		migrate_timer_list(new_base, old_base->tv2.vec + i);
-		migrate_timer_list(new_base, old_base->tv3.vec + i);
-		migrate_timer_list(new_base, old_base->tv4.vec + i);
-		migrate_timer_list(new_base, old_base->tv5.vec + i);
+		migrate_timer_list(new_base, old_base->tv2.vec + i,
+				   remove_pinned);
+		migrate_timer_list(new_base, old_base->tv3.vec + i,
+				   remove_pinned);
+		migrate_timer_list(new_base, old_base->tv4.vec + i,
+				   remove_pinned);
+		migrate_timer_list(new_base, old_base->tv5.vec + i,
+				   remove_pinned);
 	}
 
-	old_base->active_timers = 0;
-	old_base->all_timers = 0;
-
 	spin_unlock(&old_base->lock);
-	spin_unlock_irq(&new_base->lock);
+	spin_unlock_irqrestore(&new_base->lock, flags);
 	put_cpu_ptr(&tvec_bases);
 }
 
+/* Migrate timers from 'cpu' to this_cpu */
+static void migrate_timers(int cpu)
+{
+	BUG_ON(cpu_online(cpu));
+	__migrate_timers(cpu, false, true);
+}
+
 static int timer_cpu_notify(struct notifier_block *self,
 			    unsigned long action, void *hcpu)
 {
 	switch (action) {
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		migrate_timers((long)hcpu, false);
+		migrate_timers((long)hcpu);
 		break;
 	default:
 		break;
--
cgit v1.2.3


From d0cea65e27b7541234d61b70017ec6cc3cfe3eec Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Wed, 25 Mar 2015 11:57:45 +0530
Subject: hrtimer: update timer->state with 'pinned' information

'Pinned' information would be required in migrate_hrtimers() now, as we
can migrate non-pinned timers away without a hotplug (i.e. with
cpuset.quiesce). And so we may need to identify pinned timers now, as
we can't migrate them.

This patch reuses the timer->state variable for this flag, as there are
enough free bits available in it, and there is no point in increasing
the size of the struct by adding another field.
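For reference, the flag layout this change relies on lives in
include/linux/hrtimer.h, which is outside the 'kernel' filter of this
log. A minimal sketch of what that hunk presumably adds is below; the
exact shift value is an assumption for illustration only:

	/* existing 4.4 state bits */
	#define HRTIMER_STATE_INACTIVE	0x00
	#define HRTIMER_STATE_ENQUEUED	0x01

	/* assumed additions (header hunk not shown in this log) */
	#define HRTIMER_PINNED_SHIFT	1
	#define HRTIMER_STATE_PINNED	(1 << HRTIMER_PINNED_SHIFT)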
Change-Id: If3b3770e547971809e789ea7c8033c48ec2aa92d
Signed-off-by: Viresh Kumar
[forward port to 3.18]
Signed-off-by: Santosh Shukla
[ohaugan@codeaurora.org: Port to 4.4]
Git-commit: 62feaf1ed0b64c04868d143d8bdb92d60dc3189b
Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git
Signed-off-by: Olav Haugan
---
 kernel/time/hrtimer.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fa909f9fd559..83c298cc0533 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -880,7 +880,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	timer->state = HRTIMER_STATE_ENQUEUED;
+	timer->state |= HRTIMER_STATE_ENQUEUED;
 
 	return timerqueue_add(&base->active, &timer->node);
 }
@@ -900,11 +900,9 @@ static void __remove_hrtimer(struct hrtimer *timer,
 			     u8 newstate, int reprogram)
 {
 	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
-	u8 state = timer->state;
 
-	timer->state = newstate;
-	if (!(state & HRTIMER_STATE_ENQUEUED))
-		return;
+	if (!(timer->state & HRTIMER_STATE_ENQUEUED))
+		goto out;
 
 	if (!timerqueue_del(&base->active, &timer->node))
 		cpu_base->active_bases &= ~(1 << base->index);
@@ -921,6 +919,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
 	if (reprogram && timer == cpu_base->next_timer)
 		hrtimer_force_reprogram(cpu_base, 1);
 #endif
+
+out:
+	/*
+	 * We need to preserve PINNED state here, otherwise we may end up
+	 * migrating pinned hrtimers as well.
+	 */
+	timer->state = newstate | (timer->state & HRTIMER_STATE_PINNED);
 }
 
 /*
@@ -1002,6 +1007,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 
 	timer_stats_hrtimer_set_start_info(timer);
 
+	/* Update pinned state */
+	timer->state &= ~HRTIMER_STATE_PINNED;
+	timer->state |= !!(mode & HRTIMER_MODE_PINNED) << HRTIMER_PINNED_SHIFT;
+
 	leftmost = enqueue_hrtimer(timer, new_base);
 	if (!leftmost)
 		goto unlock;
--
cgit v1.2.3


From f461d408acb522ec6db8ae34a49c2dbbd6092ab1 Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Wed, 25 Mar 2015 12:07:31 +0530
Subject: hrtimer: create hrtimer_quiesce_cpu() to isolate CPU from hrtimers

To isolate CPUs (isolate from hrtimers) from sysfs using cpusets, we
need some support from the hrtimer core, i.e. a routine
hrtimer_quiesce_cpu() which would migrate away all the unpinned
hrtimers but shouldn't touch the pinned ones. This patch creates this
routine.
Change-Id: I51259ea41e3bd5cdba50b718201a6840174a7224
Signed-off-by: Viresh Kumar
[forward port to 3.18]
Signed-off-by: Santosh Shukla
[ohaugan@codeaurora.org: Port to 4.4]
Git-commit: d4d50a0ddc35e58ee95137ba4d14e74fea8b682f
Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git
Signed-off-by: Olav Haugan
---
 kernel/time/hrtimer.c | 54 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 44 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 83c298cc0533..44ddf403fb01 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1623,13 +1623,17 @@ static void init_hrtimers_cpu(int cpu)
 	hrtimer_init_hres(cpu_base);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_CPUSETS)
 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-				 struct hrtimer_clock_base *new_base)
+				 struct hrtimer_clock_base *new_base,
+				 bool remove_pinned)
 {
 	struct hrtimer *timer;
 	struct timerqueue_node *node;
+	struct timerqueue_head pinned;
+	int is_pinned;
+
+	timerqueue_init_head(&pinned);
 
 	while ((node = timerqueue_getnext(&old_base->active))) {
 		timer = container_of(node, struct hrtimer, node);
@@ -1642,6 +1646,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * under us on another CPU
 		 */
 		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+
+		is_pinned = timer->state & HRTIMER_STATE_PINNED;
+		if (!remove_pinned && is_pinned) {
+			timerqueue_add(&pinned, &timer->node);
+			continue;
+		}
+
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -1653,17 +1664,23 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 */
 		enqueue_hrtimer(timer, new_base);
 	}
+
+	/* Re-queue pinned timers for non-hotplug usecase */
+	while ((node = timerqueue_getnext(&pinned))) {
+		timer = container_of(node, struct hrtimer, node);
+
+		timerqueue_del(&pinned, &timer->node);
+		enqueue_hrtimer(timer, old_base);
+	}
 }
 
-static void migrate_hrtimers(int scpu)
+static void __migrate_hrtimers(int scpu, bool remove_pinned)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
+	unsigned long flags;
 	int i;
 
-	BUG_ON(cpu_online(scpu));
-	tick_cancel_sched_timer(scpu);
-
-	local_irq_disable();
+	local_irq_save(flags);
 	old_base = &per_cpu(hrtimer_bases, scpu);
 	new_base = this_cpu_ptr(&hrtimer_bases);
 	/*
@@ -1675,7 +1692,7 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned)
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],
-				     &new_base->clock_base[i]);
+				     &new_base->clock_base[i], remove_pinned);
 	}
 
 	raw_spin_unlock(&old_base->lock);
@@ -1683,11 +1700,28 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned)
 	/* Check, if we got expired work to do */
 	__hrtimer_peek_ahead_timers();
 
-	local_irq_enable();
+	local_irq_restore(flags);
+}
+#endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void migrate_hrtimers(int scpu)
+{
+	BUG_ON(cpu_online(scpu));
+	tick_cancel_sched_timer(scpu);
+
+	__migrate_hrtimers(scpu, true);
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+#ifdef CONFIG_CPUSETS
+void hrtimer_quiesce_cpu(void *cpup)
+{
+	__migrate_hrtimers(*(int *)cpup, false);
+}
+#endif /* CONFIG_CPUSETS */
+
 static int hrtimer_cpu_notify(struct notifier_block *self,
 			      unsigned long action, void *hcpu)
 {
--
cgit v1.2.3


From a66156c83c6de8dd2e614d6ad37afe7673ca1473 Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Wed, 25 Mar 2015 12:57:46 +0530
Subject: hrtimer: make sure PINNED flag is cleared after removing hrtimer

Change-Id: Icc4d1c183e993b4b3c9b96ec9779c234e73ecab7
Signed-off-by: Viresh Kumar
[forward port to 3.18]
Signed-off-by: Santosh Shukla
Git-commit: d6c894e515b4cd93c3a08e7c60cce0aa5118c656
Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git
Signed-off-by: Olav Haugan
---
 kernel/time/hrtimer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 44ddf403fb01..ab304f854743 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -954,6 +954,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
 			state = HRTIMER_STATE_INACTIVE;
 
 		__remove_hrtimer(timer, base, state, reprogram);
+		timer->state &= ~HRTIMER_STATE_PINNED;
 		return 1;
 	}
 	return 0;
@@ -1009,7 +1010,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 
 	/* Update pinned state */
 	timer->state &= ~HRTIMER_STATE_PINNED;
-	timer->state |= !!(mode & HRTIMER_MODE_PINNED) << HRTIMER_PINNED_SHIFT;
+	timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT;
 
 	leftmost = enqueue_hrtimer(timer, new_base);
 	if (!leftmost)
 		goto unlock;
--
cgit v1.2.3


From 84e39dcb3bf4b0ea5ed9b1bcd1b1fb43d2a4bc63 Mon Sep 17 00:00:00 2001
From: "Gary S. Robertson"
Date: Wed, 10 Sep 2014 14:57:16 -0500
Subject: hrtimer.h: prevent pinned timer state from breaking inactive test

An hrtimer may be pinned to a CPU but inactive, so it is no longer
valid to test the hrtimer.state struct member as having no bits set
when inactive. Change the test function to mask out the
HRTIMER_STATE_PINNED bit when checking for the inactive state.

Change-Id: I632f37874ef79887ee1202a028ef734f392d6ed0
Signed-off-by: Gary S. Robertson
[ohaugan@codeaurora.org: Port to 4.4]
Git-commit: 902e4d4eb0d2158d2792166221a72a829caecf07
Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git
Signed-off-by: Olav Haugan
---
 kernel/time/hrtimer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ab304f854743..f3b89de9ca2a 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1186,8 +1186,8 @@ bool hrtimer_active(const struct hrtimer *timer)
 		cpu_base = READ_ONCE(timer->base->cpu_base);
 		seq = raw_read_seqcount_begin(&cpu_base->seq);
 
-		if (timer->state != HRTIMER_STATE_INACTIVE ||
-		    cpu_base->running == timer)
+		if (((timer->state & ~HRTIMER_STATE_PINNED) !=
+		      HRTIMER_STATE_INACTIVE) || cpu_base->running == timer)
 			return true;
 
 	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
--
cgit v1.2.3


From bba552f4fc0046325f29156720a04623a71821d5 Mon Sep 17 00:00:00 2001
From: Santosh Shukla
Date: Wed, 25 Mar 2015 16:09:32 +0530
Subject: timer: Add function to migrate timers

Add a function to migrate timers that will be used by a later patch
set.
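The prototypes a caller would need are presumably declared in the timer
headers, whose hunks fall outside the 'kernel' filter of this log; a
sketch based on the definitions below:

	/* defined in kernel/time/timer.c */
	extern void timer_quiesce_cpu(void *cpup);
	/* defined in kernel/time/hrtimer.c */
	extern void hrtimer_quiesce_cpu(void *cpup);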
Change-Id: I370e404001344e635a663822b07557abbe0f6f52
Signed-off-by: Santosh Shukla
[ohaugan@codeaurora.org: Updated commit text and fixed trivial merge conflict]
Git-commit: 3633b88d8fcb4273807574c27c328b6908a741e5
Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git
Signed-off-by: Olav Haugan
---
 kernel/time/timer.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 067174a4dde3..d38a67a49550 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1695,6 +1695,13 @@ static void migrate_timers(int cpu)
 	__migrate_timers(cpu, false, true);
 }
 
+#ifdef CONFIG_CPUSETS
+void timer_quiesce_cpu(void *cpup)
+{
+	__migrate_timers(*(int *)cpup, true, false);
+}
+#endif /* CONFIG_CPUSETS */
+
 static int timer_cpu_notify(struct notifier_block *self,
 			    unsigned long action, void *hcpu)
 {
--
cgit v1.2.3


From 922fed628c625b28a50f66267e4b9f99088e2aa4 Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Sun, 3 Jul 2016 15:02:08 -0700
Subject: timer: Do not require CPUSETS to be enabled for migration

Do not require CPUSETS to be enabled to allow migration of timers and
hrtimers.

Change-Id: Ib911a0d34c250c4df020bdb265b92d2b8df8db93
Signed-off-by: Olav Haugan
---
 kernel/time/hrtimer.c | 10 +++-------
 kernel/time/timer.c   |  2 --
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index f3b89de9ca2a..1b0117198a08 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1624,7 +1624,7 @@ static void init_hrtimers_cpu(int cpu)
 	hrtimer_init_hres(cpu_base);
 }
 
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_CPUSETS)
+#if defined(CONFIG_HOTPLUG_CPU)
 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 				 struct hrtimer_clock_base *new_base,
 				 bool remove_pinned)
@@ -1703,9 +1703,7 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned)
 	__hrtimer_peek_ahead_timers();
 	local_irq_restore(flags);
 }
-#endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */
 
-#ifdef CONFIG_HOTPLUG_CPU
 static void migrate_hrtimers(int scpu)
 {
 	BUG_ON(cpu_online(scpu));
@@ -1714,14 +1712,12 @@ static void migrate_hrtimers(int scpu)
 
 	__migrate_hrtimers(scpu, true);
 }
-#endif /* CONFIG_HOTPLUG_CPU */
-
-#ifdef CONFIG_CPUSETS
 void hrtimer_quiesce_cpu(void *cpup)
 {
 	__migrate_hrtimers(*(int *)cpup, false);
 }
-#endif /* CONFIG_CPUSETS */
+
+#endif /* CONFIG_HOTPLUG_CPU */
 
 static int hrtimer_cpu_notify(struct notifier_block *self,
 			      unsigned long action, void *hcpu)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d38a67a49550..0efb3916f5a4 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1695,12 +1695,10 @@ static void migrate_timers(int cpu)
 	__migrate_timers(cpu, false, true);
 }
 
-#ifdef CONFIG_CPUSETS
 void timer_quiesce_cpu(void *cpup)
 {
 	__migrate_timers(*(int *)cpup, true, false);
 }
-#endif /* CONFIG_CPUSETS */
 
 static int timer_cpu_notify(struct notifier_block *self,
 			    unsigned long action, void *hcpu)
--
cgit v1.2.3


From fc70615291343d031b148bb2676963419a7b672e Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Sun, 29 May 2016 19:56:37 -0700
Subject: cpumask: Add cpu isolation support

Add bitmask and corresponding supporting functions for cpu isolation.
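Only the kernel/cpu.c side is visible below; the matching
include/linux/cpumask.h accessor is not shown by this log's 'kernel'
filter. Later patches call cpu_isolated(), which presumably follows the
cpu_online()/cpu_active() pattern, roughly:

	extern const struct cpumask *const cpu_isolated_mask;

	/* assumed helper, analogous to cpu_online()/cpu_active() */
	#define cpu_isolated(cpu)	cpumask_test_cpu((cpu), cpu_isolated_mask)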
Change-Id: Ice1a9503666a2b720bdb324289ca55ceb33097cd
Signed-off-by: Olav Haugan
---
 kernel/cpu.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1cfd381642da..3c97f5b88a07 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -768,6 +768,10 @@ static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
 const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
 EXPORT_SYMBOL(cpu_active_mask);
 
+static DECLARE_BITMAP(cpu_isolated_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_isolated_mask = to_cpumask(cpu_isolated_bits);
+EXPORT_SYMBOL(cpu_isolated_mask);
+
 void set_cpu_possible(unsigned int cpu, bool possible)
 {
 	if (possible)
@@ -802,6 +806,14 @@ void set_cpu_active(unsigned int cpu, bool active)
 		cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
 }
 
+void set_cpu_isolated(unsigned int cpu, bool isolated)
+{
+	if (isolated)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_isolated_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_isolated_bits));
+}
+
 void init_cpu_present(const struct cpumask *src)
 {
 	cpumask_copy(to_cpumask(cpu_present_bits), src);
@@ -817,6 +829,11 @@ void init_cpu_online(const struct cpumask *src)
 	cpumask_copy(to_cpumask(cpu_online_bits), src);
 }
 
+void init_cpu_isolated(const struct cpumask *src)
+{
+	cpumask_copy(to_cpumask(cpu_isolated_bits), src);
+}
+
 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
 
 void idle_notifier_register(struct notifier_block *n)
--
cgit v1.2.3


From 3fe956359cd3f1cd174285b693b424f89123ff96 Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Thu, 18 Aug 2016 16:49:44 -0700
Subject: watchdog: Add support for cpu isolation

Open up the interface to allow an external subsystem to enable and
disable the hard lockup detector.

Change-Id: I88a728ee1d54aaa887fab52e5e40d1d4e4fc69ca
Signed-off-by: Olav Haugan
---
 kernel/watchdog.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 029da92fb712..7f21591c8ec5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -13,6 +13,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -95,6 +96,7 @@ static u64 __read_mostly sample_period;
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
+static DEFINE_PER_CPU(unsigned int, watchdog_en);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
 static DEFINE_PER_CPU(bool, soft_watchdog_warn);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
@@ -586,9 +588,17 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
 	sched_setscheduler(current, policy, &param);
 }
 
-static void watchdog_enable(unsigned int cpu)
+/* Must be called with hotplug lock (lock_device_hotplug()) held. */
+void watchdog_enable(unsigned int cpu)
 {
 	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+	unsigned int *enabled = raw_cpu_ptr(&watchdog_en);
+
+	lock_device_hotplug_assert();
+
+	if (*enabled)
+		return;
+	*enabled = 1;
 
 	/* kick off the timer for the hardlockup detector */
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -606,9 +616,17 @@ static void watchdog_enable(unsigned int cpu)
 	__touch_watchdog();
 }
 
-static void watchdog_disable(unsigned int cpu)
+/* Must be called with hotplug lock (lock_device_hotplug()) held. */
+void watchdog_disable(unsigned int cpu)
 {
 	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+	unsigned int *enabled = raw_cpu_ptr(&watchdog_en);
+
+	lock_device_hotplug_assert();
+
+	if (!*enabled)
+		return;
+	*enabled = 0;
 
 	watchdog_set_prio(SCHED_NORMAL, 0);
 	hrtimer_cancel(hrtimer);
--
cgit v1.2.3


From e33c24bfecde67d7d665bfcf90c7d4c2f231be79 Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Tue, 31 May 2016 14:34:46 -0700
Subject: sched: add cpu isolation support

This adds cpu isolation APIs to the scheduler to isolate and unisolate
CPUs. Isolating and unisolating a CPU can be used in place of hotplug.
Isolating and unisolating a CPU is faster than hotplug and can thus be
used to optimize the performance and power of multi-core CPUs.

Isolating works by migrating non-pinned IRQs and tasks to other CPUs
and marking the CPU as not available to the scheduler and load
balancer. Pinned tasks and IRQs are still allowed to run, but it is
expected that this would be minimal.

Unisolation works by just marking the CPU available for the scheduler
and load balancer.

Change-Id: I0bbddb56238c2958c5987877c5bfc3e79afa67cc
Signed-off-by: Olav Haugan
---
 kernel/sched/core.c  | 280 +++++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/fair.c  |  74 ++++++++++----
 kernel/sched/hmp.c   |   4 +-
 kernel/sched/rt.c    |  13 ++-
 kernel/sched/sched.h |   5 +-
 5 files changed, 333 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7474463b9835..cddb0073c5fb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1229,6 +1230,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	struct rq *rq;
 	unsigned int dest_cpu;
 	int ret = 0;
+	cpumask_t allowed_mask;
 
 	rq = task_rq_lock(p, &flags);
 
@@ -1244,16 +1246,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
-	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+	cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
+
+	dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask);
 	if (dest_cpu >= nr_cpu_ids) {
-		ret = -EINVAL;
-		goto out;
+		dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+		if (dest_cpu >= nr_cpu_ids) {
+			ret = -EINVAL;
+			goto out;
+		}
+		cpumask_copy(&allowed_mask, new_mask);
 	}
 
 	do_set_cpus_allowed(p, new_mask);
 
 	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
+	if (cpumask_test_cpu(task_cpu(p), &allowed_mask))
 		goto out;
 
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
@@ -1577,12 +1585,13 @@ EXPORT_SYMBOL_GPL(kick_process);
 /*
  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
-static int select_fallback_rq(int cpu, struct task_struct *p)
+static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
 {
 	int nid = cpu_to_node(cpu);
 	const struct cpumask *nodemask = NULL;
 	enum { cpuset, possible, fail } state = cpuset;
 	int dest_cpu;
+	int isolated_candidate = -1;
 
 	/*
 	 * If the node that the cpu is on has been offlined, cpu_to_node()
@@ -1598,6 +1607,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 				continue;
 			if (!cpu_active(dest_cpu))
 				continue;
+			if (cpu_isolated(dest_cpu))
+				continue;
 			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 				return dest_cpu;
 		}
@@ -1610,6 +1621,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 			continue;
 		if (!cpu_active(dest_cpu))
 			continue;
+		if (cpu_isolated(dest_cpu)) {
+			if (allow_iso)
+				isolated_candidate = dest_cpu;
+			continue;
+		}
+		goto out;
+	}
+
+	if (isolated_candidate != -1) {
+		dest_cpu = isolated_candidate;
 		goto out;
 	}
 
@@ -1655,6 +1676,8 @@ out:
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
+	bool allow_isolated = (p->flags & PF_KTHREAD);
+
 	lockdep_assert_held(&p->pi_lock);
 
 	if (p->nr_cpus_allowed > 1)
@@ -1671,8 +1694,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 	 * not worry about this generic constraint ]
	 */
 	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
-		     !cpu_online(cpu)))
-		cpu = select_fallback_rq(task_cpu(p), p);
+		     !cpu_online(cpu)) ||
+		     (cpu_isolated(cpu) && !allow_isolated))
+		cpu = select_fallback_rq(task_cpu(p), p, allow_isolated);
 
 	return cpu;
 }
@@ -2956,7 +2980,7 @@ void sched_exec(void)
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
-	if (likely(cpu_active(dest_cpu))) {
+	if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) {
 		struct migration_arg arg = { p, dest_cpu };
 
 		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -5414,18 +5438,22 @@ static struct task_struct fake_task = {
 };
 
 /*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
+ * Migrate all tasks (not pinned if pinned argument says so) from the rq,
+ * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we'er in stop_machine() and
 * there's no concurrency possible, we hold the required locks anyway
 * because of lock validation efforts.
 */
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks)
 {
 	struct rq *rq = dead_rq;
 	struct task_struct *next, *stop = rq->stop;
 	int dest_cpu;
+	unsigned int num_pinned_kthreads = 1; /* this thread */
+	cpumask_t avail_cpus;
+
+	cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
 
 	/*
 	 * Fudge the rq selection such that the below task selection loop
@@ -5447,10 +5475,12 @@ static void migrate_tasks(struct rq *dead_rq)
 
 	for (;;) {
 		/*
-		 * There's this thread running, bail when that's the only
-		 * remaining thread.
+		 * There's this thread running + pinned threads, bail when
+		 * those are the only remaining threads.
 		 */
-		if (rq->nr_running == 1)
+		if ((migrate_pinned_tasks && rq->nr_running == 1) ||
+		    (!migrate_pinned_tasks &&
+		    rq->nr_running == num_pinned_kthreads))
 			break;
 
 		/*
@@ -5461,6 +5491,13 @@ static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks)
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
+		if (!migrate_pinned_tasks && next->flags & PF_KTHREAD &&
+		    !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) {
+			lockdep_unpin_lock(&rq->lock);
+			num_pinned_kthreads += 1;
+			continue;
+		}
+
 		/*
 		 * Rules for changing task_struct::cpus_allowed are holding
 		 * both pi_lock and rq->lock, such that holding either
@@ -5486,7 +5523,7 @@ static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks)
 		}
 
 		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+		dest_cpu = select_fallback_rq(dead_rq->cpu, next, false);
 
 		rq = __migrate_task(rq, next, dest_cpu);
 		if (rq != dead_rq) {
@@ -5502,6 +5539,210 @@ static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks)
 	rq->stop = stop;
 }
 
+static void set_rq_online(struct rq *rq);
+static void set_rq_offline(struct rq *rq);
+
+int do_isolation_work_cpu_stop(void *data)
+{
+	unsigned long flags;
+	unsigned int cpu = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+
+	watchdog_disable(cpu);
+
+	irq_migrate_all_off_this_cpu();
+
+	sched_ttwu_pending();
+	/* Update our root-domain */
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	if (rq->rd) {
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_offline(rq);
+	}
+
+	migrate_tasks(rq, false);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	/*
+	 * We might have been in tickless state. Clear NOHZ flags to avoid
+	 * us being kicked for helping out with balancing
+	 */
+	nohz_balance_clear_nohz_mask(cpu);
+	return 0;
+}
+
+int do_unisolation_work_cpu_stop(void *data)
+{
+	watchdog_enable(smp_processor_id());
+	return 0;
+}
+
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd);
+
+static void sched_update_group_capacities(int cpu)
+{
+	struct sched_domain *sd;
+
+	mutex_lock(&sched_domains_mutex);
+	rcu_read_lock();
+
+	for_each_domain(cpu, sd) {
+		int balance_cpu = group_balance_cpu(sd->groups);
+
+		init_sched_groups_capacity(cpu, sd);
+		/*
+		 * Need to ensure this is also called with balancing
+		 * cpu.
+		 */
+		if (cpu != balance_cpu)
+			init_sched_groups_capacity(balance_cpu, sd);
+	}
+
+	rcu_read_unlock();
+	mutex_unlock(&sched_domains_mutex);
+}
+
+static unsigned int cpu_isolation_vote[NR_CPUS];
+
+int sched_isolate_count(const cpumask_t *mask, bool include_offline)
+{
+	cpumask_t count_mask = CPU_MASK_NONE;
+
+	if (include_offline) {
+		cpumask_complement(&count_mask, cpu_online_mask);
+		cpumask_or(&count_mask, &count_mask, cpu_isolated_mask);
+		cpumask_and(&count_mask, &count_mask, mask);
+	} else {
+		cpumask_and(&count_mask, mask, cpu_isolated_mask);
+	}
+
+	return cpumask_weight(&count_mask);
+}
+
+/*
+ * 1) CPU is isolated and cpu is offlined:
+ *	Unisolate the core.
+ * 2) CPU is not isolated and CPU is offlined:
+ *	No action taken.
+ * 3) CPU is offline and request to isolate
+ *	Request ignored.
+ * 4) CPU is offline and isolated:
+ *	Not a possible state.
+ * 5) CPU is online and request to isolate
+ *	Normal case: Isolate the CPU
+ * 6) CPU is not isolated and comes back online
+ *	Nothing to do
+ *
+ * Note: The client calling sched_isolate_cpu() is responsible for ONLY
+ * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
+ * Client is also responsible for unisolating when a core goes offline
+ * (after CPU is marked offline).
+ */
+int sched_isolate_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	cpumask_t avail_cpus;
+	int ret_code = 0;
+
+	lock_device_hotplug();
+
+	cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
+
+	/* We cannot isolate ALL cpus in the system */
+	if (cpumask_weight(&avail_cpus) == 1) {
+		ret_code = -EINVAL;
+		goto out;
+	}
+
+	if (!cpu_online(cpu)) {
+		ret_code = -EINVAL;
+		goto out;
+	}
+
+	if (++cpu_isolation_vote[cpu] > 1)
+		goto out;
+
+	set_cpu_isolated(cpu, true);
+	cpumask_clear_cpu(cpu, &avail_cpus);
+
+	/* Migrate timers */
+	smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1);
+	smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1);
+
+	migrate_sync_cpu(cpu, cpumask_first(&avail_cpus));
+	stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
+
+	clear_hmp_request(cpu);
+	calc_load_migrate(rq);
+	update_max_interval();
+	sched_update_group_capacities(cpu);
+
+out:
+	unlock_device_hotplug();
+	return ret_code;
+}
+
+/*
+ * Note: The client calling sched_isolate_cpu() is responsible for ONLY
+ * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
+ * Client is also responsible for unisolating when a core goes offline
+ * (after CPU is marked offline).
+ */
+int sched_unisolate_cpu_unlocked(int cpu)
+{
+	int ret_code = 0;
+	struct rq *rq = cpu_rq(cpu);
+
+	lock_device_hotplug_assert();
+
+	if (!cpu_isolation_vote[cpu]) {
+		ret_code = -EINVAL;
+		goto out;
+	}
+
+	if (--cpu_isolation_vote[cpu])
+		goto out;
+
+	if (cpu_online(cpu)) {
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		rq->age_stamp = sched_clock_cpu(cpu);
+		if (rq->rd) {
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+			set_rq_online(rq);
+		}
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	set_cpu_isolated(cpu, false);
+	update_max_interval();
+	sched_update_group_capacities(cpu);
+
+	if (cpu_online(cpu)) {
+		stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0);
+
+		/* Kick CPU to immediately do load balancing */
+		if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+			smp_send_reschedule(cpu);
+	}
+
+out:
+	return ret_code;
+}
+
+int sched_unisolate_cpu(int cpu)
+{
+	int ret_code;
+
+	lock_device_hotplug();
+	ret_code = sched_unisolate_cpu_unlocked(cpu);
+	unlock_device_hotplug();
+	return ret_code;
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5748,13 +5989,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		sched_ttwu_pending();
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
-		migrate_sync_cpu(cpu);
+		migrate_sync_cpu(cpu, smp_processor_id());
 
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
-		migrate_tasks(rq);
+		migrate_tasks(rq, true);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
@@ -6509,11 +6750,14 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 {
 	struct sched_group *sg = sd->groups;
+	cpumask_t avail_mask;
 
 	WARN_ON(!sg);
 
 	do {
-		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+		cpumask_andnot(&avail_mask, sched_group_cpus(sg),
+			       cpu_isolated_mask);
+		sg->group_weight = cpumask_weight(&avail_mask);
 		sg = sg->next;
 	} while (sg != sd->groups);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e893b0fcac6b..83da13b5f6b8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2941,6 +2941,8 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c,
 	struct cpumask search_cpus;
 
 	cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus);
+	cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask);
+
 	if (env->ignore_prev_cpu)
 		cpumask_clear_cpu(env->prev_cpu, &search_cpus);
 
@@ -3009,7 +3011,8 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
 	prev_cpu = env->prev_cpu;
 	if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) ||
-	    unlikely(!cpu_active(prev_cpu)))
+	    unlikely(!cpu_active(prev_cpu)) ||
+	    cpu_isolated(prev_cpu))
 		return false;
 
 	if (task->ravg.mark_start - task->last_cpu_selected_ts >=
@@ -7354,6 +7357,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 			struct sched_group_capacity *sgc;
 			struct rq *rq = cpu_rq(cpu);
 
+			if (cpumask_test_cpu(cpu, cpu_isolated_mask))
+				continue;
 			/*
 			 * build_sched_domains() -> init_sched_groups_capacity()
 			 * gets here before we've attached the domains to the
@@ -7381,7 +7386,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
 		group = child->groups;
 		do {
-			capacity += group->sgc->capacity;
+			cpumask_t *cpus = sched_group_cpus(group);
+
+			/* Revisit this later. This won't work for MT domain */
+			if (!cpu_isolated(cpumask_first(cpus)))
+				capacity += group->sgc->capacity;
 			group = group->next;
 		} while (group != child->groups);
 	}
@@ -7521,6 +7530,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 				     power_cost(i, 0),
 				     cpu_temp(i));
 
+		if (cpu_isolated(i))
+			continue;
+
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
 			load = target_load(i, load_idx);
@@ -7548,17 +7560,27 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			sgs->idle_cpus++;
 	}
 
-	/* Adjust by relative CPU capacity of the group */
-	sgs->group_capacity = group->sgc->capacity;
-	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+	/* Isolated CPU has no weight */
+	if (!group->group_weight) {
+		sgs->group_capacity = 0;
+		sgs->avg_load = 0;
+		sgs->group_no_capacity = 1;
+		sgs->group_type = group_other;
+		sgs->group_weight = group->group_weight;
+	} else {
+		/* Adjust by relative CPU capacity of the group */
+		sgs->group_capacity = group->sgc->capacity;
+		sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
+							sgs->group_capacity;
 
-	if (sgs->sum_nr_running)
-		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+		sgs->group_weight = group->group_weight;
 
-	sgs->group_weight = group->group_weight;
+		sgs->group_no_capacity = group_is_overloaded(env, sgs);
+		sgs->group_type = group_classify(group, sgs, env);
+	}
 
-	sgs->group_no_capacity = group_is_overloaded(env, sgs);
-	sgs->group_type = group_classify(group, sgs, env);
+	if (sgs->sum_nr_running)
+		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 }
 
 #ifdef CONFIG_SCHED_HMP
@@ -8601,6 +8623,9 @@ static int idle_balance(struct rq *this_rq)
 	int pulled_task = 0;
 	u64 curr_cost = 0;
 
+	if (cpu_isolated(this_cpu))
+		return 0;
+
 	idle_enter_fair(this_rq);
 
 	/*
@@ -8908,16 +8933,21 @@ static void nohz_balancer_kick(int type)
 	return;
 }
 
+void nohz_balance_clear_nohz_mask(int cpu)
+{
+	if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+		atomic_dec(&nohz.nr_cpus);
+	}
+}
+
 static inline void nohz_balance_exit_idle(int cpu)
 {
 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
 		/*
 		 * Completely isolated CPUs don't ever set, so we must test.
		 */
-		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
-			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-			atomic_dec(&nohz.nr_cpus);
-		}
+		nohz_balance_clear_nohz_mask(cpu);
 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 	}
 }
@@ -8974,7 +9004,7 @@ void nohz_balance_enter_idle(int cpu)
 	/*
 	 * If we're a completely isolated CPU, we don't play.
	 */
-	if (on_null_domain(cpu_rq(cpu)))
+	if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
 		return;
 
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
@@ -9003,7 +9033,13 @@ static DEFINE_SPINLOCK(balancing);
 */
 void update_max_interval(void)
 {
-	max_load_balance_interval = HZ*num_online_cpus()/10;
+	cpumask_t avail_mask;
+	unsigned int available_cpus;
+
+	cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
+	available_cpus = cpumask_weight(&avail_mask);
+
+	max_load_balance_interval = HZ*available_cpus/10;
 }
 
 /*
@@ -9342,8 +9378,10 @@ void trigger_load_balance(struct rq *rq)
 {
 	int type = NOHZ_KICK_ANY;
 
-	/* Don't need to rebalance while attached to NULL domain */
-	if (unlikely(on_null_domain(rq)))
+	/* Don't need to rebalance while attached to NULL domain or
+	 * cpu is isolated.
+	 */
+	if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
 		return;
 
 	if (time_after_eq(jiffies, rq->next_balance))
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 5002619961ce..a921498dbf09 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -2828,10 +2828,10 @@ void set_window_start(struct rq *rq)
 		rq->curr->ravg.mark_start = rq->window_start;
 }
 
-void migrate_sync_cpu(int cpu)
+void migrate_sync_cpu(int cpu, int new_cpu)
 {
 	if (cpu == sync_cpu)
-		sync_cpu = smp_processor_id();
+		sync_cpu = new_cpu;
 }
 
 static void reset_all_task_stats(void)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index cfec881491ef..ba4403e910d8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -265,8 +265,12 @@ static void pull_rt_task(struct rq *this_rq);
 
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
-	/* Try to pull RT tasks here if we lower this rq's prio */
-	return rq->rt.highest_prio.curr > prev->prio;
+	/*
+	 * Try to pull RT tasks here if we lower this rq's prio and cpu is not
+	 * isolated
+	 */
+	return rq->rt.highest_prio.curr > prev->prio &&
+	       !cpu_isolated(cpu_of(rq));
 }
 
 static inline int rt_overloaded(struct rq *rq)
@@ -1694,6 +1698,8 @@ static int find_lowest_rq_hmp(struct task_struct *task)
 
 	for_each_sched_cluster(cluster) {
 		cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
+		cpumask_andnot(&candidate_mask, &candidate_mask,
+			       cpu_isolated_mask);
 
 		if (cpumask_empty(&candidate_mask))
 			continue;
@@ -2282,7 +2288,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	 * we may need to handle the pulling of RT tasks
 	 * now.
	 */
-	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running ||
+	    cpu_isolated(cpu_of(rq)))
 		return;
 
 	queue_pull_task(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec7721112b05..41abb4dabeb7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1069,7 +1069,7 @@ extern void clear_boost_kick(int cpu);
 extern void clear_hmp_request(int cpu);
 extern void mark_task_starting(struct task_struct *p);
 extern void set_window_start(struct rq *rq);
-extern void migrate_sync_cpu(int cpu);
+extern void migrate_sync_cpu(int cpu, int new_cpu);
 extern void update_cluster_topology(void);
 extern void set_task_last_wake(struct task_struct *p, u64 wallclock);
 extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock);
@@ -1424,7 +1424,7 @@ static inline void clear_boost_kick(int cpu) { }
 static inline void clear_hmp_request(int cpu) { }
 static inline void mark_task_starting(struct task_struct *p) { }
 static inline void set_window_start(struct rq *rq) { }
-static inline void migrate_sync_cpu(int cpu) { }
+static inline void migrate_sync_cpu(int cpu, int new_cpu) {}
 static inline void update_cluster_topology(void) { }
 static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) { }
 static inline void set_task_last_switch_out(struct task_struct *p,
@@ -1953,6 +1953,7 @@ extern const struct sched_class idle_sched_class;
 
 extern void update_group_capacity(struct sched_domain *sd, int cpu);
 extern void trigger_load_balance(struct rq *rq);
+extern void nohz_balance_clear_nohz_mask(int cpu);
 
 extern void idle_enter_fair(struct rq *this_rq);
 extern void idle_exit_fair(struct rq *this_rq);
--
cgit v1.2.3


From 0a17b36a20d65df27ccf2de068ea517b19f6a53f Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Sun, 12 Jun 2016 13:57:05 -0700
Subject: sched/core: Add trace point for cpu isolation

Add a tracepoint to capture the cpu isolation event, including a KPI
for the time it took to isolate.

Change-Id: If2d30000f068afc50db953940f4636ef6a089b24
Signed-off-by: Olav Haugan
---
 kernel/sched/core.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cddb0073c5fb..7b7f1961fd10 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5645,6 +5645,10 @@ int sched_isolate_cpu(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	cpumask_t avail_cpus;
 	int ret_code = 0;
+	u64 start_time;
+
+	if (trace_sched_isolate_enabled())
+		start_time = sched_clock();
 
 	lock_device_hotplug();
 
@@ -5681,6 +5685,8 @@ int sched_isolate_cpu(int cpu)
 
 out:
 	unlock_device_hotplug();
+	trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
+			    start_time, 1);
 	return ret_code;
 }
 
@@ -5694,6 +5700,10 @@ int sched_unisolate_cpu_unlocked(int cpu)
 {
 	int ret_code = 0;
 	struct rq *rq = cpu_rq(cpu);
+	u64 start_time;
+
+	if (trace_sched_isolate_enabled())
+		start_time = sched_clock();
 
 	lock_device_hotplug_assert();
 
@@ -5730,6 +5740,8 @@ int sched_unisolate_cpu_unlocked(int cpu)
 	}
 
 out:
+	trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
+			    start_time, 0);
 	return ret_code;
 }
--
cgit v1.2.3


From 4400ef145f2b7aac21676b25942f32fb32ed724e Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Sun, 29 May 2016 19:35:54 -0700
Subject: irq: Make irq affinity function cpu isolation aware

Prohibit setting the affinity of an IRQ to an isolated core.
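The hunk below implements a two-level fallback when the IRQ's affinity
contains no usable CPU. Restated as a standalone sketch (hypothetical
helper, for illustration only):

	static const struct cpumask *pick_irq_target(const struct cpumask *affinity,
						     struct cpumask *scratch)
	{
		/* 1) Prefer the requested affinity minus isolated CPUs. */
		cpumask_andnot(scratch, affinity, cpu_isolated_mask);
		if (cpumask_any_and(scratch, cpu_online_mask) < nr_cpu_ids)
			return scratch;

		/* 2) Otherwise any online, non-isolated CPU. */
		cpumask_andnot(scratch, cpu_online_mask, cpu_isolated_mask);
		if (!cpumask_empty(scratch))
			return scratch;

		/* 3) Last resort: tolerate isolated CPUs to keep the IRQ. */
		return cpu_online_mask;
	}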
Change-Id: I7b50778615541a64f9956573757c7f28748c4f69
Signed-off-by: Olav Haugan
---
 kernel/irq/cpuhotplug.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 011f8c4c63da..104432f3d311 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "internals.h"
 
@@ -20,6 +21,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	const struct cpumask *affinity = d->common->affinity;
 	struct irq_chip *c;
 	bool ret = false;
+	struct cpumask available_cpus;
 
 	/*
 	 * If this is a per-CPU interrupt, or the affinity does not
@@ -29,8 +31,15 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	    !cpumask_test_cpu(smp_processor_id(), affinity))
 		return false;
 
+	cpumask_copy(&available_cpus, affinity);
+	cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask);
+	affinity = &available_cpus;
+
 	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
-		affinity = cpu_online_mask;
+		cpumask_andnot(&available_cpus, cpu_online_mask,
+			       cpu_isolated_mask);
+		if (cpumask_empty(affinity))
+			affinity = cpu_online_mask;
 		ret = true;
 	}
--
cgit v1.2.3


From bc24c063ef8b8c16432fd328403972ae3da12526 Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Sun, 29 May 2016 19:50:47 -0700
Subject: pmqos: Enable cpu isolation awareness

Set a long latency requirement for isolated cores to ensure the LPM
logic will select a deep sleep state.

Change-Id: I83e9fbb800df259616a145d311b50627dc42a5ff
Signed-off-by: Olav Haugan
---
 kernel/power/qos.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 8ecc7b3f7dd9..69c32c42080f 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -45,6 +45,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -447,6 +448,9 @@ EXPORT_SYMBOL_GPL(pm_qos_request);
 
 int pm_qos_request_for_cpu(int pm_qos_class, int cpu)
 {
+	if (cpu_isolated(cpu))
+		return INT_MAX;
+
 	return pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu];
 }
 EXPORT_SYMBOL(pm_qos_request_for_cpu);
@@ -469,6 +473,9 @@ int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask)
 	val = c->default_value;
 
 	for_each_cpu(cpu, mask) {
+		if (cpu_isolated(cpu))
+			continue;
+
 		switch (c->type) {
 		case PM_QOS_MIN:
 			if (c->target_per_cpu[cpu] < val)
--
cgit v1.2.3


From e38c1ce12351b6e8bfa8a4237d940449afcc7500 Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Sun, 29 May 2016 19:53:00 -0700
Subject: smp: Do not wake up all idle CPUs

Do not wake up cpus that are isolated.

Change-Id: I07702bb5b738c1c75c49a2ca4cb08be0231ccb12
Signed-off-by: Olav Haugan
---
 kernel/smp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index abdc48cd79a3..b2ec21c5c9d6 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -766,8 +766,8 @@ void wake_up_all_idle_cpus(void)
 	for_each_online_cpu(cpu) {
 		if (cpu == smp_processor_id())
 			continue;
-
-		wake_up_if_idle(cpu);
+		if (!cpu_isolated(cpu))
+			wake_up_if_idle(cpu);
 	}
 	preempt_enable();
 }
--
cgit v1.2.3


From 639c8ad52d59e7b9447b15c0e9a2ceaa533ad854 Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Sun, 29 May 2016 19:54:27 -0700
Subject: perf: Add cpu isolation awareness

Ensure perf events do not wake up idle cores when a core is isolated.
Change-Id: Ifefb2f1cf6c24af7bc46fc62797955b8c8ad5815 Signed-off-by: Olav Haugan --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d6ec580584b6..5beb88f11671 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3386,7 +3386,8 @@ static int perf_event_read(struct perf_event *event, bool group) * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { + if (event->state == PERF_EVENT_STATE_ACTIVE && + !cpu_isolated(event->oncpu)) { struct perf_read_data data = { .event = event, .group = group, -- cgit v1.2.3 From 7f2c52364339cdcd3f56375b8a883005cb88680e Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Thu, 18 Aug 2016 17:21:30 -0700 Subject: core_ctl_helper: Remove code since it is not used anymore Remove the core control helper code since this is not needed anymore with subsequent patches that moves core control into the kernel. Change-Id: I62acddeb707fc7d5626580166b3466e63f45fd89 Signed-off-by: Olav Haugan --- kernel/trace/power-traces.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9270e1ac6460..49fa2e6eea98 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,5 +15,3 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); -EXPORT_TRACEPOINT_SYMBOL(core_ctl_set_busy); -EXPORT_TRACEPOINT_SYMBOL(core_ctl_eval_need); -- cgit v1.2.3 From 59f16ae0345c902c1d09da75e0f89d7e7ddbc54f Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Thu, 18 Aug 2016 17:22:44 -0700 Subject: core_ctrl: Move core control into kernel Move core control from out-of-tree module into the kernel proper. Core control monitors load on CPUs and controls how many CPUs are available for the system to use at any point in time. This can help save power. Core control can be configured through sysfs interface. Change-Id: Ia78e701468ea3828195c2a15c9cf9fafd099804a Signed-off-by: Olav Haugan --- kernel/sched/Makefile | 1 + kernel/sched/core_ctl.c | 1014 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1015 insertions(+) create mode 100644 kernel/sched/core_ctl.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 1f159743ebfc..508b65690288 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,3 +20,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 000000000000..8f071757d516 --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1014 @@ +/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MAX_CPUS_PER_GROUP 4 + +struct cpu_data { + /* Per CPU data. */ + bool inited; + bool online; + bool rejected; + bool is_busy; + bool not_preferred; + unsigned int busy; + unsigned int cpu; + struct list_head sib; + unsigned int first_cpu; + + /* Per cluster data set only on first CPU */ + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_GROUP]; + unsigned int busy_down_thres[MAX_CPUS_PER_GROUP]; + unsigned int online_cpus; + unsigned int avail_cpus; + unsigned int num_cpus; + unsigned int need_cpus; + unsigned int task_thres; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool is_big_cluster; + int nrrun; + bool nrrun_changed; + struct timer_list timer; + struct task_struct *hotplug_thread; + struct kobject kobj; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cpu_data *f); +static void wake_up_hotplug_thread(struct cpu_data *state); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + wake_up_hotplug_thread(state); + + return count; +} + +static ssize_t show_min_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_hotplug_thread(state); + + return count; +} + +static ssize_t show_max_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static ssize_t show_task_thres(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(struct cpu_data *state, char *buf) +{ + int i, 
count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(struct cpu_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_is_big_cluster(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->is_big_cluster = val ? 1 : 0; + return count; +} + +static ssize_t show_is_big_cluster(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster); +} + +static ssize_t show_cpus(struct cpu_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry(c, &state->lru, sib) { + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u (%s)\n", c->cpu, + c->online ? "Online" : "Offline"); + } + spin_unlock_irqrestore(&state_lock, flags); + return count; +} + +static ssize_t show_need_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_online_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->online_cpus); +} + +static ssize_t show_global_state(struct cpu_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + c = &per_cpu(cpu_state, cpu); + if (!c->inited) + continue; + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU: %u\n", c->cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", c->online); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tRejected: %u\n", c->rejected); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", c->first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + if (c->cpu != c->first_cpu) + continue; + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", c->nrrun); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tAvail CPUs: %u\n", c->avail_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", c->need_cpus); + } + + return count; +} + +static ssize_t store_not_preferred(struct cpu_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i, first_cpu; + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != 
state->num_cpus)
+		return -EINVAL;
+
+	first_cpu = state->first_cpu;
+
+	for (i = 0; i < state->num_cpus; i++) {
+		c = &per_cpu(cpu_state, first_cpu);
+		/* A single value applies to every CPU in the group. */
+		c->not_preferred = (ret == 1) ? val[0] : val[i];
+		first_cpu++;
+	}
+
+	return count;
+}
+
+static ssize_t show_not_preferred(struct cpu_data *state, char *buf)
+{
+	struct cpu_data *c;
+	ssize_t count = 0;
+	unsigned int i, first_cpu;
+
+	first_cpu = state->first_cpu;
+
+	for (i = 0; i < state->num_cpus; i++) {
+		c = &per_cpu(cpu_state, first_cpu);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tCPU:%d %u\n", first_cpu, c->not_preferred);
+		first_cpu++;
+	}
+
+	return count;
+}
+
+struct core_ctl_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct cpu_data *, char *);
+	ssize_t (*store)(struct cpu_data *, const char *, size_t count);
+};
+
+#define core_ctl_attr_ro(_name)		\
+static struct core_ctl_attr _name =	\
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define core_ctl_attr_rw(_name)		\
+static struct core_ctl_attr _name =	\
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+core_ctl_attr_rw(min_cpus);
+core_ctl_attr_rw(max_cpus);
+core_ctl_attr_rw(offline_delay_ms);
+core_ctl_attr_rw(busy_up_thres);
+core_ctl_attr_rw(busy_down_thres);
+core_ctl_attr_rw(task_thres);
+core_ctl_attr_rw(is_big_cluster);
+core_ctl_attr_ro(cpus);
+core_ctl_attr_ro(need_cpus);
+core_ctl_attr_ro(online_cpus);
+core_ctl_attr_ro(global_state);
+core_ctl_attr_rw(not_preferred);
+
+static struct attribute *default_attrs[] = {
+	&min_cpus.attr,
+	&max_cpus.attr,
+	&offline_delay_ms.attr,
+	&busy_up_thres.attr,
+	&busy_down_thres.attr,
+	&task_thres.attr,
+	&is_big_cluster.attr,
+	&cpus.attr,
+	&need_cpus.attr,
+	&online_cpus.attr,
+	&global_state.attr,
+	&not_preferred.attr,
+	NULL
+};
+
+#define to_cpu_data(k) container_of(k, struct cpu_data, kobj)
+#define to_attr(a) container_of(a, struct core_ctl_attr, attr)
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct cpu_data *data = to_cpu_data(kobj);
+	struct core_ctl_attr *cattr = to_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (cattr->show)
+		ret = cattr->show(data, buf);
+
+	return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+		     const char *buf, size_t count)
+{
+	struct cpu_data *data = to_cpu_data(kobj);
+	struct core_ctl_attr *cattr = to_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (cattr->store)
+		ret = cattr->store(data, buf, count);
+
+	return ret;
+}
+
+static const struct sysfs_ops sysfs_ops = {
+	.show = show,
+	.store = store,
+};
+
+static struct kobj_type ktype_core_ctl = {
+	.sysfs_ops = &sysfs_ops,
+	.default_attrs = default_attrs,
+};
+
+/* ==================== runqueue based core count =================== */
+
+#define RQ_AVG_TOLERANCE 2
+#define RQ_AVG_DEFAULT_MS 20
+#define NR_RUNNING_TOLERANCE 5
+static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS;
+
+static s64 rq_avg_timestamp_ms;
+static struct timer_list rq_avg_timer;
+
+static void update_running_avg(bool trigger_update)
+{
+	int cpu;
+	struct cpu_data *pcpu;
+	int avg, iowait_avg, big_avg, old_nrrun;
+	s64 now;
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	now = ktime_to_ms(ktime_get());
+	if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) {
+		spin_unlock_irqrestore(&state_lock, flags);
+		return;
+	}
+	rq_avg_timestamp_ms = now;
+	sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg);
+
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	/*
+	 * Round up to the next integer if the average number of running
+	 * tasks is within NR_RUNNING_TOLERANCE/100 of the next integer.
+	 * If normal rounding up were used, even a transient task could
+	 * trigger an online event; by the time the core is onlined, the
+	 * task may already have finished.
+	 * Rounding to the closest integer suffers from the same problem
+	 * because the scheduler might only provide running stats once per
+	 * jiffy, and a transient task could skew the number for one jiffy.
+	 * If core control samples every 2 jiffies, it would observe an
+	 * additional 0.5 in the running average, which rounds up to 1 task.
+	 */
+	avg = (avg + NR_RUNNING_TOLERANCE) / 100;
+	big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100;
+
+	for_each_possible_cpu(cpu) {
+		pcpu = &per_cpu(cpu_state, cpu);
+		if (!pcpu->inited || pcpu->first_cpu != cpu)
+			continue;
+		old_nrrun = pcpu->nrrun;
+		/*
+		 * The big cluster only needs to account for big tasks, but
+		 * if there are not enough big cores, big tasks must run on
+		 * the little cores as well. Thus the little cluster's
+		 * runqueue stat has to use the overall runqueue average, or
+		 * else derive which big tasks would have to run on little
+		 * cores. The latter is hard to determine, given that core
+		 * control reacts much more slowly than the scheduler and
+		 * cannot predict the scheduler's behavior.
+		 */
+		pcpu->nrrun = pcpu->is_big_cluster ? big_avg : avg;
+		if (pcpu->nrrun != old_nrrun) {
+			if (trigger_update)
+				apply_need(pcpu);
+			else
+				pcpu->nrrun_changed = true;
+		}
+	}
+}
+
+/* Adjust the needed CPU count based on current runqueue information. */
+static unsigned int apply_task_need(struct cpu_data *f, unsigned int new_need)
+{
+	/* Online all cores if there are enough tasks. */
+	if (f->nrrun >= f->task_thres)
+		return f->num_cpus;
+
+	/* Only online more cores if there are tasks to run. */
+	if (f->nrrun > new_need)
+		return new_need + 1;
+
+	return new_need;
+}
+
+static u64 round_to_nw_start(void)
+{
+	unsigned long step = msecs_to_jiffies(rq_avg_period_ms);
+	u64 jif = get_jiffies_64();
+
+	do_div(jif, step);
+	return (jif + 1) * step;
+}
+
+static void rq_avg_timer_func(unsigned long not_used)
+{
+	update_running_avg(true);
+	mod_timer(&rq_avg_timer, round_to_nw_start());
+}
+
+/* ======================= load based core count ====================== */
+
+static unsigned int apply_limits(struct cpu_data *f, unsigned int need_cpus)
+{
+	return min(max(f->min_cpus, need_cpus), f->max_cpus);
+}
+
+static bool eval_need(struct cpu_data *f)
+{
+	unsigned long flags;
+	struct cpu_data *c;
+	unsigned int need_cpus = 0, last_need, thres_idx;
+	int ret = 0;
+	bool need_flag = false;
+	s64 now;
+
+	if (unlikely(!f->inited))
+		return false;
+
+	spin_lock_irqsave(&state_lock, flags);
+	thres_idx = f->online_cpus ? f->online_cpus - 1 : 0;
+	list_for_each_entry(c, &f->lru, sib) {
+		if (c->busy >= f->busy_up_thres[thres_idx])
+			c->is_busy = true;
+		else if (c->busy < f->busy_down_thres[thres_idx])
+			c->is_busy = false;
+		need_cpus += c->is_busy;
+	}
+	need_cpus = apply_task_need(f, need_cpus);
+	need_flag = apply_limits(f, need_cpus) != apply_limits(f, f->need_cpus);
+	last_need = f->need_cpus;
+
+	now = ktime_to_ms(ktime_get());
+
+	if (need_cpus == last_need) {
+		f->need_ts = now;
+		spin_unlock_irqrestore(&state_lock, flags);
+		return false;
+	}
+
+	if (need_cpus > last_need) {
+		ret = 1;
+	} else if (need_cpus < last_need) {
+		s64 elapsed = now - f->need_ts;
+
+		if (elapsed >= f->offline_delay_ms) {
+			ret = 1;
+		} else {
+			mod_timer(&f->timer, jiffies +
+				  msecs_to_jiffies(f->offline_delay_ms));
+		}
+	}
+
+	if (ret) {
+		f->need_ts = now;
+		f->need_cpus = need_cpus;
+	}
+
+	trace_core_ctl_eval_need(f->cpu, last_need, need_cpus,
+				 ret && need_flag);
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return ret && need_flag;
+}
+
+static void apply_need(struct cpu_data *f)
+{
+	if (eval_need(f))
+		wake_up_hotplug_thread(f);
+}
+
+static int core_ctl_set_busy(unsigned int cpu, unsigned int busy)
+{
+	struct cpu_data *c = &per_cpu(cpu_state, cpu);
+	struct cpu_data *f;
+	unsigned int old_is_busy = c->is_busy;
+
+	if (!c->inited)
+		return 0;
+	f = &per_cpu(cpu_state, c->first_cpu);
+
+	update_running_avg(false);
+	if (c->busy == busy && !f->nrrun_changed)
+		return 0;
+	c->busy = busy;
+	f->nrrun_changed = false;
+
+	apply_need(f);
+	trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy);
+	return 0;
+}
+
+/* ========================= core count enforcement ==================== */
+
+/*
+ * If the current thread is a hotplug thread, don't attempt to wake
+ * itself or other hotplug threads, as that would deadlock. Instead,
+ * schedule a timer to fire on the next timer tick and wake up the
+ * thread from there.
+ */
+static void wake_up_hotplug_thread(struct cpu_data *state)
+{
+	unsigned long flags;
+	int cpu;
+	struct cpu_data *pcpu;
+	bool no_wakeup = false;
+
+	for_each_possible_cpu(cpu) {
+		pcpu = &per_cpu(cpu_state, cpu);
+		if (cpu != pcpu->first_cpu)
+			continue;
+		if (pcpu->hotplug_thread == current) {
+			no_wakeup = true;
+			break;
+		}
+	}
+
+	spin_lock_irqsave(&state->pending_lock, flags);
+	state->pending = true;
+	spin_unlock_irqrestore(&state->pending_lock, flags);
+
+	if (no_wakeup) {
+		spin_lock_irqsave(&state_lock, flags);
+		mod_timer(&state->timer, jiffies);
+		spin_unlock_irqrestore(&state_lock, flags);
+	} else {
+		wake_up_process(state->hotplug_thread);
+	}
+}
+
+static void core_ctl_timer_func(unsigned long cpu)
+{
+	struct cpu_data *state = &per_cpu(cpu_state, cpu);
+	unsigned long flags;
+
+	if (eval_need(state)) {
+		spin_lock_irqsave(&state->pending_lock, flags);
+		state->pending = true;
+		spin_unlock_irqrestore(&state->pending_lock, flags);
+		wake_up_process(state->hotplug_thread);
+	}
+}
+
+static int core_ctl_online_core(unsigned int cpu)
+{
+	int ret;
+	struct device *dev;
+
+	lock_device_hotplug();
+	dev = get_cpu_device(cpu);
+	if (!dev) {
+		pr_err("%s: failed to get cpu%d device\n", __func__, cpu);
+		ret = -ENODEV;
+	} else {
+		ret = device_online(dev);
+	}
+	unlock_device_hotplug();
+	return ret;
+}
+
+static int core_ctl_offline_core(unsigned int cpu)
+{
+	int ret;
+	struct device *dev;
+
+	lock_device_hotplug();
+	dev = get_cpu_device(cpu);
+	if (!dev) {
+		pr_err("%s: failed to get cpu%d device\n", __func__, cpu);
+		ret = -ENODEV;
+	} else {
+		ret = device_offline(dev);
+	}
+	unlock_device_hotplug();
+	return ret;
+}
+
+static void __ref do_hotplug(struct cpu_data *f)
+{
+	unsigned int need;
+	struct cpu_data *c, *tmp;
+
+	need = apply_limits(f, f->need_cpus);
+	pr_debug("Trying to adjust group %u to %u\n", f->first_cpu, need);
+
+	if (f->online_cpus > need) {
+		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
+			if (!c->online)
+				continue;
+
+			if (f->online_cpus == need)
+				break;
+
+			/* Don't offline busy CPUs. */
+			if (c->is_busy)
+				continue;
+
+			pr_debug("Trying to Offline CPU%u\n", c->cpu);
+			if (core_ctl_offline_core(c->cpu))
+				pr_debug("Unable to Offline CPU%u\n", c->cpu);
+		}
+
+		/*
+		 * If the number of online CPUs is within the limits, then
+		 * don't force any busy CPUs offline.
+		 */
+		if (f->online_cpus <= f->max_cpus)
+			return;
+
+		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
+			if (!c->online)
+				continue;
+
+			if (f->online_cpus <= f->max_cpus)
+				break;
+
+			pr_debug("Trying to Offline CPU%u\n", c->cpu);
+			if (core_ctl_offline_core(c->cpu))
+				pr_debug("Unable to Offline CPU%u\n", c->cpu);
+		}
+	} else if (f->online_cpus < need) {
+		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
+			if (c->online || c->rejected || c->not_preferred)
+				continue;
+			if (f->online_cpus == need)
+				break;
+
+			pr_debug("Trying to Online CPU%u\n", c->cpu);
+			if (core_ctl_online_core(c->cpu))
+				pr_debug("Unable to Online CPU%u\n", c->cpu);
+		}
+
+		if (f->online_cpus == need)
+			return;
+
+		/* Fall back to onlining not_preferred CPUs if needed. */
+		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
+			if (c->online || c->rejected || !c->not_preferred)
+				continue;
+			if (f->online_cpus == need)
+				break;
+
+			pr_debug("Trying to Online CPU%u\n", c->cpu);
+			if (core_ctl_online_core(c->cpu))
+				pr_debug("Unable to Online CPU%u\n", c->cpu);
+		}
+	}
+}
+
+static int __ref try_hotplug(void *data)
+{
+	struct cpu_data *f = data;
+	unsigned long flags;
+
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_lock_irqsave(&f->pending_lock, flags);
+		if (!f->pending) {
+			spin_unlock_irqrestore(&f->pending_lock, flags);
+			schedule();
+			if (kthread_should_stop())
+				break;
+			spin_lock_irqsave(&f->pending_lock, flags);
+		}
+		set_current_state(TASK_RUNNING);
+		f->pending = false;
+		spin_unlock_irqrestore(&f->pending_lock, flags);
+
+		do_hotplug(f);
+	}
+
+	return 0;
+}
+
+static int __ref cpu_callback(struct notifier_block *nfb,
+			      unsigned long action, void *hcpu)
+{
+	uint32_t cpu = (uintptr_t)hcpu;
+	struct cpu_data *state = &per_cpu(cpu_state, cpu);
+	struct cpu_data *f;
+	int ret = NOTIFY_OK;
+	unsigned long flags;
+
+	/* Don't affect suspend/resume. */
+	if (action & CPU_TASKS_FROZEN)
+		return NOTIFY_OK;
+
+	if (unlikely(!state->inited))
+		return NOTIFY_OK;
+
+	f = &per_cpu(cpu_state, state->first_cpu);
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		/* If the CPU's online state somehow got out of sync, fix it. */
+		if (state->online) {
+			f->online_cpus--;
+			state->online = false;
+			pr_warn("CPU%d offline when state is online\n", cpu);
+		}
+
+		if (state->rejected) {
+			state->rejected = false;
+			f->avail_cpus++;
+		}
+
+		/*
+		 * If a CPU is in the process of coming up, mark it as online
+		 * so that there's no race with the hotplug thread bringing
+		 * up more CPUs than necessary.
+		 */
+		if (apply_limits(f, f->need_cpus) <= f->online_cpus) {
+			pr_debug("Prevent CPU%d onlining\n", cpu);
+			ret = NOTIFY_BAD;
+		} else {
+			state->online = true;
+			f->online_cpus++;
+		}
+		break;
+
+	case CPU_ONLINE:
+		/*
+		 * Moving to the end of the list should only happen on
+		 * CPU_ONLINE and not on CPU_UP_PREPARE, to prevent an
+		 * infinite list traversal when thermal (or other entities)
+		 * reject trying to online CPUs.
+		 */
+		spin_lock_irqsave(&state_lock, flags);
+		list_del(&state->sib);
+		list_add_tail(&state->sib, &f->lru);
+		spin_unlock_irqrestore(&state_lock, flags);
+		break;
+
+	case CPU_DEAD:
+		/* Move a CPU to the end of the LRU when it goes offline. */
+		spin_lock_irqsave(&state_lock, flags);
+		list_del(&state->sib);
+		list_add_tail(&state->sib, &f->lru);
+		spin_unlock_irqrestore(&state_lock, flags);
+
+		/* Fall through */
+
+	case CPU_UP_CANCELED:
+		/* If the CPU's online state somehow got out of sync, fix it. */
+		if (!state->online) {
+			f->online_cpus++;
+			pr_warn("CPU%d online when state is offline\n", cpu);
+		}
+
+		if (!state->rejected && action == CPU_UP_CANCELED) {
+			state->rejected = true;
+			f->avail_cpus--;
+		}
+
+		state->online = false;
+		state->busy = 0;
+		f->online_cpus--;
+		break;
+	}
+
+	if (f->online_cpus < apply_limits(f, f->need_cpus)
+	    && f->online_cpus < f->avail_cpus
+	    && action == CPU_DEAD)
+		wake_up_hotplug_thread(f);
+
+	return ret;
+}
+
+static struct notifier_block __refdata cpu_notifier = {
+	.notifier_call = cpu_callback,
+};
+
+/* ============================ init code ============================== */
+
+static int group_init(struct cpumask *mask)
+{
+	struct device *dev;
+	unsigned int first_cpu = cpumask_first(mask);
+	struct cpu_data *f = &per_cpu(cpu_state, first_cpu);
+	struct cpu_data *state;
+	unsigned int cpu;
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+
+	if (likely(f->inited))
+		return 0;
+
+	dev = get_cpu_device(first_cpu);
+	if (!dev)
+		return -ENODEV;
+
+	pr_info("Creating CPU group %d\n", first_cpu);
+
+	f->num_cpus = cpumask_weight(mask);
+	if (f->num_cpus > MAX_CPUS_PER_GROUP) {
+		pr_err("HW configuration not supported\n");
+		return -EINVAL;
+	}
+	f->min_cpus = 1;
+	f->max_cpus = f->num_cpus;
+	f->need_cpus = f->num_cpus;
+	f->avail_cpus = f->num_cpus;
+	f->offline_delay_ms = 100;
+	f->task_thres = UINT_MAX;
+	f->nrrun = f->num_cpus;
+	INIT_LIST_HEAD(&f->lru);
+	init_timer(&f->timer);
+	spin_lock_init(&f->pending_lock);
+	f->timer.function = core_ctl_timer_func;
+	f->timer.data = first_cpu;
+
+	for_each_cpu(cpu, mask) {
+		pr_info("Init CPU%u state\n", cpu);
+
+		state = &per_cpu(cpu_state, cpu);
+		state->cpu = cpu;
+		state->first_cpu = first_cpu;
+
+		if (cpu_online(cpu)) {
+			f->online_cpus++;
+			state->online = true;
+		}
+
+		list_add_tail(&state->sib, &f->lru);
+	}
+
+	f->hotplug_thread = kthread_run(try_hotplug, (void *) f,
+					"core_ctl/%d", first_cpu);
+	sched_setscheduler_nocheck(f->hotplug_thread, SCHED_FIFO, &param);
+
+	for_each_cpu(cpu, mask) {
+		state = &per_cpu(cpu_state, cpu);
+		state->inited = true;
+	}
+
+	kobject_init(&f->kobj, &ktype_core_ctl);
+	return kobject_add(&f->kobj, &dev->kobj, "core_ctl");
+}
+
+static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val,
+			     void *data)
+{
+	struct cpufreq_policy *policy = data;
+
+	switch (val) {
+	case CPUFREQ_CREATE_POLICY:
+		group_init(policy->related_cpus);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_pol_nb = {
+	.notifier_call = cpufreq_policy_cb,
+};
+
+static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val,
+			  void *data)
+{
+	struct cpufreq_govinfo *info = data;
+
+	switch (val) {
+	case CPUFREQ_LOAD_CHANGE:
+		core_ctl_set_busy(info->cpu, info->load);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_gov_nb = {
+	.notifier_call = cpufreq_gov_cb,
+};
+
+static int __init core_ctl_init(void)
+{
+	struct cpufreq_policy *policy;
+	unsigned int cpu;
+
+	register_cpu_notifier(&cpu_notifier);
+	cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
+	cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
+	init_timer_deferrable(&rq_avg_timer);
+	rq_avg_timer.function = rq_avg_timer_func;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		policy = cpufreq_cpu_get(cpu);
+		if (policy) {
+			group_init(policy->related_cpus);
+			cpufreq_cpu_put(policy);
+		}
+	}
+	put_online_cpus();
+	mod_timer(&rq_avg_timer, round_to_nw_start());
+	return 0;
+}
+
+late_initcall(core_ctl_init); -- cgit v1.2.3
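
The tolerance-based rounding in update_running_avg() above is easy to check in isolation. Below is a minimal userspace sketch, assuming sched_get_nr_running_avg() reports averages scaled by 100 (so 195 means 1.95 tasks); the helper name rounded_nr_running() is illustrative, not part of the patch:

/*
 * Userspace sketch of core_ctl's tolerance-based rounding. The input
 * is assumed to be a running average scaled by 100, as in the patch.
 */
#include <stdio.h>

#define NR_RUNNING_TOLERANCE 5

static int rounded_nr_running(int avg_x100)
{
	/* Round up only when within TOLERANCE/100 of the next integer. */
	return (avg_x100 + NR_RUNNING_TOLERANCE) / 100;
}

int main(void)
{
	/* 1.94 stays at 1; 1.95 is promoted to 2; 2.10 truncates to 2. */
	printf("%d %d %d\n",
	       rounded_nr_running(194),
	       rounded_nr_running(195),
	       rounded_nr_running(210));
	return 0;
}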
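
The clamping and task-based adjustment that eval_need() performs can be sketched the same way. The struct and the values below (min_cpus=1, max_cpus=4, task_thres=UINT_MAX, a rounded running average of 3, one busy CPU) are illustrative; locking, hysteresis, and the busy thresholds themselves are omitted:

/*
 * Userspace sketch of the need computation in eval_need(). The names
 * mirror the patch but the code here is standalone and simplified.
 */
#include <stdio.h>
#include <limits.h>

struct group {
	unsigned int min_cpus, max_cpus, num_cpus;
	unsigned int task_thres;	/* runqueue task threshold */
	int nrrun;			/* rounded running average */
};

static unsigned int apply_limits(const struct group *g, unsigned int need)
{
	unsigned int n = need < g->min_cpus ? g->min_cpus : need;

	return n > g->max_cpus ? g->max_cpus : n;
}

static unsigned int apply_task_need(const struct group *g, unsigned int need)
{
	if ((unsigned int)g->nrrun >= g->task_thres)
		return g->num_cpus;	/* enough tasks: online everything */
	if (g->nrrun > (int)need)
		return need + 1;	/* tasks waiting: online one more */
	return need;
}

int main(void)
{
	struct group g = { .min_cpus = 1, .max_cpus = 4, .num_cpus = 4,
			   .task_thres = UINT_MAX, .nrrun = 3 };
	unsigned int busy_cpus = 1;	/* CPUs above their busy_up_thres */

	/* One busy CPU plus one extra for the waiting tasks: prints 2. */
	printf("need = %u\n", apply_limits(&g, apply_task_need(&g, busy_cpus)));
	return 0;
}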
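
Finally, round_to_nw_start() aligns the runqueue-average sampling timer to the start of the next rq_avg_period_ms window, so that samples always land on window boundaries. A small sketch of the same jiffy arithmetic, with plain 64-bit division standing in for the kernel's do_div() helper:

/*
 * Sketch of the window alignment done by round_to_nw_start(); plain
 * division stands in for the kernel's do_div().
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t round_to_window_start(uint64_t now_jiffies,
				      uint64_t window_jiffies)
{
	/* Next multiple of window_jiffies strictly after now_jiffies. */
	return (now_jiffies / window_jiffies + 1) * window_jiffies;
}

int main(void)
{
	/* With a 20-jiffy window, jiffy 47 aligns to 60. */
	printf("%llu\n",
	       (unsigned long long)round_to_window_start(47, 20));
	return 0;
}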