From 6cc98f03adbf517986a90c43af0cbc9a732b8435 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Fri, 8 Jul 2016 10:59:08 -0700 Subject: timer: Ensure timers are not running before migrating This is needed to support migration of timers during cpu isolation. A timer might be running on the CPU that we want to isolate so we are unable to migrate the timers at this point. We are adding a spin-loop to wait for the timer to finish before migrating the timers. Change-Id: I24d6e91b6dff468c640c2fe3a37a7f31b6f0c79a Signed-off-by: Olav Haugan --- kernel/time/timer.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 51896272fcde..be750f6b2a68 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1635,7 +1635,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *he } } -static void migrate_timers(int cpu) +static void migrate_timers(int cpu, bool wait) { struct tvec_base *old_base; struct tvec_base *new_base; @@ -1651,7 +1651,18 @@ static void migrate_timers(int cpu) spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - BUG_ON(old_base->running_timer); + if (wait) { + /* Ensure timers are done running before continuing */ + while (old_base->running_timer) { + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); + cpu_relax(); + spin_lock_irq(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + } + } else { + BUG_ON(old_base->running_timer); + } for (i = 0; i < TVR_SIZE; i++) migrate_timer_list(new_base, old_base->tv1.vec + i); @@ -1676,7 +1687,7 @@ static int timer_cpu_notify(struct notifier_block *self, switch (action) { case CPU_DEAD: case CPU_DEAD_FROZEN: - migrate_timers((long)hcpu); + migrate_timers((long)hcpu, false); break; default: break; -- cgit v1.2.3 From a7dffd7ffbe6aafe8da10f82a922c00e1c65acdc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 25 Mar 2015 11:47:53 +0530 Subject: timer: create timer_quiesce_cpu() to isolate CPU from timers To isolate CPUs (isolate from timers) from sysfs using cpusets, we need some support from the timer core. i.e. A routine timer_quiesce_cpu() which would migrates away all the unpinned timers, but shouldn't touch the pinned ones. This patch creates this routine. Change-Id: I8624e0659b86b7b8fa425a3fafdb0784fe005124 Signed-off-by: Viresh Kumar [forward port to 3.18] Signed-off-by: Santosh Shukla [ohaugan@codeaurora.org: Port to 4.4. Fixes for compilation error] Git-commit: 313910b70ea0c73f8789d9189c11e1f339080646 Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git Signed-off-by: Olav Haugan --- include/linux/timer.h | 3 +++ kernel/time/timer.c | 56 ++++++++++++++++++++++++++++++++------------------- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 7a5602e19e87..b1617e8932b2 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -182,6 +182,9 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) +/* To be used from cpusets, only */ +extern void timer_quiesce_cpu(void *cpup); + /* * Timer-statistics info: */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index be750f6b2a68..067174a4dde3 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1620,44 +1620,49 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -#ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +#if defined(CONFIG_HOTPLUG_CPU) +static void migrate_timer_list(struct tvec_base *new_base, + struct hlist_head *head, bool remove_pinned) { struct timer_list *timer; int cpu = new_base->cpu; + struct hlist_node *n; + int is_pinned; - while (!hlist_empty(head)) { - timer = hlist_entry(head->first, struct timer_list, entry); - /* We ignore the accounting on the dying cpu */ - detach_timer(timer, false); + hlist_for_each_entry_safe(timer, n, head, entry) { + is_pinned = timer->flags & TIMER_PINNED_ON_CPU; + if (!remove_pinned && is_pinned) + continue; + + detach_if_pending(timer, get_timer_base(timer->flags), false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } -static void migrate_timers(int cpu, bool wait) +static void __migrate_timers(int cpu, bool wait, bool remove_pinned) { struct tvec_base *old_base; struct tvec_base *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(cpu)); old_base = per_cpu_ptr(&tvec_bases, cpu); new_base = get_cpu_ptr(&tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock_irq(&new_base->lock); + spin_lock_irqsave(&new_base->lock, flags); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); if (wait) { /* Ensure timers are done running before continuing */ while (old_base->running_timer) { spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); cpu_relax(); - spin_lock_irq(&new_base->lock); + spin_lock_irqsave(&new_base->lock, flags); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); } } else { @@ -1665,29 +1670,38 @@ static void migrate_timers(int cpu, bool wait) } for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); + migrate_timer_list(new_base, old_base->tv1.vec + i, + remove_pinned); for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); + migrate_timer_list(new_base, old_base->tv2.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv3.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv4.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv5.vec + i, + remove_pinned); } - old_base->active_timers = 0; - old_base->all_timers = 0; - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); put_cpu_ptr(&tvec_bases); } +/* Migrate timers from 'cpu' to this_cpu */ +static void migrate_timers(int cpu) +{ + BUG_ON(cpu_online(cpu)); + __migrate_timers(cpu, false, true); +} + static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { switch (action) { case CPU_DEAD: case CPU_DEAD_FROZEN: - migrate_timers((long)hcpu, false); + migrate_timers((long)hcpu); break; default: break; -- cgit v1.2.3 From d0cea65e27b7541234d61b70017ec6cc3cfe3eec Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 25 Mar 2015 11:57:45 +0530 Subject: hrtimer: update timer->state with 'pinned' information 'Pinned' information would be required in migrate_hrtimers() now, as we can migrate non-pinned timers away without a hotplug (i.e. with cpuset.quiesce). And so we may need to identify pinned timers now, as we can't migrate them. This patch reuses the timer->state variable for setting this flag as there were enough number of free bits available in this variable. And there is no point increasing size of this struct by adding another field. Change-Id: If3b3770e547971809e789ea7c8033c48ec2aa92d Signed-off-by: Viresh Kumar [forward port to 3.18] Signed-off-by: Santosh Shukla [ohaugan@codeaurora.org: Port to 4.4] Git-commit: 62feaf1ed0b64c04868d143d8bdb92d60dc3189b Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git Signed-off-by: Olav Haugan --- include/linux/hrtimer.h | 3 +++ kernel/time/hrtimer.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2ead22dd74a0..07766186ee89 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -53,6 +53,7 @@ enum hrtimer_restart { * * 0x00 inactive * 0x01 enqueued into rbtree + * 0x02 timer is pinned to a cpu * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free @@ -72,6 +73,8 @@ enum hrtimer_restart { */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 +#define HRTIMER_PINNED_SHIFT 1 +#define HRTIMER_STATE_PINNED (1 << HRTIMER_PINNED_SHIFT) /** * struct hrtimer - the basic hrtimer structure diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa909f9fd559..83c298cc0533 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -880,7 +880,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + timer->state |= HRTIMER_STATE_ENQUEUED; return timerqueue_add(&base->active, &timer->node); } @@ -900,11 +900,9 @@ static void __remove_hrtimer(struct hrtimer *timer, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; - timer->state = newstate; - if (!(state & HRTIMER_STATE_ENQUEUED)) - return; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); @@ -921,6 +919,13 @@ static void __remove_hrtimer(struct hrtimer *timer, if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); #endif + +out: + /* + * We need to preserve PINNED state here, otherwise we may end up + * migrating pinned hrtimers as well. + */ + timer->state = newstate | (timer->state & HRTIMER_STATE_PINNED); } /* @@ -1002,6 +1007,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, timer_stats_hrtimer_set_start_info(timer); + /* Update pinned state */ + timer->state &= ~HRTIMER_STATE_PINNED; + timer->state |= !!(mode & HRTIMER_MODE_PINNED) << HRTIMER_PINNED_SHIFT; + leftmost = enqueue_hrtimer(timer, new_base); if (!leftmost) goto unlock; -- cgit v1.2.3 From f461d408acb522ec6db8ae34a49c2dbbd6092ab1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 25 Mar 2015 12:07:31 +0530 Subject: hrtimer: create hrtimer_quiesce_cpu() to isolate CPU from hrtimers To isolate CPUs (isolate from hrtimers) from sysfs using cpusets, we need some support from the hrtimer core. i.e. A routine hrtimer_quiesce_cpu() which would migrate away all the unpinned hrtimers, but shouldn't touch the pinned ones. This patch creates this routine. Change-Id: I51259ea41e3bd5cdba50b718201a6840174a7224 Signed-off-by: Viresh Kumar [forward port to 3.18] Signed-off-by: Santosh Shukla [ohaugan@codeaurora.org: Port to 4.4] Git-commit: d4d50a0ddc35e58ee95137ba4d14e74fea8b682f Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git Signed-off-by: Olav Haugan --- include/linux/hrtimer.h | 3 +++ kernel/time/hrtimer.c | 54 ++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 07766186ee89..952adcacc4cf 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -360,6 +360,9 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); /* Exported timer functions: */ +/* To be used from cpusets, only */ +extern void hrtimer_quiesce_cpu(void *cpup); + /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 83c298cc0533..44ddf403fb01 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1623,13 +1623,17 @@ static void init_hrtimers_cpu(int cpu) hrtimer_init_hres(cpu_base); } -#ifdef CONFIG_HOTPLUG_CPU - +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_CPUSETS) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) + struct hrtimer_clock_base *new_base, + bool remove_pinned) { struct hrtimer *timer; struct timerqueue_node *node; + struct timerqueue_head pinned; + int is_pinned; + + timerqueue_init_head(&pinned); while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); @@ -1642,6 +1646,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + + is_pinned = timer->state & HRTIMER_STATE_PINNED; + if (!remove_pinned && is_pinned) { + timerqueue_add(&pinned, &timer->node); + continue; + } + timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -1653,17 +1664,23 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, */ enqueue_hrtimer(timer, new_base); } + + /* Re-queue pinned timers for non-hotplug usecase */ + while ((node = timerqueue_getnext(&pinned))) { + timer = container_of(node, struct hrtimer, node); + + timerqueue_del(&pinned, &timer->node); + enqueue_hrtimer(timer, old_base); + } } -static void migrate_hrtimers(int scpu) +static void __migrate_hrtimers(int scpu, bool remove_pinned) { struct hrtimer_cpu_base *old_base, *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); - - local_irq_disable(); + local_irq_save(flags); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); /* @@ -1675,7 +1692,7 @@ static void migrate_hrtimers(int scpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + &new_base->clock_base[i], remove_pinned); } raw_spin_unlock(&old_base->lock); @@ -1683,11 +1700,28 @@ static void migrate_hrtimers(int scpu) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); +} +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */ + +#ifdef CONFIG_HOTPLUG_CPU +static void migrate_hrtimers(int scpu) +{ + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + __migrate_hrtimers(scpu, true); } #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_CPUSETS +void hrtimer_quiesce_cpu(void *cpup) +{ + __migrate_hrtimers(*(int *)cpup, false); +} +#endif /* CONFIG_CPUSETS */ + static int hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { -- cgit v1.2.3 From a66156c83c6de8dd2e614d6ad37afe7673ca1473 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 25 Mar 2015 12:57:46 +0530 Subject: hrtimer: make sure PINNED flag is cleared after removing hrtimer Change-Id: Icc4d1c183e993b4b3c9b96ec9779c234e73ecab7 Signed-off-by: Viresh Kumar [forward port to 3.18] Signed-off-by: Santosh Shukla Git-commit: d6c894e515b4cd93c3a08e7c60cce0aa5118c656 Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git Signed-off-by: Olav Haugan --- kernel/time/hrtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 44ddf403fb01..ab304f854743 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -954,6 +954,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest state = HRTIMER_STATE_INACTIVE; __remove_hrtimer(timer, base, state, reprogram); + timer->state &= ~HRTIMER_STATE_PINNED; return 1; } return 0; @@ -1009,7 +1010,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Update pinned state */ timer->state &= ~HRTIMER_STATE_PINNED; - timer->state |= !!(mode & HRTIMER_MODE_PINNED) << HRTIMER_PINNED_SHIFT; + timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT; leftmost = enqueue_hrtimer(timer, new_base); if (!leftmost) -- cgit v1.2.3 From 84e39dcb3bf4b0ea5ed9b1bcd1b1fb43d2a4bc63 Mon Sep 17 00:00:00 2001 From: "Gary S. Robertson" Date: Wed, 10 Sep 2014 14:57:16 -0500 Subject: hrtimer.h: prevent pinned timer state from breaking inactive test An hrtimer may be pinned to a CPU but inactive, so it is no longer valid to test the hrtimer.state struct member as having no bits set when inactive. Changed the test function to mask out the HRTIMER_STATE_PINNED bit when checking for inactive state. Change-Id: I632f37874ef79887ee1202a028ef734f392d6ed0 Signed-off-by: Gary S. Robertson [ohaugan@codeaurora.org: Port to 4.4] Git-commit: 902e4d4eb0d2158d2792166221a72a829caecf07 Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git Signed-off-by: Olav Haugan --- kernel/time/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ab304f854743..f3b89de9ca2a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1186,8 +1186,8 @@ bool hrtimer_active(const struct hrtimer *timer) cpu_base = READ_ONCE(timer->base->cpu_base); seq = raw_read_seqcount_begin(&cpu_base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || - cpu_base->running == timer) + if (((timer->state & ~HRTIMER_STATE_PINNED) != + HRTIMER_STATE_INACTIVE) || cpu_base->running == timer) return true; } while (read_seqcount_retry(&cpu_base->seq, seq) || -- cgit v1.2.3 From bba552f4fc0046325f29156720a04623a71821d5 Mon Sep 17 00:00:00 2001 From: Santosh Shukla Date: Wed, 25 Mar 2015 16:09:32 +0530 Subject: timer: Add function to migrate timers Add function to migrate timer that will be used by later patch set. Change-Id: I370e404001344e635a663822b07557abbe0f6f52 Signed-off-by: Santosh Shukla [ohaugan@codeaurora.org: Updated commit text and fixed trivial merge conflict] Git-commit: 3633b88d8fcb4273807574c27c328b6908a741e5 Git-repo: git://git.linaro.org/people/mike.holmes/santosh.shukla/lng-isol.git Signed-off-by: Olav Haugan --- kernel/time/timer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 067174a4dde3..d38a67a49550 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1695,6 +1695,13 @@ static void migrate_timers(int cpu) __migrate_timers(cpu, false, true); } +#ifdef CONFIG_CPUSETS +void timer_quiesce_cpu(void *cpup) +{ + __migrate_timers(*(int *)cpup, true, false); +} +#endif /* CONFIG_CPUSETS */ + static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { -- cgit v1.2.3 From 922fed628c625b28a50f66267e4b9f99088e2aa4 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Sun, 3 Jul 2016 15:02:08 -0700 Subject: timer: Do not require CPUSETS to be enabled for migration Do not require CPUSETS to be enabled to allow migration of timers and hrtimers. Change-Id: Ib911a0d34c250c4df020bdb265b92d2b8df8db93 Signed-off-by: Olav Haugan --- kernel/time/hrtimer.c | 10 +++------- kernel/time/timer.c | 2 -- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f3b89de9ca2a..1b0117198a08 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1624,7 +1624,7 @@ static void init_hrtimers_cpu(int cpu) hrtimer_init_hres(cpu_base); } -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_CPUSETS) +#if defined(CONFIG_HOTPLUG_CPU) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base, bool remove_pinned) @@ -1703,9 +1703,7 @@ static void __migrate_hrtimers(int scpu, bool remove_pinned) __hrtimer_peek_ahead_timers(); local_irq_restore(flags); } -#endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */ -#ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimers(int scpu) { BUG_ON(cpu_online(scpu)); @@ -1714,14 +1712,12 @@ static void migrate_hrtimers(int scpu) __migrate_hrtimers(scpu, true); } -#endif /* CONFIG_HOTPLUG_CPU */ - -#ifdef CONFIG_CPUSETS void hrtimer_quiesce_cpu(void *cpup) { __migrate_hrtimers(*(int *)cpup, false); } -#endif /* CONFIG_CPUSETS */ + +#endif /* CONFIG_HOTPLUG_CPU */ static int hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d38a67a49550..0efb3916f5a4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1695,12 +1695,10 @@ static void migrate_timers(int cpu) __migrate_timers(cpu, false, true); } -#ifdef CONFIG_CPUSETS void timer_quiesce_cpu(void *cpup) { __migrate_timers(*(int *)cpup, true, false); } -#endif /* CONFIG_CPUSETS */ static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -- cgit v1.2.3 From fc70615291343d031b148bb2676963419a7b672e Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Sun, 29 May 2016 19:56:37 -0700 Subject: cpumask: Add cpu isolation support Add bitmask and corresponding supporting functions for cpu isolation. Change-Id: Ice1a9503666a2b720bdb324289ca55ceb33097cd Signed-off-by: Olav Haugan --- include/linux/cpumask.h | 8 ++++++++ kernel/cpu.c | 17 +++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 59915ea5373c..0eab4811ee92 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -53,6 +53,7 @@ extern int nr_cpu_ids; * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration + * cpu_isolated_mask- has bit 'cpu' set iff cpu isolated * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * @@ -89,25 +90,30 @@ extern const struct cpumask *const cpu_possible_mask; extern const struct cpumask *const cpu_online_mask; extern const struct cpumask *const cpu_present_mask; extern const struct cpumask *const cpu_active_mask; +extern const struct cpumask *const cpu_isolated_mask; #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(cpu_online_mask) #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) +#define num_isolated_cpus() cpumask_weight(cpu_isolated_mask) #define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask) #define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask) #define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask) #define cpu_active(cpu) cpumask_test_cpu((cpu), cpu_active_mask) +#define cpu_isolated(cpu) cpumask_test_cpu((cpu), cpu_isolated_mask) #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U +#define num_isolated_cpus() 0U #define cpu_online(cpu) ((cpu) == 0) #define cpu_possible(cpu) ((cpu) == 0) #define cpu_present(cpu) ((cpu) == 0) #define cpu_active(cpu) ((cpu) == 0) +#define cpu_isolated(cpu) ((cpu) == 0) #endif /* verify cpu argument to cpumask_* operators */ @@ -714,12 +720,14 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) +#define for_each_isolated_cpu(cpu) for_each_cpu((cpu), cpu_isolated_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ void set_cpu_possible(unsigned int cpu, bool possible); void set_cpu_present(unsigned int cpu, bool present); void set_cpu_online(unsigned int cpu, bool online); void set_cpu_active(unsigned int cpu, bool active); +void set_cpu_isolated(unsigned int cpu, bool isolated); void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); diff --git a/kernel/cpu.c b/kernel/cpu.c index 1cfd381642da..3c97f5b88a07 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -768,6 +768,10 @@ static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); EXPORT_SYMBOL(cpu_active_mask); +static DECLARE_BITMAP(cpu_isolated_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_isolated_mask = to_cpumask(cpu_isolated_bits); +EXPORT_SYMBOL(cpu_isolated_mask); + void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) @@ -802,6 +806,14 @@ void set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); } +void set_cpu_isolated(unsigned int cpu, bool isolated) +{ + if (isolated) + cpumask_set_cpu(cpu, to_cpumask(cpu_isolated_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_isolated_bits)); +} + void init_cpu_present(const struct cpumask *src) { cpumask_copy(to_cpumask(cpu_present_bits), src); @@ -817,6 +829,11 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(to_cpumask(cpu_online_bits), src); } +void init_cpu_isolated(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_isolated_bits), src); +} + static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) -- cgit v1.2.3 From dcf716301e77728f08667cf0189d6de1cdc5e5e9 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Thu, 18 Aug 2016 16:47:02 -0700 Subject: soc: qcom: watchdog_v2: Add support for cpu isolation Ensure watchdog does not wake up isolated cpu. Change-Id: Ie4c6cb1496ae3490d81681f1ad51c8103caa0014 Signed-off-by: Olav Haugan --- drivers/soc/qcom/watchdog_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/watchdog_v2.c b/drivers/soc/qcom/watchdog_v2.c index aa20705b9adc..8f58eaa537b1 100644 --- a/drivers/soc/qcom/watchdog_v2.c +++ b/drivers/soc/qcom/watchdog_v2.c @@ -360,7 +360,7 @@ static void ping_other_cpus(struct msm_watchdog_data *wdog_dd) cpumask_clear(&wdog_dd->alive_mask); smp_mb(); for_each_cpu(cpu, cpu_online_mask) { - if (!cpu_idle_pc_state[cpu]) + if (!cpu_idle_pc_state[cpu] && !cpu_isolated(cpu)) smp_call_function_single(cpu, keep_alive_response, wdog_dd, 1); } -- cgit v1.2.3 From 3fe956359cd3f1cd174285b693b424f89123ff96 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Thu, 18 Aug 2016 16:49:44 -0700 Subject: watchdog: Add support for cpu isolation Open up interface to allow external subsystem to enable and disable hard lockup detector. Change-Id: I88a728ee1d54aaa887fab52e5e40d1d4e4fc69ca Signed-off-by: Olav Haugan --- drivers/base/core.c | 5 +++++ include/linux/device.h | 1 + include/linux/sched.h | 8 ++++++++ kernel/watchdog.c | 22 ++++++++++++++++++++-- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index b7d56c5ea3c6..3ac683dff7de 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -72,6 +72,11 @@ int lock_device_hotplug_sysfs(void) return restart_syscall(); } +void lock_device_hotplug_assert(void) +{ + lockdep_assert_held(&device_hotplug_lock); +} + #ifdef CONFIG_BLOCK static inline int device_is_not_partition(struct device *dev) { diff --git a/include/linux/device.h b/include/linux/device.h index 9f27351c6b9c..4b4e2d5ce6e7 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1023,6 +1023,7 @@ static inline bool device_supports_offline(struct device *dev) extern void lock_device_hotplug(void); extern void unlock_device_hotplug(void); extern int lock_device_hotplug_sysfs(void); +extern void lock_device_hotplug_assert(void); extern int device_offline(struct device *dev); extern int device_online(struct device *dev); extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); diff --git a/include/linux/sched.h b/include/linux/sched.h index 4701e0403167..ac6b519494a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -409,6 +409,8 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, extern unsigned int softlockup_panic; extern unsigned int hardlockup_panic; void lockup_detector_init(void); +extern void watchdog_enable(unsigned int cpu); +extern void watchdog_disable(unsigned int cpu); #else static inline void touch_softlockup_watchdog_sched(void) { @@ -425,6 +427,12 @@ static inline void touch_all_softlockup_watchdogs(void) static inline void lockup_detector_init(void) { } +static inline void watchdog_enable(unsigned int cpu) +{ +} +static inline void watchdog_disable(unsigned int cpu) +{ +} #endif #ifdef CONFIG_DETECT_HUNG_TASK diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 029da92fb712..7f21591c8ec5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -95,6 +96,7 @@ static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(unsigned int, watchdog_en); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); @@ -586,9 +588,17 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) sched_setscheduler(current, policy, ¶m); } -static void watchdog_enable(unsigned int cpu) +/* Must be called with hotplug lock (lock_device_hotplug()) held. */ +void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = raw_cpu_ptr(&watchdog_en); + + lock_device_hotplug_assert(); + + if (*enabled) + return; + *enabled = 1; /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -606,9 +616,17 @@ static void watchdog_enable(unsigned int cpu) __touch_watchdog(); } -static void watchdog_disable(unsigned int cpu) +/* Must be called with hotplug lock (lock_device_hotplug()) held. */ +void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = raw_cpu_ptr(&watchdog_en); + + lock_device_hotplug_assert(); + + if (!*enabled) + return; + *enabled = 0; watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); -- cgit v1.2.3 From e33c24bfecde67d7d665bfcf90c7d4c2f231be79 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Tue, 31 May 2016 14:34:46 -0700 Subject: sched: add cpu isolation support This adds cpu isolation APIs to the scheduler to isolate and unisolate CPUs. Isolating and unisolating a CPU can be used in place of hotplug. Isolating and unisolating a CPU is faster than hotplug and can thus be used to optimize the performance and power of multi-core CPUs. Isolating works by migrating non-pinned IRQs and tasks to other CPUS and marking the CPU as not available to the scheduler and load balancer. Pinned tasks and IRQs are still allowed to run but it is expected that this would be minimal. Unisolation works by just marking the CPU available for scheduler and load balancer. Change-Id: I0bbddb56238c2958c5987877c5bfc3e79afa67cc Signed-off-by: Olav Haugan --- include/linux/sched.h | 35 +++++++ kernel/sched/core.c | 280 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched/fair.c | 74 +++++++++---- kernel/sched/hmp.c | 4 +- kernel/sched/rt.c | 13 ++- kernel/sched/sched.h | 5 +- 6 files changed, 368 insertions(+), 43 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ac6b519494a2..a395d8a9ff73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -363,6 +363,41 @@ extern cpumask_var_t cpu_isolated_map; extern int runqueue_is_locked(int cpu); +#ifdef CONFIG_HOTPLUG_CPU +extern int sched_isolate_count(const cpumask_t *mask, bool include_offline); +extern int sched_isolate_cpu(int cpu); +extern int sched_unisolate_cpu(int cpu); +extern int sched_unisolate_cpu_unlocked(int cpu); +#else +static inline int sched_isolate_count(const cpumask_t *mask, + bool include_offline) +{ + cpumask_t count_mask; + + if (include_offline) + cpumask_andnot(&count_mask, mask, cpu_online_mask); + else + return 0; + + return cpumask_weight(&count_mask); +} + +static inline int sched_isolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu_unlocked(int cpu) +{ + return 0; +} +#endif + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7474463b9835..cddb0073c5fb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -1229,6 +1230,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq *rq; unsigned int dest_cpu; int ret = 0; + cpumask_t allowed_mask; rq = task_rq_lock(p, &flags); @@ -1244,16 +1246,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + + dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask); if (dest_cpu >= nr_cpu_ids) { - ret = -EINVAL; - goto out; + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { + ret = -EINVAL; + goto out; + } + cpumask_copy(&allowed_mask, new_mask); } do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) goto out; if (task_running(rq, p) || p->state == TASK_WAKING) { @@ -1577,12 +1585,13 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ -static int select_fallback_rq(int cpu, struct task_struct *p) +static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; int dest_cpu; + int isolated_candidate = -1; /* * If the node that the cpu is on has been offlined, cpu_to_node() @@ -1598,6 +1607,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; } @@ -1610,6 +1621,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) { + if (allow_iso) + isolated_candidate = dest_cpu; + continue; + } + goto out; + } + + if (isolated_candidate != -1) { + dest_cpu = isolated_candidate; goto out; } @@ -1655,6 +1676,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { + bool allow_isolated = (p->flags & PF_KTHREAD); + lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) @@ -1671,8 +1694,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || - !cpu_online(cpu))) - cpu = select_fallback_rq(task_cpu(p), p); + !cpu_online(cpu)) || + (cpu_isolated(cpu) && !allow_isolated)) + cpu = select_fallback_rq(task_cpu(p), p, allow_isolated); return cpu; } @@ -2956,7 +2980,7 @@ void sched_exec(void) if (dest_cpu == smp_processor_id()) goto unlock; - if (likely(cpu_active(dest_cpu))) { + if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) { struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -5414,18 +5438,22 @@ static struct task_struct fake_task = { }; /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Migrate all tasks (not pinned if pinned argument say so) from the rq, + * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq(). * * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; + unsigned int num_pinned_kthreads = 1; /* this thread */ + cpumask_t avail_cpus; + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); /* * Fudge the rq selection such that the below task selection loop @@ -5447,10 +5475,12 @@ static void migrate_tasks(struct rq *dead_rq) for (;;) { /* - * There's this thread running, bail when that's the only - * remaining thread. + * There's this thread running + pinned threads, bail when + * that's the only remaining threads. */ - if (rq->nr_running == 1) + if ((migrate_pinned_tasks && rq->nr_running == 1) || + (!migrate_pinned_tasks && + rq->nr_running == num_pinned_kthreads)) break; /* @@ -5461,6 +5491,13 @@ static void migrate_tasks(struct rq *dead_rq) BUG_ON(!next); next->sched_class->put_prev_task(rq, next); + if (!migrate_pinned_tasks && next->flags & PF_KTHREAD && + !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) { + lockdep_unpin_lock(&rq->lock); + num_pinned_kthreads += 1; + continue; + } + /* * Rules for changing task_struct::cpus_allowed are holding * both pi_lock and rq->lock, such that holding either @@ -5486,7 +5523,7 @@ static void migrate_tasks(struct rq *dead_rq) } /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); + dest_cpu = select_fallback_rq(dead_rq->cpu, next, false); rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { @@ -5502,6 +5539,210 @@ static void migrate_tasks(struct rq *dead_rq) rq->stop = stop; } + +static void set_rq_online(struct rq *rq); +static void set_rq_offline(struct rq *rq); + +int do_isolation_work_cpu_stop(void *data) +{ + unsigned long flags; + unsigned int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + watchdog_disable(cpu); + + irq_migrate_all_off_this_cpu(); + + sched_ttwu_pending(); + /* Update our root-domain */ + raw_spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + + migrate_tasks(rq, false); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + /* + * We might have been in tickless state. Clear NOHZ flags to avoid + * us being kicked for helping out with balancing + */ + nohz_balance_clear_nohz_mask(cpu); + return 0; +} + +int do_unisolation_work_cpu_stop(void *data) +{ + watchdog_enable(smp_processor_id()); + return 0; +} + +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd); + +static void sched_update_group_capacities(int cpu) +{ + struct sched_domain *sd; + + mutex_lock(&sched_domains_mutex); + rcu_read_lock(); + + for_each_domain(cpu, sd) { + int balance_cpu = group_balance_cpu(sd->groups); + + init_sched_groups_capacity(cpu, sd); + /* + * Need to ensure this is also called with balancing + * cpu. + */ + if (cpu != balance_cpu) + init_sched_groups_capacity(balance_cpu, sd); + } + + rcu_read_unlock(); + mutex_unlock(&sched_domains_mutex); +} + +static unsigned int cpu_isolation_vote[NR_CPUS]; + +int sched_isolate_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + + if (include_offline) { + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, cpu_isolated_mask); + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_and(&count_mask, mask, cpu_isolated_mask); + } + + return cpumask_weight(&count_mask); +} + +/* + * 1) CPU is isolated and cpu is offlined: + * Unisolate the core. + * 2) CPU is not isolated and CPU is offlined: + * No action taken. + * 3) CPU is offline and request to isolate + * Request ignored. + * 4) CPU is offline and isolated: + * Not a possible state. + * 5) CPU is online and request to isolate + * Normal case: Isolate the CPU + * 6) CPU is not isolated and comes back online + * Nothing to do + * + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_isolate_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + cpumask_t avail_cpus; + int ret_code = 0; + + lock_device_hotplug(); + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); + + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + ret_code = -EINVAL; + goto out; + } + + if (!cpu_online(cpu)) { + ret_code = -EINVAL; + goto out; + } + + if (++cpu_isolation_vote[cpu] > 1) + goto out; + + set_cpu_isolated(cpu, true); + cpumask_clear_cpu(cpu, &avail_cpus); + + /* Migrate timers */ + smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); + smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + + migrate_sync_cpu(cpu, cpumask_first(&avail_cpus)); + stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + + clear_hmp_request(cpu); + calc_load_migrate(rq); + update_max_interval(); + sched_update_group_capacities(cpu); + +out: + unlock_device_hotplug(); + return ret_code; +} + +/* + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_unisolate_cpu_unlocked(int cpu) +{ + int ret_code = 0; + struct rq *rq = cpu_rq(cpu); + + lock_device_hotplug_assert(); + + if (!cpu_isolation_vote[cpu]) { + ret_code = -EINVAL; + goto out; + } + + if (--cpu_isolation_vote[cpu]) + goto out; + + if (cpu_online(cpu)) { + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + rq->age_stamp = sched_clock_cpu(cpu); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + set_cpu_isolated(cpu, false); + update_max_interval(); + sched_update_group_capacities(cpu); + + if (cpu_online(cpu)) { + stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0); + + /* Kick CPU to immediately do load balancing */ + if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + smp_send_reschedule(cpu); + } + +out: + return ret_code; +} + +int sched_unisolate_cpu(int cpu) +{ + int ret_code; + + lock_device_hotplug(); + ret_code = sched_unisolate_cpu_unlocked(cpu); + unlock_device_hotplug(); + return ret_code; +} + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5748,13 +5989,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); - migrate_sync_cpu(cpu); + migrate_sync_cpu(cpu, smp_processor_id()); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, true); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6509,11 +6750,14 @@ build_sched_groups(struct sched_domain *sd, int cpu) static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; + cpumask_t avail_mask; WARN_ON(!sg); do { - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + cpumask_andnot(&avail_mask, sched_group_cpus(sg), + cpu_isolated_mask); + sg->group_weight = cpumask_weight(&avail_mask); sg = sg->next; } while (sg != sd->groups); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e893b0fcac6b..83da13b5f6b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2941,6 +2941,8 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, struct cpumask search_cpus; cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus); + cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask); + if (env->ignore_prev_cpu) cpumask_clear_cpu(env->prev_cpu, &search_cpus); @@ -3009,7 +3011,8 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) prev_cpu = env->prev_cpu; if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) || - unlikely(!cpu_active(prev_cpu))) + unlikely(!cpu_active(prev_cpu)) || + cpu_isolated(prev_cpu)) return false; if (task->ravg.mark_start - task->last_cpu_selected_ts >= @@ -7354,6 +7357,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); + if (cpumask_test_cpu(cpu, cpu_isolated_mask)) + continue; /* * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the @@ -7381,7 +7386,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + cpumask_t *cpus = sched_group_cpus(group); + + /* Revisit this later. This won't work for MT domain */ + if (!cpu_isolated(cpumask_first(cpus))) + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } @@ -7521,6 +7530,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, power_cost(i, 0), cpu_temp(i)); + if (cpu_isolated(i)) + continue; + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -7548,17 +7560,27 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + sgs->group_no_capacity = 1; + sgs->group_type = group_other; + sgs->group_weight = group->group_weight; + } else { + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / + sgs->group_capacity; - if (sgs->sum_nr_running) - sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->group_weight = group->group_weight; - sgs->group_weight = group->group_weight; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(group, sgs, env); + } - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs, env); + if (sgs->sum_nr_running) + sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; } #ifdef CONFIG_SCHED_HMP @@ -8601,6 +8623,9 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + idle_enter_fair(this_rq); /* @@ -8908,16 +8933,21 @@ static void nohz_balancer_kick(int type) return; } +void nohz_balance_clear_nohz_mask(int cpu) +{ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } +} + static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* * Completely isolated CPUs don't ever set, so we must test. */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + nohz_balance_clear_nohz_mask(cpu); clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -8974,7 +9004,7 @@ void nohz_balance_enter_idle(int cpu) /* * If we're a completely isolated CPU, we don't play. */ - if (on_null_domain(cpu_rq(cpu))) + if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu)) return; cpumask_set_cpu(cpu, nohz.idle_cpus_mask); @@ -9003,7 +9033,13 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + cpumask_t avail_mask; + unsigned int available_cpus; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -9342,8 +9378,10 @@ void trigger_load_balance(struct rq *rq) { int type = NOHZ_KICK_ANY; - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. + */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 5002619961ce..a921498dbf09 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -2828,10 +2828,10 @@ void set_window_start(struct rq *rq) rq->curr->ravg.mark_start = rq->window_start; } -void migrate_sync_cpu(int cpu) +void migrate_sync_cpu(int cpu, int new_cpu) { if (cpu == sync_cpu) - sync_cpu = smp_processor_id(); + sync_cpu = new_cpu; } static void reset_all_task_stats(void) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index cfec881491ef..ba4403e910d8 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -265,8 +265,12 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { - /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + /* + * Try to pull RT tasks here if we lower this rq's prio and cpu is not + * isolated + */ + return rq->rt.highest_prio.curr > prev->prio && + !cpu_isolated(cpu_of(rq)); } static inline int rt_overloaded(struct rq *rq) @@ -1694,6 +1698,8 @@ static int find_lowest_rq_hmp(struct task_struct *task) for_each_sched_cluster(cluster) { cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); + cpumask_andnot(&candidate_mask, &candidate_mask, + cpu_isolated_mask); if (cpumask_empty(&candidate_mask)) continue; @@ -2282,7 +2288,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || + cpu_isolated(cpu_of(rq))) return; queue_pull_task(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec7721112b05..41abb4dabeb7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1069,7 +1069,7 @@ extern void clear_boost_kick(int cpu); extern void clear_hmp_request(int cpu); extern void mark_task_starting(struct task_struct *p); extern void set_window_start(struct rq *rq); -extern void migrate_sync_cpu(int cpu); +extern void migrate_sync_cpu(int cpu, int new_cpu); extern void update_cluster_topology(void); extern void set_task_last_wake(struct task_struct *p, u64 wallclock); extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); @@ -1424,7 +1424,7 @@ static inline void clear_boost_kick(int cpu) { } static inline void clear_hmp_request(int cpu) { } static inline void mark_task_starting(struct task_struct *p) { } static inline void set_window_start(struct rq *rq) { } -static inline void migrate_sync_cpu(int cpu) { } +static inline void migrate_sync_cpu(int cpu, int new_cpu) {} static inline void update_cluster_topology(void) { } static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) { } static inline void set_task_last_switch_out(struct task_struct *p, @@ -1953,6 +1953,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); +extern void nohz_balance_clear_nohz_mask(int cpu); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -- cgit v1.2.3 From 0a17b36a20d65df27ccf2de068ea517b19f6a53f Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Sun, 12 Jun 2016 13:57:05 -0700 Subject: sched/core: Add trace point for cpu isolation Add tracepoint to capture the cpu isolation event including KPI for time it took to isolate. Change-Id: If2d30000f068afc50db953940f4636ef6a089b24 Signed-off-by: Olav Haugan --- include/trace/events/sched.h | 35 +++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 47 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 1ef5ec3eaf70..3990efdd0cc0 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -1244,6 +1244,41 @@ TRACE_EVENT(sched_get_nr_running_avg, __entry->avg, __entry->big_avg, __entry->iowait_avg) ); +/** + * sched_isolate - called when cores are isolated/unisolated + * + * @acutal_mask: mask of cores actually isolated/unisolated + * @req_mask: mask of cores requested isolated/unisolated + * @online_mask: cpu online mask + * @time: amount of time in us it took to isolate/unisolate + * @isolate: 1 if isolating, 0 if unisolating + * + */ +TRACE_EVENT(sched_isolate, + + TP_PROTO(unsigned int requested_cpu, unsigned int isolated_cpus, + u64 start_time, unsigned char isolate), + + TP_ARGS(requested_cpu, isolated_cpus, start_time, isolate), + + TP_STRUCT__entry( + __field(u32, requested_cpu) + __field(u32, isolated_cpus) + __field(u32, time) + __field(unsigned char, isolate) + ), + + TP_fast_assign( + __entry->requested_cpu = requested_cpu; + __entry->isolated_cpus = isolated_cpus; + __entry->time = div64_u64(sched_clock() - start_time, 1000); + __entry->isolate = isolate; + ), + + TP_printk("iso cpu=%u cpus=0x%x time=%u us isolated=%d", + __entry->requested_cpu, __entry->isolated_cpus, + __entry->time, __entry->isolate) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cddb0073c5fb..7b7f1961fd10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5645,6 +5645,10 @@ int sched_isolate_cpu(int cpu) struct rq *rq = cpu_rq(cpu); cpumask_t avail_cpus; int ret_code = 0; + u64 start_time; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); lock_device_hotplug(); @@ -5681,6 +5685,8 @@ int sched_isolate_cpu(int cpu) out: unlock_device_hotplug(); + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 1); return ret_code; } @@ -5694,6 +5700,10 @@ int sched_unisolate_cpu_unlocked(int cpu) { int ret_code = 0; struct rq *rq = cpu_rq(cpu); + u64 start_time; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); lock_device_hotplug_assert(); @@ -5730,6 +5740,8 @@ int sched_unisolate_cpu_unlocked(int cpu) } out: + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 0); return ret_code; } -- cgit v1.2.3 From de1a32cd143b4db85bf18fab02782a4e8af79b17 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Thu, 2 Jun 2016 18:06:10 -0700 Subject: drivers/base: cpu: Add node for cpu isolation Add a new per-cpu node to control isolation of CPU. Usage: echo 1 > isolate - Isolate the core echo 0 > isolate - Unisolate the core Change-Id: I6a13e8dda99130ca794e5b6f51600f4c57a3e921 Signed-off-by: Olav Haugan --- drivers/base/cpu.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index dee022638fe6..c8bfb6077224 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -180,6 +180,58 @@ static struct attribute_group crash_note_cpu_attr_group = { }; #endif +#ifdef CONFIG_HOTPLUG_CPU + +static ssize_t show_cpu_isolated(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + ssize_t rc; + int cpuid = cpu->dev.id; + unsigned int isolated = cpu_isolated(cpuid); + + rc = snprintf(buf, PAGE_SIZE-2, "%d\n", isolated); + + return rc; +} + +static ssize_t __ref store_cpu_isolated(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + int err; + int cpuid = cpu->dev.id; + unsigned int isolated; + + err = kstrtouint(strstrip((char *)buf), 0, &isolated); + if (err) + return err; + + if (isolated > 1) + return -EINVAL; + + if (isolated) + sched_isolate_cpu(cpuid); + else + sched_unisolate_cpu(cpuid); + + return count; +} + +static DEVICE_ATTR(isolate, 0644, show_cpu_isolated, store_cpu_isolated); + +static struct attribute *cpu_isolated_attrs[] = { + &dev_attr_isolate.attr, + NULL +}; + +static struct attribute_group cpu_isolated_attr_group = { + .attrs = cpu_isolated_attrs, +}; + +#endif + #ifdef CONFIG_SCHED_HMP static ssize_t show_sched_static_cpu_pwr_cost(struct device *dev, @@ -279,6 +331,9 @@ static const struct attribute_group *common_cpu_attr_groups[] = { #endif #ifdef CONFIG_SCHED_HMP &sched_hmp_cpu_attr_group, +#endif +#ifdef CONFIG_HOTPLUG_CPU + &cpu_isolated_attr_group, #endif NULL }; @@ -289,6 +344,9 @@ static const struct attribute_group *hotplugable_cpu_attr_groups[] = { #endif #ifdef CONFIG_SCHED_HMP &sched_hmp_cpu_attr_group, +#endif +#ifdef CONFIG_HOTPLUG_CPU + &cpu_isolated_attr_group, #endif NULL }; -- cgit v1.2.3 From 4400ef145f2b7aac21676b25942f32fb32ed724e Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Sun, 29 May 2016 19:35:54 -0700 Subject: irq: Make irq affinity function cpu isolation aware Prohibit setting the affinity of an IRQ to an isolated core. Change-Id: I7b50778615541a64f9956573757c7f28748c4f69 Signed-off-by: Olav Haugan --- arch/arm/kernel/irq.c | 11 ++++++++++- kernel/irq/cpuhotplug.c | 11 ++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index 1d45320ee125..f56a831de043 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -127,6 +128,7 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity = irq_data_get_affinity_mask(d); struct irq_chip *c; bool ret = false; + struct cpumask available_cpus; /* * If this is a per-CPU interrupt, or the affinity does not @@ -135,8 +137,15 @@ static bool migrate_one_irq(struct irq_desc *desc) if (irqd_is_per_cpu(d) || !cpumask_test_cpu(smp_processor_id(), affinity)) return false; + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - affinity = cpu_online_mask; + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_empty(affinity)) + affinity = cpu_online_mask; ret = true; } diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 011f8c4c63da..104432f3d311 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internals.h" @@ -20,6 +21,7 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity = d->common->affinity; struct irq_chip *c; bool ret = false; + struct cpumask available_cpus; /* * If this is a per-CPU interrupt, or the affinity does not @@ -29,8 +31,15 @@ static bool migrate_one_irq(struct irq_desc *desc) !cpumask_test_cpu(smp_processor_id(), affinity)) return false; + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - affinity = cpu_online_mask; + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_empty(affinity)) + affinity = cpu_online_mask; ret = true; } -- cgit v1.2.3 From 287b1a8c1cb83f2c4bed561a2dc48b673e4eb1c9 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Sun, 29 May 2016 19:37:20 -0700 Subject: vmstat: Add cpu isolation awareness Ensure vmstat updates do not run on isolated cpus. Change-Id: I401de0b52fa6d20573187265ee56edd543b1419e Signed-off-by: Olav Haugan --- mm/vmstat.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index ca75eeecbad1..77b8eabd5446 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1390,7 +1390,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats(true)) { + if (refresh_cpu_vm_stats(true) && !cpu_isolated(smp_processor_id())) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1402,7 +1402,8 @@ static void vmstat_update(struct work_struct *w) } else { /* * We did not update any counters so the app may be in - * a mode where it does not cause counter updates. + * a mode where it does not cause counter updates or the cpu + * was isolated. * We may be uselessly running vmstat_update. * Defer the checking for differentials to the * shepherd thread on a different processor. @@ -1469,7 +1470,7 @@ static void vmstat_shepherd(struct work_struct *w) get_online_cpus(); /* Check processors whose vmstat worker threads have been disabled */ for_each_cpu(cpu, cpu_stat_off) - if (need_update(cpu) && + if (!cpu_isolated(cpu) && need_update(cpu) && cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) queue_delayed_work_on(cpu, vmstat_wq, -- cgit v1.2.3 From bc24c063ef8b8c16432fd328403972ae3da12526 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Sun, 29 May 2016 19:50:47 -0700 Subject: pmqos: Enable cpu isolation awareness Set long latency requirement for isolated cores to ensure LPM logic will select a deep sleep state. Change-Id: I83e9fbb800df259616a145d311b50627dc42a5ff Signed-off-by: Olav Haugan --- drivers/cpuidle/cpuidle.c | 9 ++++++--- kernel/power/qos.c | 7 +++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index b622b9541279..a045b9a940e8 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -625,12 +625,15 @@ static void smp_callback(void *v) static int cpuidle_latency_notify(struct notifier_block *b, unsigned long l, void *v) { - const struct cpumask *cpus; + struct cpumask cpus; - cpus = v ?: cpu_online_mask; + if (v) + cpumask_andnot(&cpus, v, cpu_isolated_mask); + else + cpumask_andnot(&cpus, cpu_online_mask, cpu_isolated_mask); preempt_disable(); - smp_call_function_many(cpus, smp_callback, NULL, 1); + smp_call_function_many(&cpus, smp_callback, NULL, 1); preempt_enable(); return NOTIFY_OK; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 8ecc7b3f7dd9..69c32c42080f 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -447,6 +448,9 @@ EXPORT_SYMBOL_GPL(pm_qos_request); int pm_qos_request_for_cpu(int pm_qos_class, int cpu) { + if (cpu_isolated(cpu)) + return INT_MAX; + return pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu]; } EXPORT_SYMBOL(pm_qos_request_for_cpu); @@ -469,6 +473,9 @@ int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask) val = c->default_value; for_each_cpu(cpu, mask) { + if (cpu_isolated(cpu)) + continue; + switch (c->type) { case PM_QOS_MIN: if (c->target_per_cpu[cpu] < val) -- cgit v1.2.3 From e38c1ce12351b6e8bfa8a4237d940449afcc7500 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Sun, 29 May 2016 19:53:00 -0700 Subject: smp: Do not wake up all idle CPUs Do not wake up cpus that are isolated. Change-Id: I07702bb5b738c1c75c49a2ca4cb08be0231ccb12 Signed-off-by: Olav Haugan --- kernel/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index abdc48cd79a3..b2ec21c5c9d6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -766,8 +766,8 @@ void wake_up_all_idle_cpus(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; - - wake_up_if_idle(cpu); + if (!cpu_isolated(cpu)) + wake_up_if_idle(cpu); } preempt_enable(); } -- cgit v1.2.3 From 639c8ad52d59e7b9447b15c0e9a2ceaa533ad854 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Sun, 29 May 2016 19:54:27 -0700 Subject: perf: Add cpu isolation awareness Ensure perf events does not wake up idle cores when core is isolated. Change-Id: Ifefb2f1cf6c24af7bc46fc62797955b8c8ad5815 Signed-off-by: Olav Haugan --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index d6ec580584b6..5beb88f11671 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3386,7 +3386,8 @@ static int perf_event_read(struct perf_event *event, bool group) * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { + if (event->state == PERF_EVENT_STATE_ACTIVE && + !cpu_isolated(event->oncpu)) { struct perf_read_data data = { .event = event, .group = group, -- cgit v1.2.3 From ca2c4c65188206b0251a8d7576374847b45110e4 Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Thu, 2 Jun 2016 16:29:27 -0700 Subject: sched/tick: Ensure timers does not get queued on isolated cpus Timers should not be queued on isolated cpus. Instead try to find other cpus to queue timer on. Change-Id: I5d849dfd29aa5bb594454473768d7db1da258028 Signed-off-by: Olav Haugan --- include/linux/tick.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index 5bf3ddade19c..1732697ea419 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -161,7 +161,15 @@ extern void __tick_nohz_task_switch(void); #else static inline int housekeeping_any_cpu(void) { - return smp_processor_id(); + cpumask_t available; + int cpu; + + cpumask_andnot(&available, cpu_online_mask, cpu_isolated_mask); + cpu = cpumask_any(&available); + if (cpu >= nr_cpu_ids) + cpu = smp_processor_id(); + + return cpu; } static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } @@ -187,7 +195,7 @@ static inline bool is_housekeeping_cpu(int cpu) if (tick_nohz_full_enabled()) return cpumask_test_cpu(cpu, housekeeping_mask); #endif - return true; + return !cpu_isolated(cpu); } static inline void housekeeping_affine(struct task_struct *t) -- cgit v1.2.3 From 75c9d1833a9ee5ef704bda120d942e5b75cd204a Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Thu, 18 Aug 2016 17:19:13 -0700 Subject: defconfig: msm: Disable core control helper Core control helper is not needed anymore with subsequent patches that moves core control into the kernel. Change-Id: I2c62af441fb9e5ba9f29719853a63e4c8f2d031b Signed-off-by: Olav Haugan --- arch/arm/configs/msmcortex_defconfig | 1 - arch/arm/configs/msmfalcon_defconfig | 1 - arch/arm64/configs/msm-perf_defconfig | 1 - arch/arm64/configs/msm_defconfig | 1 - arch/arm64/configs/msmcortex-perf_defconfig | 1 - arch/arm64/configs/msmcortex_defconfig | 1 - arch/arm64/configs/msmfalcon-perf_defconfig | 1 - arch/arm64/configs/msmfalcon_defconfig | 1 - 8 files changed, 8 deletions(-) diff --git a/arch/arm/configs/msmcortex_defconfig b/arch/arm/configs/msmcortex_defconfig index 0a20c52bd3b2..2b35b0f12787 100644 --- a/arch/arm/configs/msmcortex_defconfig +++ b/arch/arm/configs/msmcortex_defconfig @@ -459,7 +459,6 @@ CONFIG_TRACER_PKT=y CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MEM_SHARE_QMI_SERVICE=y diff --git a/arch/arm/configs/msmfalcon_defconfig b/arch/arm/configs/msmfalcon_defconfig index 0a20c52bd3b2..2b35b0f12787 100644 --- a/arch/arm/configs/msmfalcon_defconfig +++ b/arch/arm/configs/msmfalcon_defconfig @@ -459,7 +459,6 @@ CONFIG_TRACER_PKT=y CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MEM_SHARE_QMI_SERVICE=y diff --git a/arch/arm64/configs/msm-perf_defconfig b/arch/arm64/configs/msm-perf_defconfig index 84bb603c3142..7f31331933bb 100644 --- a/arch/arm64/configs/msm-perf_defconfig +++ b/arch/arm64/configs/msm-perf_defconfig @@ -525,7 +525,6 @@ CONFIG_MSM_PIL_MSS_QDSP6V5=y CONFIG_TRACER_PKT=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y CONFIG_MSM_RPM_LOG=y CONFIG_MSM_RPM_STATS_LOG=y diff --git a/arch/arm64/configs/msm_defconfig b/arch/arm64/configs/msm_defconfig index 6119ff12d46d..89bee1463421 100644 --- a/arch/arm64/configs/msm_defconfig +++ b/arch/arm64/configs/msm_defconfig @@ -527,7 +527,6 @@ CONFIG_TRACER_PKT=y CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y diff --git a/arch/arm64/configs/msmcortex-perf_defconfig b/arch/arm64/configs/msmcortex-perf_defconfig index 08288e1b5c25..7667669e8aa4 100644 --- a/arch/arm64/configs/msmcortex-perf_defconfig +++ b/arch/arm64/configs/msmcortex-perf_defconfig @@ -525,7 +525,6 @@ CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y diff --git a/arch/arm64/configs/msmcortex_defconfig b/arch/arm64/configs/msmcortex_defconfig index 9e2727c4fe1e..c267345858cf 100644 --- a/arch/arm64/configs/msmcortex_defconfig +++ b/arch/arm64/configs/msmcortex_defconfig @@ -544,7 +544,6 @@ CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y diff --git a/arch/arm64/configs/msmfalcon-perf_defconfig b/arch/arm64/configs/msmfalcon-perf_defconfig index 39c2d3f71c5a..6ebd60b43c71 100644 --- a/arch/arm64/configs/msmfalcon-perf_defconfig +++ b/arch/arm64/configs/msmfalcon-perf_defconfig @@ -522,7 +522,6 @@ CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y diff --git a/arch/arm64/configs/msmfalcon_defconfig b/arch/arm64/configs/msmfalcon_defconfig index a277038b3fc3..01324d89e79e 100644 --- a/arch/arm64/configs/msmfalcon_defconfig +++ b/arch/arm64/configs/msmfalcon_defconfig @@ -541,7 +541,6 @@ CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y -- cgit v1.2.3 From 7f2c52364339cdcd3f56375b8a883005cb88680e Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Thu, 18 Aug 2016 17:21:30 -0700 Subject: core_ctl_helper: Remove code since it is not used anymore Remove the core control helper code since this is not needed anymore with subsequent patches that moves core control into the kernel. Change-Id: I62acddeb707fc7d5626580166b3466e63f45fd89 Signed-off-by: Olav Haugan --- drivers/soc/qcom/Kconfig | 8 --- drivers/soc/qcom/Makefile | 1 - drivers/soc/qcom/core_ctl_helper.c | 99 -------------------------------------- include/soc/qcom/core_ctl.h | 35 -------------- kernel/trace/power-traces.c | 2 - 5 files changed, 145 deletions(-) delete mode 100644 drivers/soc/qcom/core_ctl_helper.c delete mode 100644 include/soc/qcom/core_ctl.h diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig index 9c27344165be..c32eafbd38fd 100644 --- a/drivers/soc/qcom/Kconfig +++ b/drivers/soc/qcom/Kconfig @@ -716,14 +716,6 @@ config MSM_KERNEL_PROTECT_TEST read-only. This test is FATAL whether it passes or fails! Success is signaled by a stage-2 fault. -config MSM_CORE_CTL_HELPER - tristate "Core control helper functions for dynamically hotplug CPUs" - help - Provide helper functions for core control driver. Core control - driver dynamically hotplugs CPUs from kernel based on current - system load and state. It also supports limiting min and - max online CPUs from userspace. - config QCOM_REMOTEQDSS bool "Allow debug tools to enable events on other processors" depends on QCOM_SCM && DEBUG_FS diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile index d9134a558be6..434a114c000f 100644 --- a/drivers/soc/qcom/Makefile +++ b/drivers/soc/qcom/Makefile @@ -38,7 +38,6 @@ obj-$(CONFIG_MEM_SHARE_QMI_SERVICE) += memshare/ obj-$(CONFIG_MSM_PIL_SSR_GENERIC) += subsys-pil-tz.o obj-$(CONFIG_MSM_PIL_MSS_QDSP6V5) += pil-q6v5.o pil-msa.o pil-q6v5-mss.o obj-$(CONFIG_MSM_PIL) += peripheral-loader.o -obj-$(CONFIG_MSM_CORE_CTL_HELPER) += core_ctl_helper.o obj-$(CONFIG_MSM_PFE_WA) += pfe-wa.o obj-$(CONFIG_ARCH_MSM8996) += msm_cpu_voltage.o diff --git a/drivers/soc/qcom/core_ctl_helper.c b/drivers/soc/qcom/core_ctl_helper.c deleted file mode 100644 index 88201412128e..000000000000 --- a/drivers/soc/qcom/core_ctl_helper.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2014-2016, The Linux Foundation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -void core_ctl_trace(int type, int cpu, int arg1, int arg2, int arg3) -{ - switch (type) { - case CORE_CTL_EVAL_NEED: - trace_core_ctl_eval_need(cpu, arg1, arg2, arg3); - break; - - case CORE_CTL_SET_BUSY: - trace_core_ctl_set_busy(cpu, arg1, arg2, arg3); - break; - }; -} -EXPORT_SYMBOL(core_ctl_trace); - -void core_ctl_block_hotplug(void) -{ - get_online_cpus(); -} -EXPORT_SYMBOL(core_ctl_block_hotplug); - -void core_ctl_unblock_hotplug(void) -{ - put_online_cpus(); -} -EXPORT_SYMBOL(core_ctl_unblock_hotplug); - -s64 core_ctl_get_time(void) -{ - return ktime_to_ms(ktime_get()); -} -EXPORT_SYMBOL(core_ctl_get_time); - -struct cpufreq_policy *core_ctl_get_policy(int cpu) -{ - return cpufreq_cpu_get(cpu); -} -EXPORT_SYMBOL(core_ctl_get_policy); - -void core_ctl_put_policy(struct cpufreq_policy *policy) -{ - cpufreq_cpu_put(policy); -} -EXPORT_SYMBOL(core_ctl_put_policy); - -struct device *core_ctl_find_cpu_device(unsigned cpu) -{ - return get_cpu_device(cpu); -} -EXPORT_SYMBOL(core_ctl_find_cpu_device); - -int __ref core_ctl_online_core(unsigned int cpu) -{ - int ret = -EINVAL; - struct device *dev = get_cpu_device(cpu); - - if (dev) { - lock_device_hotplug(); - ret = device_online(dev); - unlock_device_hotplug(); - } - return ret; -} -EXPORT_SYMBOL(core_ctl_online_core); - -int __ref core_ctl_offline_core(unsigned int cpu) -{ - int ret = -EINVAL; - struct device *dev = get_cpu_device(cpu); - - if (dev) { - lock_device_hotplug(); - ret = device_offline(dev); - unlock_device_hotplug(); - } - return ret; -} -EXPORT_SYMBOL(core_ctl_offline_core); diff --git a/include/soc/qcom/core_ctl.h b/include/soc/qcom/core_ctl.h deleted file mode 100644 index 08b43058b37c..000000000000 --- a/include/soc/qcom/core_ctl.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2014-2015, The Linux Foundation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __SOC_QCOM_CORE_CTL_H -#define __SOC_QCOM_CORE_CTL_H - -enum { - CORE_CTL_EVAL_NEED, - CORE_CTL_SET_BUSY, - CORE_CTL_N_TRACE_EVENTS, -}; - -extern void core_ctl_block_hotplug(void); -extern void core_ctl_unblock_hotplug(void); -extern s64 core_ctl_get_time(void); -extern struct cpufreq_policy *core_ctl_get_policy(int cpu); -extern void core_ctl_put_policy(struct cpufreq_policy *policy); -extern struct device *core_ctl_find_cpu_device(unsigned cpu); -extern int core_ctl_online_core(unsigned int cpu); -extern int core_ctl_offline_core(unsigned int cpu); - -#define USE_CORE_CTL_TRACE -extern void core_ctl_trace(int type, int cpu, int arg1, int arg2, int arg3); - -#endif diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9270e1ac6460..49fa2e6eea98 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,5 +15,3 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); -EXPORT_TRACEPOINT_SYMBOL(core_ctl_set_busy); -EXPORT_TRACEPOINT_SYMBOL(core_ctl_eval_need); -- cgit v1.2.3 From 59f16ae0345c902c1d09da75e0f89d7e7ddbc54f Mon Sep 17 00:00:00 2001 From: Olav Haugan Date: Thu, 18 Aug 2016 17:22:44 -0700 Subject: core_ctrl: Move core control into kernel Move core control from out-of-tree module into the kernel proper. Core control monitors load on CPUs and controls how many CPUs are available for the system to use at any point in time. This can help save power. Core control can be configured through sysfs interface. Change-Id: Ia78e701468ea3828195c2a15c9cf9fafd099804a Signed-off-by: Olav Haugan --- init/Kconfig | 10 + kernel/sched/Makefile | 1 + kernel/sched/core_ctl.c | 1014 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1025 insertions(+) create mode 100644 kernel/sched/core_ctl.c diff --git a/init/Kconfig b/init/Kconfig index 9ad1ae9d9da8..6020a351c57b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1170,6 +1170,16 @@ config SCHED_HMP_CSTATE_AWARE with CPUs C-state. If this is enabled, scheduler places tasks onto the shallowest C-state CPU among the most power efficient CPUs. +config SCHED_CORE_CTL + bool "QTI Core Control" + depends on SMP + help + This options enables the core control functionality in + the scheduler. Core control automatically offline and + online cores based on cpu load and utilization. + + If unsure, say N here. + config CHECKPOINT_RESTORE bool "Checkpoint/restore support" if EXPERT select PROC_CHILDREN diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 1f159743ebfc..508b65690288 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,3 +20,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 000000000000..8f071757d516 --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1014 @@ +/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MAX_CPUS_PER_GROUP 4 + +struct cpu_data { + /* Per CPU data. */ + bool inited; + bool online; + bool rejected; + bool is_busy; + bool not_preferred; + unsigned int busy; + unsigned int cpu; + struct list_head sib; + unsigned int first_cpu; + + /* Per cluster data set only on first CPU */ + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_GROUP]; + unsigned int busy_down_thres[MAX_CPUS_PER_GROUP]; + unsigned int online_cpus; + unsigned int avail_cpus; + unsigned int num_cpus; + unsigned int need_cpus; + unsigned int task_thres; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool is_big_cluster; + int nrrun; + bool nrrun_changed; + struct timer_list timer; + struct task_struct *hotplug_thread; + struct kobject kobj; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cpu_data *f); +static void wake_up_hotplug_thread(struct cpu_data *state); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + wake_up_hotplug_thread(state); + + return count; +} + +static ssize_t show_min_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_hotplug_thread(state); + + return count; +} + +static ssize_t show_max_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static ssize_t show_task_thres(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(struct cpu_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(struct cpu_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_is_big_cluster(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->is_big_cluster = val ? 1 : 0; + return count; +} + +static ssize_t show_is_big_cluster(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster); +} + +static ssize_t show_cpus(struct cpu_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry(c, &state->lru, sib) { + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u (%s)\n", c->cpu, + c->online ? "Online" : "Offline"); + } + spin_unlock_irqrestore(&state_lock, flags); + return count; +} + +static ssize_t show_need_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_online_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->online_cpus); +} + +static ssize_t show_global_state(struct cpu_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + c = &per_cpu(cpu_state, cpu); + if (!c->inited) + continue; + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU: %u\n", c->cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", c->online); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tRejected: %u\n", c->rejected); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", c->first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + if (c->cpu != c->first_cpu) + continue; + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", c->nrrun); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tAvail CPUs: %u\n", c->avail_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", c->need_cpus); + } + + return count; +} + +static ssize_t store_not_preferred(struct cpu_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i, first_cpu; + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + first_cpu = state->first_cpu; + + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, first_cpu); + c->not_preferred = val[i]; + first_cpu++; + } + + return count; +} + +static ssize_t show_not_preferred(struct cpu_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned int i, first_cpu; + + first_cpu = state->first_cpu; + + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU:%d %u\n", first_cpu, c->not_preferred); + first_cpu++; + } + + return count; +} + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(struct cpu_data *, char *); + ssize_t (*store)(struct cpu_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_rw(offline_delay_ms); +core_ctl_attr_rw(busy_up_thres); +core_ctl_attr_rw(busy_down_thres); +core_ctl_attr_rw(task_thres); +core_ctl_attr_rw(is_big_cluster); +core_ctl_attr_ro(cpus); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(online_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(not_preferred); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &offline_delay_ms.attr, + &busy_up_thres.attr, + &busy_down_thres.attr, + &task_thres.attr, + &is_big_cluster.attr, + &cpus.attr, + &need_cpus.attr, + &online_cpus.attr, + &global_state.attr, + ¬_preferred.attr, + NULL +}; + +#define to_cpu_data(k) container_of(k, struct cpu_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cpu_data *data = to_cpu_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cpu_data *data = to_cpu_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +#define RQ_AVG_TOLERANCE 2 +#define RQ_AVG_DEFAULT_MS 20 +#define NR_RUNNING_TOLERANCE 5 +static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS; + +static s64 rq_avg_timestamp_ms; +static struct timer_list rq_avg_timer; + +static void update_running_avg(bool trigger_update) +{ + int cpu; + struct cpu_data *pcpu; + int avg, iowait_avg, big_avg, old_nrrun; + s64 now; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + + now = ktime_to_ms(ktime_get()); + if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) { + spin_unlock_irqrestore(&state_lock, flags); + return; + } + rq_avg_timestamp_ms = now; + sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg); + + spin_unlock_irqrestore(&state_lock, flags); + + /* + * Round up to the next integer if the average nr running tasks + * is within NR_RUNNING_TOLERANCE/100 of the next integer. + * If normal rounding up is used, it will allow a transient task + * to trigger online event. By the time core is onlined, the task + * has finished. + * Rounding to closest suffers same problem because scheduler + * might only provide running stats per jiffy, and a transient + * task could skew the number for one jiffy. If core control + * samples every 2 jiffies, it will observe 0.5 additional running + * average which rounds up to 1 task. + */ + avg = (avg + NR_RUNNING_TOLERANCE) / 100; + big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100; + + for_each_possible_cpu(cpu) { + pcpu = &per_cpu(cpu_state, cpu); + if (!pcpu->inited || pcpu->first_cpu != cpu) + continue; + old_nrrun = pcpu->nrrun; + /* + * Big cluster only need to take care of big tasks, but if + * there are not enough big cores, big tasks need to be run + * on little as well. Thus for little's runqueue stat, it + * has to use overall runqueue average, or derive what big + * tasks would have to be run on little. The latter approach + * is not easy to get given core control reacts much slower + * than scheduler, and can't predict scheduler's behavior. + */ + pcpu->nrrun = pcpu->is_big_cluster ? big_avg : avg; + if (pcpu->nrrun != old_nrrun) { + if (trigger_update) + apply_need(pcpu); + else + pcpu->nrrun_changed = true; + } + } +} + +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(struct cpu_data *f, unsigned int new_need) +{ + /* Online all cores if there are enough tasks */ + if (f->nrrun >= f->task_thres) + return f->num_cpus; + + /* only online more cores if there are tasks to run */ + if (f->nrrun > new_need) + return new_need + 1; + + return new_need; +} + +static u64 round_to_nw_start(void) +{ + unsigned long step = msecs_to_jiffies(rq_avg_period_ms); + u64 jif = get_jiffies_64(); + + do_div(jif, step); + return (jif + 1) * step; +} + +static void rq_avg_timer_func(unsigned long not_used) +{ + update_running_avg(true); + mod_timer(&rq_avg_timer, round_to_nw_start()); +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(struct cpu_data *f, unsigned int need_cpus) +{ + return min(max(f->min_cpus, need_cpus), f->max_cpus); +} + +static bool eval_need(struct cpu_data *f) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + s64 now; + + if (unlikely(!f->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + thres_idx = f->online_cpus ? f->online_cpus - 1 : 0; + list_for_each_entry(c, &f->lru, sib) { + if (c->busy >= f->busy_up_thres[thres_idx]) + c->is_busy = true; + else if (c->busy < f->busy_down_thres[thres_idx]) + c->is_busy = false; + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(f, need_cpus); + need_flag = apply_limits(f, need_cpus) != apply_limits(f, f->need_cpus); + last_need = f->need_cpus; + + now = ktime_to_ms(ktime_get()); + + if (need_cpus == last_need) { + f->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + if (need_cpus > last_need) { + ret = 1; + } else if (need_cpus < last_need) { + s64 elapsed = now - f->need_ts; + + if (elapsed >= f->offline_delay_ms) { + ret = 1; + } else { + mod_timer(&f->timer, jiffies + + msecs_to_jiffies(f->offline_delay_ms)); + } + } + + if (ret) { + f->need_ts = now; + f->need_cpus = need_cpus; + } + + trace_core_ctl_eval_need(f->cpu, last_need, need_cpus, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cpu_data *f) +{ + if (eval_need(f)) + wake_up_hotplug_thread(f); +} + +static int core_ctl_set_busy(unsigned int cpu, unsigned int busy) +{ + struct cpu_data *c = &per_cpu(cpu_state, cpu); + struct cpu_data *f; + unsigned int old_is_busy = c->is_busy; + + if (!c->inited) + return 0; + f = &per_cpu(cpu_state, c->first_cpu); + + update_running_avg(false); + if (c->busy == busy && !f->nrrun_changed) + return 0; + c->busy = busy; + f->nrrun_changed = false; + + apply_need(f); + trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy); + return 0; +} + +/* ========================= core count enforcement ==================== */ + +/* + * If current thread is hotplug thread, don't attempt to wake up + * itself or other hotplug threads because it will deadlock. Instead, + * schedule a timer to fire in next timer tick and wake up the thread. + */ +static void wake_up_hotplug_thread(struct cpu_data *state) +{ + unsigned long flags; + int cpu; + struct cpu_data *pcpu; + bool no_wakeup = false; + + for_each_possible_cpu(cpu) { + pcpu = &per_cpu(cpu_state, cpu); + if (cpu != pcpu->first_cpu) + continue; + if (pcpu->hotplug_thread == current) { + no_wakeup = true; + break; + } + } + + spin_lock_irqsave(&state->pending_lock, flags); + state->pending = true; + spin_unlock_irqrestore(&state->pending_lock, flags); + + if (no_wakeup) { + spin_lock_irqsave(&state_lock, flags); + mod_timer(&state->timer, jiffies); + spin_unlock_irqrestore(&state_lock, flags); + } else { + wake_up_process(state->hotplug_thread); + } +} + +static void core_ctl_timer_func(unsigned long cpu) +{ + struct cpu_data *state = &per_cpu(cpu_state, cpu); + unsigned long flags; + + if (eval_need(state)) { + spin_lock_irqsave(&state->pending_lock, flags); + state->pending = true; + spin_unlock_irqrestore(&state->pending_lock, flags); + wake_up_process(state->hotplug_thread); + } + +} + +static int core_ctl_online_core(unsigned int cpu) +{ + int ret; + struct device *dev; + + lock_device_hotplug(); + dev = get_cpu_device(cpu); + if (!dev) { + pr_err("%s: failed to get cpu%d device\n", __func__, cpu); + ret = -ENODEV; + } else { + ret = device_online(dev); + } + unlock_device_hotplug(); + return ret; +} + +static int core_ctl_offline_core(unsigned int cpu) +{ + int ret; + struct device *dev; + + lock_device_hotplug(); + dev = get_cpu_device(cpu); + if (!dev) { + pr_err("%s: failed to get cpu%d device\n", __func__, cpu); + ret = -ENODEV; + } else { + ret = device_offline(dev); + } + unlock_device_hotplug(); + return ret; +} + +static void __ref do_hotplug(struct cpu_data *f) +{ + unsigned int need; + struct cpu_data *c, *tmp; + + need = apply_limits(f, f->need_cpus); + pr_debug("Trying to adjust group %u to %u\n", f->first_cpu, need); + + if (f->online_cpus > need) { + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (!c->online) + continue; + + if (f->online_cpus == need) + break; + + /* Don't offline busy CPUs. */ + if (c->is_busy) + continue; + + pr_debug("Trying to Offline CPU%u\n", c->cpu); + if (core_ctl_offline_core(c->cpu)) + pr_debug("Unable to Offline CPU%u\n", c->cpu); + } + + /* + * If the number of online CPUs is within the limits, then + * don't force any busy CPUs offline. + */ + if (f->online_cpus <= f->max_cpus) + return; + + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (!c->online) + continue; + + if (f->online_cpus <= f->max_cpus) + break; + + pr_debug("Trying to Offline CPU%u\n", c->cpu); + if (core_ctl_offline_core(c->cpu)) + pr_debug("Unable to Offline CPU%u\n", c->cpu); + } + } else if (f->online_cpus < need) { + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (c->online || c->rejected || c->not_preferred) + continue; + if (f->online_cpus == need) + break; + + pr_debug("Trying to Online CPU%u\n", c->cpu); + if (core_ctl_online_core(c->cpu)) + pr_debug("Unable to Online CPU%u\n", c->cpu); + } + + if (f->online_cpus == need) + return; + + + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (c->online || c->rejected || !c->not_preferred) + continue; + if (f->online_cpus == need) + break; + + pr_debug("Trying to Online CPU%u\n", c->cpu); + if (core_ctl_online_core(c->cpu)) + pr_debug("Unable to Online CPU%u\n", c->cpu); + } + + } +} + +static int __ref try_hotplug(void *data) +{ + struct cpu_data *f = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&f->pending_lock, flags); + if (!f->pending) { + spin_unlock_irqrestore(&f->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&f->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + f->pending = false; + spin_unlock_irqrestore(&f->pending_lock, flags); + + do_hotplug(f); + } + + return 0; +} + +static int __ref cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + uint32_t cpu = (uintptr_t)hcpu; + struct cpu_data *state = &per_cpu(cpu_state, cpu); + struct cpu_data *f; + int ret = NOTIFY_OK; + unsigned long flags; + + /* Don't affect suspend resume */ + if (action & CPU_TASKS_FROZEN) + return NOTIFY_OK; + + if (unlikely(!state->inited)) + return NOTIFY_OK; + + f = &per_cpu(cpu_state, state->first_cpu); + + switch (action) { + case CPU_UP_PREPARE: + + /* If online state of CPU somehow got out of sync, fix it. */ + if (state->online) { + f->online_cpus--; + state->online = false; + pr_warn("CPU%d offline when state is online\n", cpu); + } + + if (state->rejected) { + state->rejected = false; + f->avail_cpus++; + } + + /* + * If a CPU is in the process of coming up, mark it as online + * so that there's no race with hotplug thread bringing up more + * CPUs than necessary. + */ + if (apply_limits(f, f->need_cpus) <= f->online_cpus) { + pr_debug("Prevent CPU%d onlining\n", cpu); + ret = NOTIFY_BAD; + } else { + state->online = true; + f->online_cpus++; + } + break; + + case CPU_ONLINE: + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + spin_lock_irqsave(&state_lock, flags); + list_del(&state->sib); + list_add_tail(&state->sib, &f->lru); + spin_unlock_irqrestore(&state_lock, flags); + break; + + case CPU_DEAD: + /* Move a CPU to the end of the LRU when it goes offline. */ + spin_lock_irqsave(&state_lock, flags); + list_del(&state->sib); + list_add_tail(&state->sib, &f->lru); + spin_unlock_irqrestore(&state_lock, flags); + + /* Fall through */ + + case CPU_UP_CANCELED: + + /* If online state of CPU somehow got out of sync, fix it. */ + if (!state->online) { + f->online_cpus++; + pr_warn("CPU%d online when state is offline\n", cpu); + } + + if (!state->rejected && action == CPU_UP_CANCELED) { + state->rejected = true; + f->avail_cpus--; + } + + state->online = false; + state->busy = 0; + f->online_cpus--; + break; + } + + if (f->online_cpus < apply_limits(f, f->need_cpus) + && f->online_cpus < f->avail_cpus + && action == CPU_DEAD) + wake_up_hotplug_thread(f); + + return ret; +} + +static struct notifier_block __refdata cpu_notifier = { + .notifier_call = cpu_callback, +}; + +/* ============================ init code ============================== */ + +static int group_init(struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cpu_data *f = &per_cpu(cpu_state, first_cpu); + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + if (likely(f->inited)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + f->num_cpus = cpumask_weight(mask); + if (f->num_cpus > MAX_CPUS_PER_GROUP) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + f->min_cpus = 1; + f->max_cpus = f->num_cpus; + f->need_cpus = f->num_cpus; + f->avail_cpus = f->num_cpus; + f->offline_delay_ms = 100; + f->task_thres = UINT_MAX; + f->nrrun = f->num_cpus; + INIT_LIST_HEAD(&f->lru); + init_timer(&f->timer); + spin_lock_init(&f->pending_lock); + f->timer.function = core_ctl_timer_func; + f->timer.data = first_cpu; + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cpu = cpu; + state->first_cpu = first_cpu; + + if (cpu_online(cpu)) { + f->online_cpus++; + state->online = true; + } + + list_add_tail(&state->sib, &f->lru); + } + + f->hotplug_thread = kthread_run(try_hotplug, (void *) f, + "core_ctl/%d", first_cpu); + sched_setscheduler_nocheck(f->hotplug_thread, SCHED_FIFO, ¶m); + + for_each_cpu(cpu, mask) { + state = &per_cpu(cpu_state, cpu); + state->inited = true; + } + + kobject_init(&f->kobj, &ktype_core_ctl); + return kobject_add(&f->kobj, &dev->kobj, "core_ctl"); +} + +static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + + switch (val) { + case CPUFREQ_CREATE_POLICY: + group_init(policy->related_cpus); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_pol_nb = { + .notifier_call = cpufreq_policy_cb, +}; + +static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_govinfo *info = data; + + switch (val) { + case CPUFREQ_LOAD_CHANGE: + core_ctl_set_busy(info->cpu, info->load); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_gov_nb = { + .notifier_call = cpufreq_gov_cb, +}; + +static int __init core_ctl_init(void) +{ + struct cpufreq_policy *policy; + unsigned int cpu; + + register_cpu_notifier(&cpu_notifier); + cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER); + cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER); + init_timer_deferrable(&rq_avg_timer); + rq_avg_timer.function = rq_avg_timer_func; + + get_online_cpus(); + for_each_online_cpu(cpu) { + policy = cpufreq_cpu_get(cpu); + if (policy) { + group_init(policy->related_cpus); + cpufreq_cpu_put(policy); + } + } + put_online_cpus(); + mod_timer(&rq_avg_timer, round_to_nw_start()); + return 0; +} + +late_initcall(core_ctl_init); -- cgit v1.2.3