diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cpu.c | 17 | ||||
| -rw-r--r-- | kernel/events/core.c | 3 | ||||
| -rw-r--r-- | kernel/irq/cpuhotplug.c | 11 | ||||
| -rw-r--r-- | kernel/power/qos.c | 7 | ||||
| -rw-r--r-- | kernel/sched/Makefile | 1 | ||||
| -rw-r--r-- | kernel/sched/core.c | 295 | ||||
| -rw-r--r-- | kernel/sched/core_ctl.c | 1055 | ||||
| -rw-r--r-- | kernel/sched/core_ctl.h | 24 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 74 | ||||
| -rw-r--r-- | kernel/sched/hmp.c | 33 | ||||
| -rw-r--r-- | kernel/sched/rt.c | 13 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 5 | ||||
| -rw-r--r-- | kernel/smp.c | 4 | ||||
| -rw-r--r-- | kernel/sysctl.c | 23 | ||||
| -rw-r--r-- | kernel/time/hrtimer.c | 74 | ||||
| -rw-r--r-- | kernel/time/timer.c | 68 | ||||
| -rw-r--r-- | kernel/trace/power-traces.c | 2 | ||||
| -rw-r--r-- | kernel/watchdog.c | 22 |
18 files changed, 1622 insertions, 109 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c index 1cfd381642da..3c97f5b88a07 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -768,6 +768,10 @@ static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); EXPORT_SYMBOL(cpu_active_mask); +static DECLARE_BITMAP(cpu_isolated_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_isolated_mask = to_cpumask(cpu_isolated_bits); +EXPORT_SYMBOL(cpu_isolated_mask); + void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) @@ -802,6 +806,14 @@ void set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); } +void set_cpu_isolated(unsigned int cpu, bool isolated) +{ + if (isolated) + cpumask_set_cpu(cpu, to_cpumask(cpu_isolated_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_isolated_bits)); +} + void init_cpu_present(const struct cpumask *src) { cpumask_copy(to_cpumask(cpu_present_bits), src); @@ -817,6 +829,11 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(to_cpumask(cpu_online_bits), src); } +void init_cpu_isolated(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_isolated_bits), src); +} + static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) diff --git a/kernel/events/core.c b/kernel/events/core.c index d6ec580584b6..5beb88f11671 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3386,7 +3386,8 @@ static int perf_event_read(struct perf_event *event, bool group) * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { + if (event->state == PERF_EVENT_STATE_ACTIVE && + !cpu_isolated(event->oncpu)) { struct perf_read_data data = { .event = event, .group = group, diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 011f8c4c63da..104432f3d311 100644 --- a/kernel/irq/cpuhotplug.c +++ 
b/kernel/irq/cpuhotplug.c @@ -11,6 +11,7 @@ #include <linux/interrupt.h> #include <linux/ratelimit.h> #include <linux/irq.h> +#include <linux/cpumask.h> #include "internals.h" @@ -20,6 +21,7 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity = d->common->affinity; struct irq_chip *c; bool ret = false; + struct cpumask available_cpus; /* * If this is a per-CPU interrupt, or the affinity does not @@ -29,8 +31,15 @@ static bool migrate_one_irq(struct irq_desc *desc) !cpumask_test_cpu(smp_processor_id(), affinity)) return false; + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - affinity = cpu_online_mask; + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_empty(affinity)) + affinity = cpu_online_mask; ret = true; } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 8ecc7b3f7dd9..69c32c42080f 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -45,6 +45,7 @@ #include <linux/seq_file.h> #include <linux/irq.h> #include <linux/irqdesc.h> +#include <linux/cpumask.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -447,6 +448,9 @@ EXPORT_SYMBOL_GPL(pm_qos_request); int pm_qos_request_for_cpu(int pm_qos_class, int cpu) { + if (cpu_isolated(cpu)) + return INT_MAX; + return pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu]; } EXPORT_SYMBOL(pm_qos_request_for_cpu); @@ -469,6 +473,9 @@ int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask) val = c->default_value; for_each_cpu(cpu, mask) { + if (cpu_isolated(cpu)) + continue; + switch (c->type) { case PM_QOS_MIN: if (c->target_per_cpu[cpu] < val) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 1f159743ebfc..508b65690288 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,3 +20,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += 
auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7474463b9835..609aa2e588d7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> +#include <linux/irq.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -84,6 +85,7 @@ #endif #include "sched.h" +#include "core_ctl.h" #include "../workqueue_internal.h" #include "../smpboot.h" @@ -1229,6 +1231,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq *rq; unsigned int dest_cpu; int ret = 0; + cpumask_t allowed_mask; rq = task_rq_lock(p, &flags); @@ -1244,16 +1247,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + + dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask); if (dest_cpu >= nr_cpu_ids) { - ret = -EINVAL; - goto out; + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { + ret = -EINVAL; + goto out; + } + cpumask_copy(&allowed_mask, new_mask); } do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? 
If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) goto out; if (task_running(rq, p) || p->state == TASK_WAKING) { @@ -1577,12 +1586,13 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ -static int select_fallback_rq(int cpu, struct task_struct *p) +static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; int dest_cpu; + int isolated_candidate = -1; /* * If the node that the cpu is on has been offlined, cpu_to_node() @@ -1598,6 +1608,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; } @@ -1610,6 +1622,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) { + if (allow_iso) + isolated_candidate = dest_cpu; + continue; + } + goto out; + } + + if (isolated_candidate != -1) { + dest_cpu = isolated_candidate; goto out; } @@ -1655,6 +1677,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { + bool allow_isolated = (p->flags & PF_KTHREAD); + lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) @@ -1671,8 +1695,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || - !cpu_online(cpu))) - cpu = select_fallback_rq(task_cpu(p), p); + !cpu_online(cpu)) || + (cpu_isolated(cpu) && !allow_isolated)) + cpu = select_fallback_rq(task_cpu(p), p, allow_isolated); return cpu; } @@ -2956,7 +2981,7 @@ void sched_exec(void) if (dest_cpu == smp_processor_id()) goto unlock; - 
if (likely(cpu_active(dest_cpu))) { + if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) { struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -3066,6 +3091,8 @@ void scheduler_tick(void) if (curr->sched_class == &fair_sched_class) check_for_migration(rq, curr); + + core_ctl_check(wallclock); } #ifdef CONFIG_NO_HZ_FULL @@ -5414,18 +5441,22 @@ static struct task_struct fake_task = { }; /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Migrate all tasks (not pinned if pinned argument say so) from the rq, + * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq(). * * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; + unsigned int num_pinned_kthreads = 1; /* this thread */ + cpumask_t avail_cpus; + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); /* * Fudge the rq selection such that the below task selection loop @@ -5447,10 +5478,12 @@ static void migrate_tasks(struct rq *dead_rq) for (;;) { /* - * There's this thread running, bail when that's the only - * remaining thread. + * There's this thread running + pinned threads, bail when + * that's the only remaining threads. 
*/ - if (rq->nr_running == 1) + if ((migrate_pinned_tasks && rq->nr_running == 1) || + (!migrate_pinned_tasks && + rq->nr_running == num_pinned_kthreads)) break; /* @@ -5461,6 +5494,13 @@ static void migrate_tasks(struct rq *dead_rq) BUG_ON(!next); next->sched_class->put_prev_task(rq, next); + if (!migrate_pinned_tasks && next->flags & PF_KTHREAD && + !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) { + lockdep_unpin_lock(&rq->lock); + num_pinned_kthreads += 1; + continue; + } + /* * Rules for changing task_struct::cpus_allowed are holding * both pi_lock and rq->lock, such that holding either @@ -5486,7 +5526,7 @@ static void migrate_tasks(struct rq *dead_rq) } /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); + dest_cpu = select_fallback_rq(dead_rq->cpu, next, false); rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { @@ -5502,6 +5542,222 @@ static void migrate_tasks(struct rq *dead_rq) rq->stop = stop; } + +static void set_rq_online(struct rq *rq); +static void set_rq_offline(struct rq *rq); + +int do_isolation_work_cpu_stop(void *data) +{ + unsigned long flags; + unsigned int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + watchdog_disable(cpu); + + irq_migrate_all_off_this_cpu(); + + sched_ttwu_pending(); + /* Update our root-domain */ + raw_spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + + migrate_tasks(rq, false); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + /* + * We might have been in tickless state. 
Clear NOHZ flags to avoid + * us being kicked for helping out with balancing + */ + nohz_balance_clear_nohz_mask(cpu); + return 0; +} + +int do_unisolation_work_cpu_stop(void *data) +{ + watchdog_enable(smp_processor_id()); + return 0; +} + +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd); + +static void sched_update_group_capacities(int cpu) +{ + struct sched_domain *sd; + + mutex_lock(&sched_domains_mutex); + rcu_read_lock(); + + for_each_domain(cpu, sd) { + int balance_cpu = group_balance_cpu(sd->groups); + + init_sched_groups_capacity(cpu, sd); + /* + * Need to ensure this is also called with balancing + * cpu. + */ + if (cpu != balance_cpu) + init_sched_groups_capacity(balance_cpu, sd); + } + + rcu_read_unlock(); + mutex_unlock(&sched_domains_mutex); +} + +static unsigned int cpu_isolation_vote[NR_CPUS]; + +int sched_isolate_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + + if (include_offline) { + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, cpu_isolated_mask); + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_and(&count_mask, mask, cpu_isolated_mask); + } + + return cpumask_weight(&count_mask); +} + +/* + * 1) CPU is isolated and cpu is offlined: + * Unisolate the core. + * 2) CPU is not isolated and CPU is offlined: + * No action taken. + * 3) CPU is offline and request to isolate + * Request ignored. + * 4) CPU is offline and isolated: + * Not a possible state. + * 5) CPU is online and request to isolate + * Normal case: Isolate the CPU + * 6) CPU is not isolated and comes back online + * Nothing to do + * + * Note: The client calling sched_isolate_cpu() is responsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). 
+ */ +int sched_isolate_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + cpumask_t avail_cpus; + int ret_code = 0; + u64 start_time; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + lock_device_hotplug(); + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); + + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + ret_code = -EINVAL; + goto out; + } + + if (!cpu_online(cpu)) { + ret_code = -EINVAL; + goto out; + } + + if (++cpu_isolation_vote[cpu] > 1) + goto out; + + set_cpu_isolated(cpu, true); + cpumask_clear_cpu(cpu, &avail_cpus); + + /* Migrate timers */ + smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); + smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + + migrate_sync_cpu(cpu, cpumask_first(&avail_cpus)); + stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + + clear_hmp_request(cpu); + calc_load_migrate(rq); + update_max_interval(); + sched_update_group_capacities(cpu); + +out: + unlock_device_hotplug(); + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 1); + return ret_code; +} + +/* + * Note: The client calling sched_isolate_cpu() is responsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). 
+ */ +int sched_unisolate_cpu_unlocked(int cpu) +{ + int ret_code = 0; + struct rq *rq = cpu_rq(cpu); + u64 start_time; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + lock_device_hotplug_assert(); + + if (!cpu_isolation_vote[cpu]) { + ret_code = -EINVAL; + goto out; + } + + if (--cpu_isolation_vote[cpu]) + goto out; + + if (cpu_online(cpu)) { + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + rq->age_stamp = sched_clock_cpu(cpu); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + set_cpu_isolated(cpu, false); + update_max_interval(); + sched_update_group_capacities(cpu); + + if (cpu_online(cpu)) { + stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0); + + /* Kick CPU to immediately do load balancing */ + if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + smp_send_reschedule(cpu); + } + +out: + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 0); + return ret_code; +} + +int sched_unisolate_cpu(int cpu) +{ + int ret_code; + + lock_device_hotplug(); + ret_code = sched_unisolate_cpu_unlocked(cpu); + unlock_device_hotplug(); + return ret_code; +} + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5748,13 +6004,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); - migrate_sync_cpu(cpu); + migrate_sync_cpu(cpu, smp_processor_id()); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, true); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6509,11 +6765,14 @@ build_sched_groups(struct sched_domain *sd, int cpu) static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct 
sched_group *sg = sd->groups; + cpumask_t avail_mask; WARN_ON(!sg); do { - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + cpumask_andnot(&avail_mask, sched_group_cpus(sg), + cpu_isolated_mask); + sg->group_weight = cpumask_weight(&avail_mask); sg = sg->next; } while (sg != sd->groups); diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 000000000000..d81886da7ca2 --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1055 @@ +/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/kthread.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> + +#include <trace/events/sched.h> + +#define MAX_CPUS_PER_CLUSTER 4 +#define MAX_CLUSTERS 2 + +struct cluster_data { + bool inited; + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int active_cpus; + unsigned int num_cpus; + cpumask_t cpu_mask; + unsigned int need_cpus; + unsigned int task_thres; + s64 last_isolate_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool is_big_cluster; + int nrrun; + bool nrrun_changed; + struct task_struct *core_ctl_thread; + unsigned int first_cpu; + bool boost; + struct kobject kobj; +}; + +struct cpu_data { + bool online; + bool is_busy; + unsigned int busy; + unsigned int cpu; + bool not_preferred; + struct cluster_data *cluster; + struct list_head sib; + bool isolated_by_us; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static struct cluster_data cluster_state[MAX_CLUSTERS]; +static unsigned int num_clusters; + +#define for_each_cluster(cluster, idx) \ + for ((cluster) = &cluster_state[idx]; (idx) < num_clusters;\ + (idx)++, (cluster) = &cluster_state[idx]) + +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cluster_data *state); +static void wake_up_core_ctl_thread(struct cluster_data *state); +static bool initialized; + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + 
wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_min_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_max_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static ssize_t show_task_thres(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(const struct cluster_data *state, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + 
state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_is_big_cluster(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->is_big_cluster = val ? 1 : 0; + return count; +} + +static ssize_t show_is_big_cluster(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster); +} + +static ssize_t show_cpus(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry(c, &state->lru, sib) { + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u (%s)\n", c->cpu, + c->online ? 
"Online" : "Offline"); + } + spin_unlock_irqrestore(&state_lock, flags); + return count; +} + +static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_active_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->active_cpus); +} + +static ssize_t show_global_state(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + struct cluster_data *cluster; + ssize_t count = 0; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + c = &per_cpu(cpu_state, cpu); + if (!c->cluster) + continue; + + cluster = c->cluster; + if (!cluster || !cluster->inited) + continue; + + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU: %u\n", c->cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", c->online); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tActive: %u\n", + !cpu_isolated(c->cpu)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", + cluster->first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", cluster->nrrun); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", cluster->need_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBoost: %u\n", (unsigned int) cluster->boost); + } + + return count; +} + +static ssize_t store_not_preferred(struct cluster_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i; + unsigned int val[MAX_CPUS_PER_CLUSTER]; + unsigned long flags; + int ret; + + ret = sscanf(buf, 
"%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + i = 0; + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry(c, &state->lru, sib) + c->not_preferred = val[i++]; + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + +static ssize_t show_not_preferred(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry(c, &state->lru, sib) + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU:%d %u\n", c->cpu, c->not_preferred); + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(const struct cluster_data *, char *); + ssize_t (*store)(struct cluster_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_rw(offline_delay_ms); +core_ctl_attr_rw(busy_up_thres); +core_ctl_attr_rw(busy_down_thres); +core_ctl_attr_rw(task_thres); +core_ctl_attr_rw(is_big_cluster); +core_ctl_attr_ro(cpus); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(active_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(not_preferred); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &offline_delay_ms.attr, + &busy_up_thres.attr, + &busy_down_thres.attr, + &task_thres.attr, + &is_big_cluster.attr, + &cpus.attr, + &need_cpus.attr, + &active_cpus.attr, + &global_state.attr, + &not_preferred.attr, + NULL +}; + +#define to_cluster_data(k) container_of(k, struct cluster_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct 
kobject *kobj, struct attribute *attr, char *buf) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +#define RQ_AVG_TOLERANCE 2 +#define RQ_AVG_DEFAULT_MS 20 +#define NR_RUNNING_TOLERANCE 5 +static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS; + +static s64 rq_avg_timestamp_ms; + +static void update_running_avg(bool trigger_update) +{ + int avg, iowait_avg, big_avg, old_nrrun; + s64 now; + unsigned long flags; + struct cluster_data *cluster; + unsigned int index = 0; + + spin_lock_irqsave(&state_lock, flags); + + now = ktime_to_ms(ktime_get()); + if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) { + spin_unlock_irqrestore(&state_lock, flags); + return; + } + rq_avg_timestamp_ms = now; + sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg); + + spin_unlock_irqrestore(&state_lock, flags); + + /* + * Round up to the next integer if the average nr running tasks + * is within NR_RUNNING_TOLERANCE/100 of the next integer. + * If normal rounding up is used, it will allow a transient task + * to trigger online event. By the time core is onlined, the task + * has finished. 
+ * Rounding to closest suffers same problem because scheduler + * might only provide running stats per jiffy, and a transient + * task could skew the number for one jiffy. If core control + * samples every 2 jiffies, it will observe 0.5 additional running + * average which rounds up to 1 task. + */ + avg = (avg + NR_RUNNING_TOLERANCE) / 100; + big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100; + + for_each_cluster(cluster, index) { + if (!cluster->inited) + continue; + old_nrrun = cluster->nrrun; + /* + * Big cluster only need to take care of big tasks, but if + * there are not enough big cores, big tasks need to be run + * on little as well. Thus for little's runqueue stat, it + * has to use overall runqueue average, or derive what big + * tasks would have to be run on little. The latter approach + * is not easy to get given core control reacts much slower + * than scheduler, and can't predict scheduler's behavior. + */ + cluster->nrrun = cluster->is_big_cluster ? big_avg : avg; + if (cluster->nrrun != old_nrrun) { + if (trigger_update) + apply_need(cluster); + else + cluster->nrrun_changed = true; + } + } + return; +} + +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(const struct cluster_data *cluster, + unsigned int new_need) +{ + /* unisolate all cores if there are enough tasks */ + if (cluster->nrrun >= cluster->task_thres) + return cluster->num_cpus; + + /* only unisolate more cores if there are tasks to run */ + if (cluster->nrrun > new_need) + return new_need + 1; + + return new_need; +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(const struct cluster_data *cluster, + unsigned int need_cpus) +{ + return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus); +} + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster) +{ + return cluster->num_cpus - + sched_isolate_count(&cluster->cpu_mask, true); +} + +static 
bool is_active(const struct cpu_data *state) +{ + return state->online && !cpu_isolated(state->cpu); +} + +static bool adjustment_possible(const struct cluster_data *cluster, + unsigned int need) +{ + return (need < cluster->active_cpus || (need > cluster->active_cpus && + sched_isolate_count(&cluster->cpu_mask, false))); +} + +static bool eval_need(struct cluster_data *cluster) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + unsigned int active_cpus; + unsigned int new_need; + + if (unlikely(!cluster->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + + if (cluster->boost) { + need_cpus = cluster->max_cpus; + } else { + active_cpus = get_active_cpu_count(cluster); + thres_idx = active_cpus ? active_cpus - 1 : 0; + list_for_each_entry(c, &cluster->lru, sib) { + if (c->busy >= cluster->busy_up_thres[thres_idx]) + c->is_busy = true; + else if (c->busy < cluster->busy_down_thres[thres_idx]) + c->is_busy = false; + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(cluster, need_cpus); + } + new_need = apply_limits(cluster, need_cpus); + need_flag = adjustment_possible(cluster, new_need); + + last_need = cluster->need_cpus; + cluster->need_cpus = new_need; + + if (!need_flag) { + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + if (need_cpus > cluster->active_cpus) { + ret = 1; + } else if (need_cpus < cluster->active_cpus) { + s64 now = ktime_to_ms(ktime_get()); + s64 elapsed = now - cluster->last_isolate_ts; + + ret = elapsed >= cluster->offline_delay_ms; + } + + trace_core_ctl_eval_need(cluster->first_cpu, last_need, need_cpus, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cluster_data *cluster) +{ + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); +} + +static int core_ctl_set_busy(unsigned int cpu, unsigned int busy) +{ + struct cpu_data 
*c = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = c->cluster; + unsigned int old_is_busy = c->is_busy; + + if (!cluster || !cluster->inited) + return 0; + + update_running_avg(false); + if (c->busy == busy && !cluster->nrrun_changed) + return 0; + c->busy = busy; + cluster->nrrun_changed = false; + + apply_need(cluster); + trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy); + return 0; +} + +/* ========================= core count enforcement ==================== */ + +static void wake_up_core_ctl_thread(struct cluster_data *cluster) +{ + unsigned long flags; + + spin_lock_irqsave(&cluster->pending_lock, flags); + cluster->pending = true; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + wake_up_process_no_notif(cluster->core_ctl_thread); +} + +static u64 core_ctl_check_timestamp; +static u64 core_ctl_check_interval; + +static bool do_check(u64 wallclock) +{ + bool do_check = false; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + if ((wallclock - core_ctl_check_timestamp) >= core_ctl_check_interval) { + core_ctl_check_timestamp = wallclock; + do_check = true; + } + spin_unlock_irqrestore(&state_lock, flags); + return do_check; +} + +void core_ctl_set_boost(bool boost) +{ + unsigned int index = 0; + struct cluster_data *cluster; + + for_each_cluster(cluster, index) { + if (cluster->is_big_cluster && cluster->boost != boost) { + cluster->boost = boost; + apply_need(cluster); + } + } +} + +void core_ctl_check(u64 wallclock) +{ + if (unlikely(!initialized)) + return; + + if (do_check(wallclock)) { + unsigned int index = 0; + struct cluster_data *cluster; + + update_running_avg(true); + + for_each_cluster(cluster, index) { + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); + } + } +} + +static void move_cpu_lru(struct cpu_data *cpu_data) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_del(&cpu_data->sib); + list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru); + 
spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_isolate(struct cluster_data *cluster, unsigned int need) +{ + struct cpu_data *c, *tmp; + + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!is_active(c)) + continue; + if (cluster->active_cpus == need) + break; + /* Don't offline busy CPUs. */ + if (c->is_busy) + continue; + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + cluster->last_isolate_ts = ktime_to_ms(ktime_get()); + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + } + + /* + * If the number of active CPUs is within the limits, then + * don't force isolation of any busy CPUs. + */ + if (cluster->active_cpus <= cluster->max_cpus) + return; + + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!is_active(c)) + continue; + if (cluster->active_cpus <= cluster->max_cpus) + break; + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + cluster->last_isolate_ts = ktime_to_ms(ktime_get()); + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + } +} + +static void __try_to_unisolate(struct cluster_data *cluster, + unsigned int need, bool force) +{ + struct cpu_data *c, *tmp; + + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!c->isolated_by_us) + continue; + if ((c->online && !cpu_isolated(c->cpu)) || + (!force && c->not_preferred)) + continue; + if (cluster->active_cpus == need) + break; + + pr_debug("Trying to unisolate CPU%u\n", c->cpu); + if (!sched_unisolate_cpu(c->cpu)) { + c->isolated_by_us = false; + move_cpu_lru(c); + } else { + pr_debug("Unable to unisolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + } +} + +static void try_to_unisolate(struct cluster_data 
*cluster, unsigned int need) +{ + bool force_use_non_preferred = false; + + __try_to_unisolate(cluster, need, force_use_non_preferred); + + if (cluster->active_cpus == need) + return; + + force_use_non_preferred = true; + __try_to_unisolate(cluster, need, force_use_non_preferred); +} + +static void __ref do_core_ctl(struct cluster_data *cluster) +{ + unsigned int need; + + need = apply_limits(cluster, cluster->need_cpus); + + if (adjustment_possible(cluster, need)) { + pr_debug("Trying to adjust group %u from %u to %u\n", + cluster->first_cpu, cluster->active_cpus, need); + + if (cluster->active_cpus > need) + try_to_isolate(cluster, need); + else if (cluster->active_cpus < need) + try_to_unisolate(cluster, need); + } +} + +static int __ref try_core_ctl(void *data) +{ + struct cluster_data *cluster = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&cluster->pending_lock, flags); + if (!cluster->pending) { + spin_unlock_irqrestore(&cluster->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&cluster->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + cluster->pending = false; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + do_core_ctl(cluster); + } + + return 0; +} + +static int __ref cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + uint32_t cpu = (uintptr_t)hcpu; + struct cpu_data *state = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = state->cluster; + unsigned int need; + int ret = NOTIFY_OK; + + /* Don't affect suspend resume */ + if (action & CPU_TASKS_FROZEN) + return NOTIFY_OK; + + if (unlikely(!cluster || !cluster->inited)) + return NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + + /* If online state of CPU somehow got out of sync, fix it. 
*/ + if (state->online) { + state->online = false; + cluster->active_cpus = get_active_cpu_count(cluster); + pr_warn("CPU%d offline when state is online\n", cpu); + } + break; + + case CPU_ONLINE: + + state->online = true; + cluster->active_cpus = get_active_cpu_count(cluster); + + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + move_cpu_lru(state); + break; + + case CPU_DEAD: + /* + * We don't want to have a CPU both offline and isolated. + * So unisolate a CPU that went down if it was isolated by us. + */ + if (state->isolated_by_us) { + sched_unisolate_cpu_unlocked(cpu); + state->isolated_by_us = false; + } + + /* Move a CPU to the end of the LRU when it goes offline. */ + move_cpu_lru(state); + + /* Fall through */ + + case CPU_UP_CANCELED: + + /* If online state of CPU somehow got out of sync, fix it. */ + if (!state->online) + pr_warn("CPU%d online when state is offline\n", cpu); + + state->online = false; + state->busy = 0; + cluster->active_cpus = get_active_cpu_count(cluster); + break; + } + + need = apply_limits(cluster, cluster->need_cpus); + if (adjustment_possible(cluster, need)) + wake_up_core_ctl_thread(cluster); + + return ret; +} + +static struct notifier_block __refdata cpu_notifier = { + .notifier_call = cpu_callback, +}; + +/* ============================ init code ============================== */ + +static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu) +{ + unsigned int i; + + for (i = 0; i < num_clusters; ++i) { + if (cluster_state[i].first_cpu == first_cpu) + return &cluster_state[i]; + } + + return NULL; +} + +static int cluster_init(const struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cluster_data *cluster; + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { 
.sched_priority = MAX_RT_PRIO-1 }; + + if (find_cluster_by_first_cpu(first_cpu)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + if (num_clusters == MAX_CLUSTERS) { + pr_err("Unsupported number of clusters. Only %u supported\n", + MAX_CLUSTERS); + return -EINVAL; + } + cluster = &cluster_state[num_clusters]; + ++num_clusters; + + cpumask_copy(&cluster->cpu_mask, mask); + cluster->num_cpus = cpumask_weight(mask); + if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + cluster->first_cpu = first_cpu; + cluster->min_cpus = 1; + cluster->max_cpus = cluster->num_cpus; + cluster->need_cpus = cluster->num_cpus; + cluster->offline_delay_ms = 100; + cluster->task_thres = UINT_MAX; + cluster->nrrun = cluster->num_cpus; + INIT_LIST_HEAD(&cluster->lru); + spin_lock_init(&cluster->pending_lock); + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cluster = cluster; + state->cpu = cpu; + if (cpu_online(cpu)) + state->online = true; + list_add_tail(&state->sib, &cluster->lru); + } + cluster->active_cpus = get_active_cpu_count(cluster); + + cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster, + "core_ctl/%d", first_cpu); + if (IS_ERR(cluster->core_ctl_thread)) + return PTR_ERR(cluster->core_ctl_thread); + + sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO, + &param); + + cluster->inited = true; + + kobject_init(&cluster->kobj, &ktype_core_ctl); + return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl"); +} + +static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + int ret; + + switch (val) { + case CPUFREQ_CREATE_POLICY: + ret = cluster_init(policy->related_cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n", ret); + break; + } + + return NOTIFY_OK; 
+} + +static struct notifier_block cpufreq_pol_nb = { + .notifier_call = cpufreq_policy_cb, +}; + +static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_govinfo *info = data; + + switch (val) { + case CPUFREQ_LOAD_CHANGE: + core_ctl_set_busy(info->cpu, info->load); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_gov_nb = { + .notifier_call = cpufreq_gov_cb, +}; + +static int __init core_ctl_init(void) +{ + unsigned int cpu; + + core_ctl_check_interval = (rq_avg_period_ms - RQ_AVG_TOLERANCE) + * NSEC_PER_MSEC; + + register_cpu_notifier(&cpu_notifier); + cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER); + cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER); + + lock_device_hotplug(); + for_each_online_cpu(cpu) { + struct cpufreq_policy *policy; + int ret; + + policy = cpufreq_cpu_get(cpu); + if (policy) { + ret = cluster_init(policy->related_cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n" + , ret); + cpufreq_cpu_put(policy); + } + } + unlock_device_hotplug(); + initialized = true; + return 0; +} + +late_initcall(core_ctl_init); diff --git a/kernel/sched/core_ctl.h b/kernel/sched/core_ctl.h new file mode 100644 index 000000000000..3b0c12acb9c0 --- /dev/null +++ b/kernel/sched/core_ctl.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +#ifdef CONFIG_SCHED_CORE_CTL +void core_ctl_check(u64 wallclock); +void core_ctl_set_boost(bool boost); +#else +static inline void core_ctl_check(u64 wallclock) {} +static inline void core_ctl_set_boost(bool boost) {} +#endif +#endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e893b0fcac6b..83da13b5f6b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2941,6 +2941,8 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, struct cpumask search_cpus; cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus); + cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask); + if (env->ignore_prev_cpu) cpumask_clear_cpu(env->prev_cpu, &search_cpus); @@ -3009,7 +3011,8 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) prev_cpu = env->prev_cpu; if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) || - unlikely(!cpu_active(prev_cpu))) + unlikely(!cpu_active(prev_cpu)) || + cpu_isolated(prev_cpu)) return false; if (task->ravg.mark_start - task->last_cpu_selected_ts >= @@ -7354,6 +7357,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); + if (cpumask_test_cpu(cpu, cpu_isolated_mask)) + continue; /* * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the @@ -7381,7 +7386,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + cpumask_t *cpus = sched_group_cpus(group); + + /* Revisit this later. 
This won't work for MT domain */ + if (!cpu_isolated(cpumask_first(cpus))) + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } @@ -7521,6 +7530,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, power_cost(i, 0), cpu_temp(i)); + if (cpu_isolated(i)) + continue; + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -7548,17 +7560,27 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + sgs->group_no_capacity = 1; + sgs->group_type = group_other; + sgs->group_weight = group->group_weight; + } else { + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / + sgs->group_capacity; - if (sgs->sum_nr_running) - sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->group_weight = group->group_weight; - sgs->group_weight = group->group_weight; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(group, sgs, env); + } - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs, env); + if (sgs->sum_nr_running) + sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; } #ifdef CONFIG_SCHED_HMP @@ -8601,6 +8623,9 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + idle_enter_fair(this_rq); /* @@ -8908,16 +8933,21 @@ static void nohz_balancer_kick(int type) return; } +void nohz_balance_clear_nohz_mask(int cpu) +{ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + 
cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } +} + static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* * Completely isolated CPUs don't ever set, so we must test. */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + nohz_balance_clear_nohz_mask(cpu); clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -8974,7 +9004,7 @@ void nohz_balance_enter_idle(int cpu) /* * If we're a completely isolated CPU, we don't play. */ - if (on_null_domain(cpu_rq(cpu))) + if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu)) return; cpumask_set_cpu(cpu, nohz.idle_cpus_mask); @@ -9003,7 +9033,13 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + cpumask_t avail_mask; + unsigned int available_cpus; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -9342,8 +9378,10 @@ void trigger_load_balance(struct rq *rq) { int type = NOHZ_KICK_ANY; - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. 
+ */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 5002619961ce..6e1757aa1541 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -19,6 +19,7 @@ #include <linux/syscore_ops.h> #include "sched.h" +#include "core_ctl.h" #include <trace/events/sched.h> @@ -1090,6 +1091,8 @@ int sched_set_boost(int enable) if (!old_refcount && boost_refcount) boost_kick_cpus(); + if (boost_refcount <= 1) + core_ctl_set_boost(boost_refcount == 1); trace_sched_set_boost(boost_refcount); spin_unlock_irqrestore(&boost_lock, flags); @@ -1499,28 +1502,10 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write, if (write && (old_val == *data)) goto done; - /* - * Special handling for sched_freq_aggregate_threshold_pct - * which can be greater than 100. Use 1000 as an upper bound - * value which works for all practical use cases. - */ - if (data == &sysctl_sched_freq_aggregate_threshold_pct) { - if (*data > 1000) { - *data = old_val; - ret = -EINVAL; - goto done; - } - } else if (data != &sysctl_sched_select_prev_cpu_us) { - /* - * all tunables other than sched_select_prev_cpu_us are - * in percentage. 
- */ - if (sysctl_sched_downmigrate_pct > - sysctl_sched_upmigrate_pct || *data > 100) { - *data = old_val; - ret = -EINVAL; - goto done; - } + if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) { + *data = old_val; + ret = -EINVAL; + goto done; } /* @@ -2828,10 +2813,10 @@ void set_window_start(struct rq *rq) rq->curr->ravg.mark_start = rq->window_start; } -void migrate_sync_cpu(int cpu) +void migrate_sync_cpu(int cpu, int new_cpu) { if (cpu == sync_cpu) - sync_cpu = smp_processor_id(); + sync_cpu = new_cpu; } static void reset_all_task_stats(void) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index cfec881491ef..ba4403e910d8 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -265,8 +265,12 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { - /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + /* + * Try to pull RT tasks here if we lower this rq's prio and cpu is not + * isolated + */ + return rq->rt.highest_prio.curr > prev->prio && + !cpu_isolated(cpu_of(rq)); } static inline int rt_overloaded(struct rq *rq) @@ -1694,6 +1698,8 @@ static int find_lowest_rq_hmp(struct task_struct *task) for_each_sched_cluster(cluster) { cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); + cpumask_andnot(&candidate_mask, &candidate_mask, + cpu_isolated_mask); if (cpumask_empty(&candidate_mask)) continue; @@ -2282,7 +2288,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. 
*/ - if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || + cpu_isolated(cpu_of(rq))) return; queue_pull_task(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec7721112b05..41abb4dabeb7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1069,7 +1069,7 @@ extern void clear_boost_kick(int cpu); extern void clear_hmp_request(int cpu); extern void mark_task_starting(struct task_struct *p); extern void set_window_start(struct rq *rq); -extern void migrate_sync_cpu(int cpu); +extern void migrate_sync_cpu(int cpu, int new_cpu); extern void update_cluster_topology(void); extern void set_task_last_wake(struct task_struct *p, u64 wallclock); extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); @@ -1424,7 +1424,7 @@ static inline void clear_boost_kick(int cpu) { } static inline void clear_hmp_request(int cpu) { } static inline void mark_task_starting(struct task_struct *p) { } static inline void set_window_start(struct rq *rq) { } -static inline void migrate_sync_cpu(int cpu) { } +static inline void migrate_sync_cpu(int cpu, int new_cpu) {} static inline void update_cluster_topology(void) { } static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) { } static inline void set_task_last_switch_out(struct task_struct *p, @@ -1953,6 +1953,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); +extern void nohz_balance_clear_nohz_mask(int cpu); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); diff --git a/kernel/smp.c b/kernel/smp.c index abdc48cd79a3..b2ec21c5c9d6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -766,8 +766,8 @@ void wake_up_all_idle_cpus(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; - - wake_up_if_idle(cpu); + if (!cpu_isolated(cpu)) + 
wake_up_if_idle(cpu); } preempt_enable(); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 07fef40d1274..dad3324e7372 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -130,6 +130,9 @@ static int one_hundred = 100; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif +#ifdef CONFIG_SCHED_HMP +static int one_thousand = 1000; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -336,6 +339,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_spill_nr_run", @@ -351,6 +356,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_downmigrate", @@ -358,6 +365,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_init_task_load", @@ -365,6 +374,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_select_prev_cpu_us", @@ -372,6 +383,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, }, { .procname = "sched_enable_colocation", @@ -397,6 +409,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_big_waker_task_load", @@ -404,6 +418,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 
0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_enable_thread_grouping", @@ -440,6 +456,13 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + /* + * Special handling for sched_freq_aggregate_threshold_pct + * which can be greater than 100. Use 1000 as an upper bound + * value which works for all practical use cases. + */ + .extra2 = &one_thousand, }, { .procname = "sched_boost", diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa909f9fd559..1b0117198a08 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -880,7 +880,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + timer->state |= HRTIMER_STATE_ENQUEUED; return timerqueue_add(&base->active, &timer->node); } @@ -900,11 +900,9 @@ static void __remove_hrtimer(struct hrtimer *timer, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; - timer->state = newstate; - if (!(state & HRTIMER_STATE_ENQUEUED)) - return; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); @@ -921,6 +919,13 @@ static void __remove_hrtimer(struct hrtimer *timer, if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); #endif + +out: + /* + * We need to preserve PINNED state here, otherwise we may end up + * migrating pinned hrtimers as well. 
+ */ + timer->state = newstate | (timer->state & HRTIMER_STATE_PINNED); } /* @@ -949,6 +954,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest state = HRTIMER_STATE_INACTIVE; __remove_hrtimer(timer, base, state, reprogram); + timer->state &= ~HRTIMER_STATE_PINNED; return 1; } return 0; @@ -1002,6 +1008,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, timer_stats_hrtimer_set_start_info(timer); + /* Update pinned state */ + timer->state &= ~HRTIMER_STATE_PINNED; + timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT; + leftmost = enqueue_hrtimer(timer, new_base); if (!leftmost) goto unlock; @@ -1176,8 +1186,8 @@ bool hrtimer_active(const struct hrtimer *timer) cpu_base = READ_ONCE(timer->base->cpu_base); seq = raw_read_seqcount_begin(&cpu_base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || - cpu_base->running == timer) + if (((timer->state & ~HRTIMER_STATE_PINNED) != + HRTIMER_STATE_INACTIVE) || cpu_base->running == timer) return true; } while (read_seqcount_retry(&cpu_base->seq, seq) || @@ -1614,13 +1624,17 @@ static void init_hrtimers_cpu(int cpu) hrtimer_init_hres(cpu_base); } -#ifdef CONFIG_HOTPLUG_CPU - +#if defined(CONFIG_HOTPLUG_CPU) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) + struct hrtimer_clock_base *new_base, + bool remove_pinned) { struct hrtimer *timer; struct timerqueue_node *node; + struct timerqueue_head pinned; + int is_pinned; + + timerqueue_init_head(&pinned); while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); @@ -1633,6 +1647,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + + is_pinned = timer->state & HRTIMER_STATE_PINNED; + if (!remove_pinned && is_pinned) { + timerqueue_add(&pinned, &timer->node); + continue; + } + 
timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -1644,17 +1665,23 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, */ enqueue_hrtimer(timer, new_base); } + + /* Re-queue pinned timers for non-hotplug usecase */ + while ((node = timerqueue_getnext(&pinned))) { + timer = container_of(node, struct hrtimer, node); + + timerqueue_del(&pinned, &timer->node); + enqueue_hrtimer(timer, old_base); + } } -static void migrate_hrtimers(int scpu) +static void __migrate_hrtimers(int scpu, bool remove_pinned) { struct hrtimer_cpu_base *old_base, *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); - - local_irq_disable(); + local_irq_save(flags); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); /* @@ -1666,7 +1693,7 @@ static void migrate_hrtimers(int scpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + &new_base->clock_base[i], remove_pinned); } raw_spin_unlock(&old_base->lock); @@ -1674,7 +1701,20 @@ static void migrate_hrtimers(int scpu) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); +} + +static void migrate_hrtimers(int scpu) +{ + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + __migrate_hrtimers(scpu, true); +} + +void hrtimer_quiesce_cpu(void *cpup) +{ + __migrate_hrtimers(*(int *)cpup, false); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 51896272fcde..0efb3916f5a4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1620,56 +1620,86 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -#ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +#if defined(CONFIG_HOTPLUG_CPU) +static 
void migrate_timer_list(struct tvec_base *new_base, + struct hlist_head *head, bool remove_pinned) { struct timer_list *timer; int cpu = new_base->cpu; + struct hlist_node *n; + int is_pinned; - while (!hlist_empty(head)) { - timer = hlist_entry(head->first, struct timer_list, entry); - /* We ignore the accounting on the dying cpu */ - detach_timer(timer, false); + hlist_for_each_entry_safe(timer, n, head, entry) { + is_pinned = timer->flags & TIMER_PINNED_ON_CPU; + if (!remove_pinned && is_pinned) + continue; + + detach_if_pending(timer, get_timer_base(timer->flags), false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } -static void migrate_timers(int cpu) +static void __migrate_timers(int cpu, bool wait, bool remove_pinned) { struct tvec_base *old_base; struct tvec_base *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(cpu)); old_base = per_cpu_ptr(&tvec_bases, cpu); new_base = get_cpu_ptr(&tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
*/ - spin_lock_irq(&new_base->lock); + spin_lock_irqsave(&new_base->lock, flags); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - BUG_ON(old_base->running_timer); + if (wait) { + /* Ensure timers are done running before continuing */ + while (old_base->running_timer) { + spin_unlock(&old_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); + cpu_relax(); + spin_lock_irqsave(&new_base->lock, flags); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + } + } else { + BUG_ON(old_base->running_timer); + } for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); + migrate_timer_list(new_base, old_base->tv1.vec + i, + remove_pinned); for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); + migrate_timer_list(new_base, old_base->tv2.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv3.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv4.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv5.vec + i, + remove_pinned); } - old_base->active_timers = 0; - old_base->all_timers = 0; - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); put_cpu_ptr(&tvec_bases); } +/* Migrate timers from 'cpu' to this_cpu */ +static void migrate_timers(int cpu) +{ + BUG_ON(cpu_online(cpu)); + __migrate_timers(cpu, false, true); +} + +void timer_quiesce_cpu(void *cpup) +{ + __migrate_timers(*(int *)cpup, true, false); +} + static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9270e1ac6460..49fa2e6eea98 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,5 +15,3 @@ 
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); -EXPORT_TRACEPOINT_SYMBOL(core_ctl_set_busy); -EXPORT_TRACEPOINT_SYMBOL(core_ctl_eval_need); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 029da92fb712..7f21591c8ec5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -13,6 +13,7 @@ #include <linux/mm.h> #include <linux/cpu.h> +#include <linux/device.h> #include <linux/nmi.h> #include <linux/init.h> #include <linux/module.h> @@ -95,6 +96,7 @@ static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(unsigned int, watchdog_en); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); @@ -586,9 +588,17 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) sched_setscheduler(current, policy, &param); } -static void watchdog_enable(unsigned int cpu) +/* Must be called with hotplug lock (lock_device_hotplug()) held. */ +void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = raw_cpu_ptr(&watchdog_en); + + lock_device_hotplug_assert(); + + if (*enabled) + return; + *enabled = 1; /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -606,9 +616,17 @@ static void watchdog_enable(unsigned int cpu) __touch_watchdog(); } -static void watchdog_disable(unsigned int cpu) +/* Must be called with hotplug lock (lock_device_hotplug()) held. 
*/ +void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = raw_cpu_ptr(&watchdog_en); + + lock_device_hotplug_assert(); + + if (!*enabled) + return; + *enabled = 0; watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); |
