| field | value | date |
|---|---|---|
| author | Linux Build Service Account <lnxbuild@localhost> | 2016-09-29 11:20:17 -0700 |
| committer | Gerrit - the friendly Code Review server <code-review@localhost> | 2016-09-29 11:20:17 -0700 |
| commit | bbf8724641bd78107e8a641e6a87d89241f7b4ee | |
| tree | e3625abd853f3e16b704222b170280920d5e6b15 | |
| parent | a18b94fe05a311bf084cb235c0904c48eb04149f | |
| parent | 59f16ae0345c902c1d09da75e0f89d7e7ddbc54f | |
Merge "core_ctrl: Move core control into kernel"
42 files changed, 1724 insertions, 248 deletions
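The drivers/base/cpu.c hunk in the diff below adds a per-CPU `isolate` sysfs attribute whose store handler calls sched_isolate_cpu()/sched_unisolate_cpu(). A minimal userspace sketch of toggling that attribute follows; it assumes the node surfaces at /sys/devices/system/cpu/cpuN/isolate (the usual location for cpu device attributes), and the helper name is purely illustrative, not part of this change.

```c
/* Illustrative only: toggle the per-CPU "isolate" attribute added by this
 * change. Assumes it is exposed as /sys/devices/system/cpu/cpu<N>/isolate
 * and that the caller has permission to write it (typically root). */
#include <stdio.h>
#include <errno.h>

static int set_cpu_isolated(unsigned int cpu, int isolated)
{
	char path[64];
	FILE *fp;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%u/isolate", cpu);

	fp = fopen(path, "w");
	if (!fp)
		return -errno;

	/* The store handler accepts only 0 or 1 and rejects anything else. */
	fprintf(fp, "%d\n", isolated ? 1 : 0);
	fclose(fp);
	return 0;
}

int main(void)
{
	/* Example: isolate CPU3, then bring it back. */
	if (set_cpu_isolated(3, 1))
		perror("isolate cpu3");
	if (set_cpu_isolated(3, 0))
		perror("unisolate cpu3");
	return 0;
}
```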
diff --git a/arch/arm/configs/msmcortex_defconfig b/arch/arm/configs/msmcortex_defconfig index 0a20c52bd3b2..2b35b0f12787 100644 --- a/arch/arm/configs/msmcortex_defconfig +++ b/arch/arm/configs/msmcortex_defconfig @@ -459,7 +459,6 @@ CONFIG_TRACER_PKT=y CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MEM_SHARE_QMI_SERVICE=y diff --git a/arch/arm/configs/msmfalcon_defconfig b/arch/arm/configs/msmfalcon_defconfig index 0a20c52bd3b2..2b35b0f12787 100644 --- a/arch/arm/configs/msmfalcon_defconfig +++ b/arch/arm/configs/msmfalcon_defconfig @@ -459,7 +459,6 @@ CONFIG_TRACER_PKT=y CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MEM_SHARE_QMI_SERVICE=y diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index 1d45320ee125..f56a831de043 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -37,6 +37,7 @@ #include <linux/kallsyms.h> #include <linux/proc_fs.h> #include <linux/export.h> +#include <linux/cpumask.h> #include <asm/hardware/cache-l2x0.h> #include <asm/hardware/cache-uniphier.h> @@ -127,6 +128,7 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity = irq_data_get_affinity_mask(d); struct irq_chip *c; bool ret = false; + struct cpumask available_cpus; /* * If this is a per-CPU interrupt, or the affinity does not @@ -135,8 +137,15 @@ static bool migrate_one_irq(struct irq_desc *desc) if (irqd_is_per_cpu(d) || !cpumask_test_cpu(smp_processor_id(), affinity)) return false; + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - affinity = cpu_online_mask; + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_empty(affinity)) + affinity = cpu_online_mask; ret = true; } diff --git a/arch/arm64/configs/msm-perf_defconfig b/arch/arm64/configs/msm-perf_defconfig index 84bb603c3142..7f31331933bb 100644 --- a/arch/arm64/configs/msm-perf_defconfig +++ b/arch/arm64/configs/msm-perf_defconfig @@ -525,7 +525,6 @@ CONFIG_MSM_PIL_MSS_QDSP6V5=y CONFIG_TRACER_PKT=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y CONFIG_MSM_RPM_LOG=y CONFIG_MSM_RPM_STATS_LOG=y diff --git a/arch/arm64/configs/msm_defconfig b/arch/arm64/configs/msm_defconfig index 6119ff12d46d..89bee1463421 100644 --- a/arch/arm64/configs/msm_defconfig +++ b/arch/arm64/configs/msm_defconfig @@ -527,7 +527,6 @@ CONFIG_TRACER_PKT=y CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y diff --git a/arch/arm64/configs/msmcortex-perf_defconfig b/arch/arm64/configs/msmcortex-perf_defconfig index 08288e1b5c25..7667669e8aa4 100644 --- a/arch/arm64/configs/msmcortex-perf_defconfig +++ b/arch/arm64/configs/msmcortex-perf_defconfig @@ -525,7 +525,6 @@ CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y diff --git a/arch/arm64/configs/msmcortex_defconfig b/arch/arm64/configs/msmcortex_defconfig 
index 9e2727c4fe1e..c267345858cf 100644 --- a/arch/arm64/configs/msmcortex_defconfig +++ b/arch/arm64/configs/msmcortex_defconfig @@ -544,7 +544,6 @@ CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y diff --git a/arch/arm64/configs/msmfalcon-perf_defconfig b/arch/arm64/configs/msmfalcon-perf_defconfig index 39c2d3f71c5a..6ebd60b43c71 100644 --- a/arch/arm64/configs/msmfalcon-perf_defconfig +++ b/arch/arm64/configs/msmfalcon-perf_defconfig @@ -522,7 +522,6 @@ CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y diff --git a/arch/arm64/configs/msmfalcon_defconfig b/arch/arm64/configs/msmfalcon_defconfig index a277038b3fc3..01324d89e79e 100644 --- a/arch/arm64/configs/msmfalcon_defconfig +++ b/arch/arm64/configs/msmfalcon_defconfig @@ -541,7 +541,6 @@ CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y CONFIG_MSM_MPM_OF=y CONFIG_MSM_EVENT_TIMER=y CONFIG_MSM_AVTIMER=y -CONFIG_MSM_CORE_CTL_HELPER=y CONFIG_QCOM_REMOTEQDSS=y CONFIG_MSM_SERVICE_NOTIFIER=y CONFIG_MSM_RPM_RBCPR_STATS_V2_LOG=y diff --git a/drivers/base/core.c b/drivers/base/core.c index b7d56c5ea3c6..3ac683dff7de 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -72,6 +72,11 @@ int lock_device_hotplug_sysfs(void) return restart_syscall(); } +void lock_device_hotplug_assert(void) +{ + lockdep_assert_held(&device_hotplug_lock); +} + #ifdef CONFIG_BLOCK static inline int device_is_not_partition(struct device *dev) { diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index dee022638fe6..c8bfb6077224 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -180,6 +180,58 @@ static struct attribute_group crash_note_cpu_attr_group = { }; #endif +#ifdef CONFIG_HOTPLUG_CPU + +static ssize_t show_cpu_isolated(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + ssize_t rc; + int cpuid = cpu->dev.id; + unsigned int isolated = cpu_isolated(cpuid); + + rc = snprintf(buf, PAGE_SIZE-2, "%d\n", isolated); + + return rc; +} + +static ssize_t __ref store_cpu_isolated(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + int err; + int cpuid = cpu->dev.id; + unsigned int isolated; + + err = kstrtouint(strstrip((char *)buf), 0, &isolated); + if (err) + return err; + + if (isolated > 1) + return -EINVAL; + + if (isolated) + sched_isolate_cpu(cpuid); + else + sched_unisolate_cpu(cpuid); + + return count; +} + +static DEVICE_ATTR(isolate, 0644, show_cpu_isolated, store_cpu_isolated); + +static struct attribute *cpu_isolated_attrs[] = { + &dev_attr_isolate.attr, + NULL +}; + +static struct attribute_group cpu_isolated_attr_group = { + .attrs = cpu_isolated_attrs, +}; + +#endif + #ifdef CONFIG_SCHED_HMP static ssize_t show_sched_static_cpu_pwr_cost(struct device *dev, @@ -280,6 +332,9 @@ static const struct attribute_group *common_cpu_attr_groups[] = { #ifdef CONFIG_SCHED_HMP &sched_hmp_cpu_attr_group, #endif +#ifdef CONFIG_HOTPLUG_CPU + &cpu_isolated_attr_group, +#endif NULL }; @@ -290,6 +345,9 @@ static const struct attribute_group *hotplugable_cpu_attr_groups[] = { #ifdef CONFIG_SCHED_HMP &sched_hmp_cpu_attr_group, #endif +#ifdef 
CONFIG_HOTPLUG_CPU + &cpu_isolated_attr_group, +#endif NULL }; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index b622b9541279..a045b9a940e8 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -625,12 +625,15 @@ static void smp_callback(void *v) static int cpuidle_latency_notify(struct notifier_block *b, unsigned long l, void *v) { - const struct cpumask *cpus; + struct cpumask cpus; - cpus = v ?: cpu_online_mask; + if (v) + cpumask_andnot(&cpus, v, cpu_isolated_mask); + else + cpumask_andnot(&cpus, cpu_online_mask, cpu_isolated_mask); preempt_disable(); - smp_call_function_many(cpus, smp_callback, NULL, 1); + smp_call_function_many(&cpus, smp_callback, NULL, 1); preempt_enable(); return NOTIFY_OK; diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig index 9c27344165be..c32eafbd38fd 100644 --- a/drivers/soc/qcom/Kconfig +++ b/drivers/soc/qcom/Kconfig @@ -716,14 +716,6 @@ config MSM_KERNEL_PROTECT_TEST read-only. This test is FATAL whether it passes or fails! Success is signaled by a stage-2 fault. -config MSM_CORE_CTL_HELPER - tristate "Core control helper functions for dynamically hotplug CPUs" - help - Provide helper functions for core control driver. Core control - driver dynamically hotplugs CPUs from kernel based on current - system load and state. It also supports limiting min and - max online CPUs from userspace. - config QCOM_REMOTEQDSS bool "Allow debug tools to enable events on other processors" depends on QCOM_SCM && DEBUG_FS diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile index d9134a558be6..434a114c000f 100644 --- a/drivers/soc/qcom/Makefile +++ b/drivers/soc/qcom/Makefile @@ -38,7 +38,6 @@ obj-$(CONFIG_MEM_SHARE_QMI_SERVICE) += memshare/ obj-$(CONFIG_MSM_PIL_SSR_GENERIC) += subsys-pil-tz.o obj-$(CONFIG_MSM_PIL_MSS_QDSP6V5) += pil-q6v5.o pil-msa.o pil-q6v5-mss.o obj-$(CONFIG_MSM_PIL) += peripheral-loader.o -obj-$(CONFIG_MSM_CORE_CTL_HELPER) += core_ctl_helper.o obj-$(CONFIG_MSM_PFE_WA) += pfe-wa.o obj-$(CONFIG_ARCH_MSM8996) += msm_cpu_voltage.o diff --git a/drivers/soc/qcom/core_ctl_helper.c b/drivers/soc/qcom/core_ctl_helper.c deleted file mode 100644 index 88201412128e..000000000000 --- a/drivers/soc/qcom/core_ctl_helper.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2014-2016, The Linux Foundation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- */ - -#include <linux/cpu.h> -#include <linux/cpufreq.h> -#include <linux/ktime.h> -#include <linux/hrtimer.h> -#include <linux/module.h> -#include <linux/init.h> -#include <trace/events/power.h> -#include <soc/qcom/core_ctl.h> - -void core_ctl_trace(int type, int cpu, int arg1, int arg2, int arg3) -{ - switch (type) { - case CORE_CTL_EVAL_NEED: - trace_core_ctl_eval_need(cpu, arg1, arg2, arg3); - break; - - case CORE_CTL_SET_BUSY: - trace_core_ctl_set_busy(cpu, arg1, arg2, arg3); - break; - }; -} -EXPORT_SYMBOL(core_ctl_trace); - -void core_ctl_block_hotplug(void) -{ - get_online_cpus(); -} -EXPORT_SYMBOL(core_ctl_block_hotplug); - -void core_ctl_unblock_hotplug(void) -{ - put_online_cpus(); -} -EXPORT_SYMBOL(core_ctl_unblock_hotplug); - -s64 core_ctl_get_time(void) -{ - return ktime_to_ms(ktime_get()); -} -EXPORT_SYMBOL(core_ctl_get_time); - -struct cpufreq_policy *core_ctl_get_policy(int cpu) -{ - return cpufreq_cpu_get(cpu); -} -EXPORT_SYMBOL(core_ctl_get_policy); - -void core_ctl_put_policy(struct cpufreq_policy *policy) -{ - cpufreq_cpu_put(policy); -} -EXPORT_SYMBOL(core_ctl_put_policy); - -struct device *core_ctl_find_cpu_device(unsigned cpu) -{ - return get_cpu_device(cpu); -} -EXPORT_SYMBOL(core_ctl_find_cpu_device); - -int __ref core_ctl_online_core(unsigned int cpu) -{ - int ret = -EINVAL; - struct device *dev = get_cpu_device(cpu); - - if (dev) { - lock_device_hotplug(); - ret = device_online(dev); - unlock_device_hotplug(); - } - return ret; -} -EXPORT_SYMBOL(core_ctl_online_core); - -int __ref core_ctl_offline_core(unsigned int cpu) -{ - int ret = -EINVAL; - struct device *dev = get_cpu_device(cpu); - - if (dev) { - lock_device_hotplug(); - ret = device_offline(dev); - unlock_device_hotplug(); - } - return ret; -} -EXPORT_SYMBOL(core_ctl_offline_core); diff --git a/drivers/soc/qcom/watchdog_v2.c b/drivers/soc/qcom/watchdog_v2.c index aa20705b9adc..8f58eaa537b1 100644 --- a/drivers/soc/qcom/watchdog_v2.c +++ b/drivers/soc/qcom/watchdog_v2.c @@ -360,7 +360,7 @@ static void ping_other_cpus(struct msm_watchdog_data *wdog_dd) cpumask_clear(&wdog_dd->alive_mask); smp_mb(); for_each_cpu(cpu, cpu_online_mask) { - if (!cpu_idle_pc_state[cpu]) + if (!cpu_idle_pc_state[cpu] && !cpu_isolated(cpu)) smp_call_function_single(cpu, keep_alive_response, wdog_dd, 1); } diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 59915ea5373c..0eab4811ee92 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -53,6 +53,7 @@ extern int nr_cpu_ids; * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration + * cpu_isolated_mask- has bit 'cpu' set iff cpu isolated * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. 
* @@ -89,25 +90,30 @@ extern const struct cpumask *const cpu_possible_mask; extern const struct cpumask *const cpu_online_mask; extern const struct cpumask *const cpu_present_mask; extern const struct cpumask *const cpu_active_mask; +extern const struct cpumask *const cpu_isolated_mask; #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(cpu_online_mask) #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) +#define num_isolated_cpus() cpumask_weight(cpu_isolated_mask) #define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask) #define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask) #define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask) #define cpu_active(cpu) cpumask_test_cpu((cpu), cpu_active_mask) +#define cpu_isolated(cpu) cpumask_test_cpu((cpu), cpu_isolated_mask) #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U +#define num_isolated_cpus() 0U #define cpu_online(cpu) ((cpu) == 0) #define cpu_possible(cpu) ((cpu) == 0) #define cpu_present(cpu) ((cpu) == 0) #define cpu_active(cpu) ((cpu) == 0) +#define cpu_isolated(cpu) ((cpu) == 0) #endif /* verify cpu argument to cpumask_* operators */ @@ -714,12 +720,14 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) +#define for_each_isolated_cpu(cpu) for_each_cpu((cpu), cpu_isolated_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ void set_cpu_possible(unsigned int cpu, bool possible); void set_cpu_present(unsigned int cpu, bool present); void set_cpu_online(unsigned int cpu, bool online); void set_cpu_active(unsigned int cpu, bool active); +void set_cpu_isolated(unsigned int cpu, bool isolated); void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); diff --git a/include/linux/device.h b/include/linux/device.h index 9f27351c6b9c..4b4e2d5ce6e7 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1023,6 +1023,7 @@ static inline bool device_supports_offline(struct device *dev) extern void lock_device_hotplug(void); extern void unlock_device_hotplug(void); extern int lock_device_hotplug_sysfs(void); +extern void lock_device_hotplug_assert(void); extern int device_offline(struct device *dev); extern int device_online(struct device *dev); extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2ead22dd74a0..952adcacc4cf 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -53,6 +53,7 @@ enum hrtimer_restart { * * 0x00 inactive * 0x01 enqueued into rbtree + * 0x02 timer is pinned to a cpu * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free @@ -72,6 +73,8 @@ enum hrtimer_restart { */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 +#define HRTIMER_PINNED_SHIFT 1 +#define HRTIMER_STATE_PINNED (1 << HRTIMER_PINNED_SHIFT) /** * struct hrtimer - the basic hrtimer structure @@ -357,6 +360,9 @@ DECLARE_PER_CPU(struct tick_device, 
tick_cpu_device); /* Exported timer functions: */ +/* To be used from cpusets, only */ +extern void hrtimer_quiesce_cpu(void *cpup); + /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); diff --git a/include/linux/sched.h b/include/linux/sched.h index 4701e0403167..a395d8a9ff73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -363,6 +363,41 @@ extern cpumask_var_t cpu_isolated_map; extern int runqueue_is_locked(int cpu); +#ifdef CONFIG_HOTPLUG_CPU +extern int sched_isolate_count(const cpumask_t *mask, bool include_offline); +extern int sched_isolate_cpu(int cpu); +extern int sched_unisolate_cpu(int cpu); +extern int sched_unisolate_cpu_unlocked(int cpu); +#else +static inline int sched_isolate_count(const cpumask_t *mask, + bool include_offline) +{ + cpumask_t count_mask; + + if (include_offline) + cpumask_andnot(&count_mask, mask, cpu_online_mask); + else + return 0; + + return cpumask_weight(&count_mask); +} + +static inline int sched_isolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu_unlocked(int cpu) +{ + return 0; +} +#endif + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); @@ -409,6 +444,8 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, extern unsigned int softlockup_panic; extern unsigned int hardlockup_panic; void lockup_detector_init(void); +extern void watchdog_enable(unsigned int cpu); +extern void watchdog_disable(unsigned int cpu); #else static inline void touch_softlockup_watchdog_sched(void) { @@ -425,6 +462,12 @@ static inline void touch_all_softlockup_watchdogs(void) static inline void lockup_detector_init(void) { } +static inline void watchdog_enable(unsigned int cpu) +{ +} +static inline void watchdog_disable(unsigned int cpu) +{ +} #endif #ifdef CONFIG_DETECT_HUNG_TASK diff --git a/include/linux/tick.h b/include/linux/tick.h index 5bf3ddade19c..1732697ea419 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -161,7 +161,15 @@ extern void __tick_nohz_task_switch(void); #else static inline int housekeeping_any_cpu(void) { - return smp_processor_id(); + cpumask_t available; + int cpu; + + cpumask_andnot(&available, cpu_online_mask, cpu_isolated_mask); + cpu = cpumask_any(&available); + if (cpu >= nr_cpu_ids) + cpu = smp_processor_id(); + + return cpu; } static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } @@ -187,7 +195,7 @@ static inline bool is_housekeeping_cpu(int cpu) if (tick_nohz_full_enabled()) return cpumask_test_cpu(cpu, housekeeping_mask); #endif - return true; + return !cpu_isolated(cpu); } static inline void housekeeping_affine(struct task_struct *t) diff --git a/include/linux/timer.h b/include/linux/timer.h index 7a5602e19e87..b1617e8932b2 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -182,6 +182,9 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) +/* To be used from cpusets, only */ +extern void timer_quiesce_cpu(void *cpup); + /* * Timer-statistics info: */ diff --git a/include/soc/qcom/core_ctl.h b/include/soc/qcom/core_ctl.h deleted file mode 100644 index 08b43058b37c..000000000000 --- a/include/soc/qcom/core_ctl.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2014-2015, The 
Linux Foundation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __SOC_QCOM_CORE_CTL_H -#define __SOC_QCOM_CORE_CTL_H - -enum { - CORE_CTL_EVAL_NEED, - CORE_CTL_SET_BUSY, - CORE_CTL_N_TRACE_EVENTS, -}; - -extern void core_ctl_block_hotplug(void); -extern void core_ctl_unblock_hotplug(void); -extern s64 core_ctl_get_time(void); -extern struct cpufreq_policy *core_ctl_get_policy(int cpu); -extern void core_ctl_put_policy(struct cpufreq_policy *policy); -extern struct device *core_ctl_find_cpu_device(unsigned cpu); -extern int core_ctl_online_core(unsigned int cpu); -extern int core_ctl_offline_core(unsigned int cpu); - -#define USE_CORE_CTL_TRACE -extern void core_ctl_trace(int type, int cpu, int arg1, int arg2, int arg3); - -#endif diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 1ef5ec3eaf70..3990efdd0cc0 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -1244,6 +1244,41 @@ TRACE_EVENT(sched_get_nr_running_avg, __entry->avg, __entry->big_avg, __entry->iowait_avg) ); +/** + * sched_isolate - called when cores are isolated/unisolated + * + * @acutal_mask: mask of cores actually isolated/unisolated + * @req_mask: mask of cores requested isolated/unisolated + * @online_mask: cpu online mask + * @time: amount of time in us it took to isolate/unisolate + * @isolate: 1 if isolating, 0 if unisolating + * + */ +TRACE_EVENT(sched_isolate, + + TP_PROTO(unsigned int requested_cpu, unsigned int isolated_cpus, + u64 start_time, unsigned char isolate), + + TP_ARGS(requested_cpu, isolated_cpus, start_time, isolate), + + TP_STRUCT__entry( + __field(u32, requested_cpu) + __field(u32, isolated_cpus) + __field(u32, time) + __field(unsigned char, isolate) + ), + + TP_fast_assign( + __entry->requested_cpu = requested_cpu; + __entry->isolated_cpus = isolated_cpus; + __entry->time = div64_u64(sched_clock() - start_time, 1000); + __entry->isolate = isolate; + ), + + TP_printk("iso cpu=%u cpus=0x%x time=%u us isolated=%d", + __entry->requested_cpu, __entry->isolated_cpus, + __entry->time, __entry->isolate) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/init/Kconfig b/init/Kconfig index 9ad1ae9d9da8..6020a351c57b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1170,6 +1170,16 @@ config SCHED_HMP_CSTATE_AWARE with CPUs C-state. If this is enabled, scheduler places tasks onto the shallowest C-state CPU among the most power efficient CPUs. +config SCHED_CORE_CTL + bool "QTI Core Control" + depends on SMP + help + This options enables the core control functionality in + the scheduler. Core control automatically offline and + online cores based on cpu load and utilization. + + If unsure, say N here. 
+ config CHECKPOINT_RESTORE bool "Checkpoint/restore support" if EXPERT select PROC_CHILDREN diff --git a/kernel/cpu.c b/kernel/cpu.c index 1cfd381642da..3c97f5b88a07 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -768,6 +768,10 @@ static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); EXPORT_SYMBOL(cpu_active_mask); +static DECLARE_BITMAP(cpu_isolated_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_isolated_mask = to_cpumask(cpu_isolated_bits); +EXPORT_SYMBOL(cpu_isolated_mask); + void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) @@ -802,6 +806,14 @@ void set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); } +void set_cpu_isolated(unsigned int cpu, bool isolated) +{ + if (isolated) + cpumask_set_cpu(cpu, to_cpumask(cpu_isolated_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_isolated_bits)); +} + void init_cpu_present(const struct cpumask *src) { cpumask_copy(to_cpumask(cpu_present_bits), src); @@ -817,6 +829,11 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(to_cpumask(cpu_online_bits), src); } +void init_cpu_isolated(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_isolated_bits), src); +} + static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) diff --git a/kernel/events/core.c b/kernel/events/core.c index d6ec580584b6..5beb88f11671 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3386,7 +3386,8 @@ static int perf_event_read(struct perf_event *event, bool group) * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { + if (event->state == PERF_EVENT_STATE_ACTIVE && + !cpu_isolated(event->oncpu)) { struct perf_read_data data = { .event = event, .group = group, diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 011f8c4c63da..104432f3d311 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -11,6 +11,7 @@ #include <linux/interrupt.h> #include <linux/ratelimit.h> #include <linux/irq.h> +#include <linux/cpumask.h> #include "internals.h" @@ -20,6 +21,7 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity = d->common->affinity; struct irq_chip *c; bool ret = false; + struct cpumask available_cpus; /* * If this is a per-CPU interrupt, or the affinity does not @@ -29,8 +31,15 @@ static bool migrate_one_irq(struct irq_desc *desc) !cpumask_test_cpu(smp_processor_id(), affinity)) return false; + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - affinity = cpu_online_mask; + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_empty(affinity)) + affinity = cpu_online_mask; ret = true; } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 8ecc7b3f7dd9..69c32c42080f 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -45,6 +45,7 @@ #include <linux/seq_file.h> #include <linux/irq.h> #include <linux/irqdesc.h> +#include <linux/cpumask.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -447,6 +448,9 @@ EXPORT_SYMBOL_GPL(pm_qos_request); int pm_qos_request_for_cpu(int pm_qos_class, int cpu) { + if (cpu_isolated(cpu)) + return INT_MAX; + return 
pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu]; } EXPORT_SYMBOL(pm_qos_request_for_cpu); @@ -469,6 +473,9 @@ int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask) val = c->default_value; for_each_cpu(cpu, mask) { + if (cpu_isolated(cpu)) + continue; + switch (c->type) { case PM_QOS_MIN: if (c->target_per_cpu[cpu] < val) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 1f159743ebfc..508b65690288 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,3 +20,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7474463b9835..7b7f1961fd10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> +#include <linux/irq.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -1229,6 +1230,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq *rq; unsigned int dest_cpu; int ret = 0; + cpumask_t allowed_mask; rq = task_rq_lock(p, &flags); @@ -1244,16 +1246,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + + dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask); if (dest_cpu >= nr_cpu_ids) { - ret = -EINVAL; - goto out; + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { + ret = -EINVAL; + goto out; + } + cpumask_copy(&allowed_mask, new_mask); } do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? 
If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) goto out; if (task_running(rq, p) || p->state == TASK_WAKING) { @@ -1577,12 +1585,13 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ -static int select_fallback_rq(int cpu, struct task_struct *p) +static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; int dest_cpu; + int isolated_candidate = -1; /* * If the node that the cpu is on has been offlined, cpu_to_node() @@ -1598,6 +1607,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; } @@ -1610,6 +1621,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) { + if (allow_iso) + isolated_candidate = dest_cpu; + continue; + } + goto out; + } + + if (isolated_candidate != -1) { + dest_cpu = isolated_candidate; goto out; } @@ -1655,6 +1676,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { + bool allow_isolated = (p->flags & PF_KTHREAD); + lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) @@ -1671,8 +1694,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || - !cpu_online(cpu))) - cpu = select_fallback_rq(task_cpu(p), p); + !cpu_online(cpu)) || + (cpu_isolated(cpu) && !allow_isolated)) + cpu = select_fallback_rq(task_cpu(p), p, allow_isolated); return cpu; } @@ -2956,7 +2980,7 @@ void sched_exec(void) if (dest_cpu == smp_processor_id()) goto unlock; - if (likely(cpu_active(dest_cpu))) { + if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) { struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -5414,18 +5438,22 @@ static struct task_struct fake_task = { }; /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Migrate all tasks (not pinned if pinned argument say so) from the rq, + * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq(). * * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; + unsigned int num_pinned_kthreads = 1; /* this thread */ + cpumask_t avail_cpus; + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); /* * Fudge the rq selection such that the below task selection loop @@ -5447,10 +5475,12 @@ static void migrate_tasks(struct rq *dead_rq) for (;;) { /* - * There's this thread running, bail when that's the only - * remaining thread. + * There's this thread running + pinned threads, bail when + * that's the only remaining threads. 
*/ - if (rq->nr_running == 1) + if ((migrate_pinned_tasks && rq->nr_running == 1) || + (!migrate_pinned_tasks && + rq->nr_running == num_pinned_kthreads)) break; /* @@ -5461,6 +5491,13 @@ static void migrate_tasks(struct rq *dead_rq) BUG_ON(!next); next->sched_class->put_prev_task(rq, next); + if (!migrate_pinned_tasks && next->flags & PF_KTHREAD && + !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) { + lockdep_unpin_lock(&rq->lock); + num_pinned_kthreads += 1; + continue; + } + /* * Rules for changing task_struct::cpus_allowed are holding * both pi_lock and rq->lock, such that holding either @@ -5486,7 +5523,7 @@ static void migrate_tasks(struct rq *dead_rq) } /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); + dest_cpu = select_fallback_rq(dead_rq->cpu, next, false); rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { @@ -5502,6 +5539,222 @@ static void migrate_tasks(struct rq *dead_rq) rq->stop = stop; } + +static void set_rq_online(struct rq *rq); +static void set_rq_offline(struct rq *rq); + +int do_isolation_work_cpu_stop(void *data) +{ + unsigned long flags; + unsigned int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + watchdog_disable(cpu); + + irq_migrate_all_off_this_cpu(); + + sched_ttwu_pending(); + /* Update our root-domain */ + raw_spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + + migrate_tasks(rq, false); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + /* + * We might have been in tickless state. Clear NOHZ flags to avoid + * us being kicked for helping out with balancing + */ + nohz_balance_clear_nohz_mask(cpu); + return 0; +} + +int do_unisolation_work_cpu_stop(void *data) +{ + watchdog_enable(smp_processor_id()); + return 0; +} + +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd); + +static void sched_update_group_capacities(int cpu) +{ + struct sched_domain *sd; + + mutex_lock(&sched_domains_mutex); + rcu_read_lock(); + + for_each_domain(cpu, sd) { + int balance_cpu = group_balance_cpu(sd->groups); + + init_sched_groups_capacity(cpu, sd); + /* + * Need to ensure this is also called with balancing + * cpu. + */ + if (cpu != balance_cpu) + init_sched_groups_capacity(balance_cpu, sd); + } + + rcu_read_unlock(); + mutex_unlock(&sched_domains_mutex); +} + +static unsigned int cpu_isolation_vote[NR_CPUS]; + +int sched_isolate_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + + if (include_offline) { + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, cpu_isolated_mask); + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_and(&count_mask, mask, cpu_isolated_mask); + } + + return cpumask_weight(&count_mask); +} + +/* + * 1) CPU is isolated and cpu is offlined: + * Unisolate the core. + * 2) CPU is not isolated and CPU is offlined: + * No action taken. + * 3) CPU is offline and request to isolate + * Request ignored. + * 4) CPU is offline and isolated: + * Not a possible state. + * 5) CPU is online and request to isolate + * Normal case: Isolate the CPU + * 6) CPU is not isolated and comes back online + * Nothing to do + * + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. 
+ * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_isolate_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + cpumask_t avail_cpus; + int ret_code = 0; + u64 start_time; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + lock_device_hotplug(); + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); + + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + ret_code = -EINVAL; + goto out; + } + + if (!cpu_online(cpu)) { + ret_code = -EINVAL; + goto out; + } + + if (++cpu_isolation_vote[cpu] > 1) + goto out; + + set_cpu_isolated(cpu, true); + cpumask_clear_cpu(cpu, &avail_cpus); + + /* Migrate timers */ + smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); + smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + + migrate_sync_cpu(cpu, cpumask_first(&avail_cpus)); + stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + + clear_hmp_request(cpu); + calc_load_migrate(rq); + update_max_interval(); + sched_update_group_capacities(cpu); + +out: + unlock_device_hotplug(); + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 1); + return ret_code; +} + +/* + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_unisolate_cpu_unlocked(int cpu) +{ + int ret_code = 0; + struct rq *rq = cpu_rq(cpu); + u64 start_time; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + lock_device_hotplug_assert(); + + if (!cpu_isolation_vote[cpu]) { + ret_code = -EINVAL; + goto out; + } + + if (--cpu_isolation_vote[cpu]) + goto out; + + if (cpu_online(cpu)) { + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + rq->age_stamp = sched_clock_cpu(cpu); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + set_cpu_isolated(cpu, false); + update_max_interval(); + sched_update_group_capacities(cpu); + + if (cpu_online(cpu)) { + stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0); + + /* Kick CPU to immediately do load balancing */ + if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + smp_send_reschedule(cpu); + } + +out: + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 0); + return ret_code; +} + +int sched_unisolate_cpu(int cpu) +{ + int ret_code; + + lock_device_hotplug(); + ret_code = sched_unisolate_cpu_unlocked(cpu); + unlock_device_hotplug(); + return ret_code; +} + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5748,13 +6001,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); - migrate_sync_cpu(cpu); + migrate_sync_cpu(cpu, smp_processor_id()); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, true); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6509,11 +6762,14 @@ build_sched_groups(struct sched_domain *sd, int cpu) static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; + 
cpumask_t avail_mask; WARN_ON(!sg); do { - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + cpumask_andnot(&avail_mask, sched_group_cpus(sg), + cpu_isolated_mask); + sg->group_weight = cpumask_weight(&avail_mask); sg = sg->next; } while (sg != sd->groups); diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 000000000000..8f071757d516 --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1014 @@ +/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/timer.h> +#include <linux/kthread.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> + +#include <trace/events/power.h> + +#define MAX_CPUS_PER_GROUP 4 + +struct cpu_data { + /* Per CPU data. */ + bool inited; + bool online; + bool rejected; + bool is_busy; + bool not_preferred; + unsigned int busy; + unsigned int cpu; + struct list_head sib; + unsigned int first_cpu; + + /* Per cluster data set only on first CPU */ + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_GROUP]; + unsigned int busy_down_thres[MAX_CPUS_PER_GROUP]; + unsigned int online_cpus; + unsigned int avail_cpus; + unsigned int num_cpus; + unsigned int need_cpus; + unsigned int task_thres; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool is_big_cluster; + int nrrun; + bool nrrun_changed; + struct timer_list timer; + struct task_struct *hotplug_thread; + struct kobject kobj; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cpu_data *f); +static void wake_up_hotplug_thread(struct cpu_data *state); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + wake_up_hotplug_thread(state); + + return count; +} + +static ssize_t show_min_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_hotplug_thread(state); + + return count; +} + +static ssize_t show_max_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static 
ssize_t show_task_thres(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(struct cpu_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(struct cpu_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_is_big_cluster(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->is_big_cluster = val ? 1 : 0; + return count; +} + +static ssize_t show_is_big_cluster(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster); +} + +static ssize_t show_cpus(struct cpu_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry(c, &state->lru, sib) { + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u (%s)\n", c->cpu, + c->online ? 
"Online" : "Offline"); + } + spin_unlock_irqrestore(&state_lock, flags); + return count; +} + +static ssize_t show_need_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_online_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->online_cpus); +} + +static ssize_t show_global_state(struct cpu_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + c = &per_cpu(cpu_state, cpu); + if (!c->inited) + continue; + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU: %u\n", c->cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", c->online); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tRejected: %u\n", c->rejected); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", c->first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + if (c->cpu != c->first_cpu) + continue; + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", c->nrrun); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tAvail CPUs: %u\n", c->avail_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", c->need_cpus); + } + + return count; +} + +static ssize_t store_not_preferred(struct cpu_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i, first_cpu; + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + first_cpu = state->first_cpu; + + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, first_cpu); + c->not_preferred = val[i]; + first_cpu++; + } + + return count; +} + +static ssize_t show_not_preferred(struct cpu_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned int i, first_cpu; + + first_cpu = state->first_cpu; + + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU:%d %u\n", first_cpu, c->not_preferred); + first_cpu++; + } + + return count; +} + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(struct cpu_data *, char *); + ssize_t (*store)(struct cpu_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_rw(offline_delay_ms); +core_ctl_attr_rw(busy_up_thres); +core_ctl_attr_rw(busy_down_thres); +core_ctl_attr_rw(task_thres); +core_ctl_attr_rw(is_big_cluster); +core_ctl_attr_ro(cpus); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(online_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(not_preferred); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &offline_delay_ms.attr, + &busy_up_thres.attr, + &busy_down_thres.attr, + &task_thres.attr, + &is_big_cluster.attr, + &cpus.attr, + &need_cpus.attr, + &online_cpus.attr, + &global_state.attr, + ¬_preferred.attr, + NULL +}; + 
+#define to_cpu_data(k) container_of(k, struct cpu_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cpu_data *data = to_cpu_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cpu_data *data = to_cpu_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +#define RQ_AVG_TOLERANCE 2 +#define RQ_AVG_DEFAULT_MS 20 +#define NR_RUNNING_TOLERANCE 5 +static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS; + +static s64 rq_avg_timestamp_ms; +static struct timer_list rq_avg_timer; + +static void update_running_avg(bool trigger_update) +{ + int cpu; + struct cpu_data *pcpu; + int avg, iowait_avg, big_avg, old_nrrun; + s64 now; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + + now = ktime_to_ms(ktime_get()); + if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) { + spin_unlock_irqrestore(&state_lock, flags); + return; + } + rq_avg_timestamp_ms = now; + sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg); + + spin_unlock_irqrestore(&state_lock, flags); + + /* + * Round up to the next integer if the average nr running tasks + * is within NR_RUNNING_TOLERANCE/100 of the next integer. + * If normal rounding up is used, it will allow a transient task + * to trigger online event. By the time core is onlined, the task + * has finished. + * Rounding to closest suffers same problem because scheduler + * might only provide running stats per jiffy, and a transient + * task could skew the number for one jiffy. If core control + * samples every 2 jiffies, it will observe 0.5 additional running + * average which rounds up to 1 task. + */ + avg = (avg + NR_RUNNING_TOLERANCE) / 100; + big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100; + + for_each_possible_cpu(cpu) { + pcpu = &per_cpu(cpu_state, cpu); + if (!pcpu->inited || pcpu->first_cpu != cpu) + continue; + old_nrrun = pcpu->nrrun; + /* + * Big cluster only need to take care of big tasks, but if + * there are not enough big cores, big tasks need to be run + * on little as well. Thus for little's runqueue stat, it + * has to use overall runqueue average, or derive what big + * tasks would have to be run on little. The latter approach + * is not easy to get given core control reacts much slower + * than scheduler, and can't predict scheduler's behavior. + */ + pcpu->nrrun = pcpu->is_big_cluster ? 
big_avg : avg; + if (pcpu->nrrun != old_nrrun) { + if (trigger_update) + apply_need(pcpu); + else + pcpu->nrrun_changed = true; + } + } +} + +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(struct cpu_data *f, unsigned int new_need) +{ + /* Online all cores if there are enough tasks */ + if (f->nrrun >= f->task_thres) + return f->num_cpus; + + /* only online more cores if there are tasks to run */ + if (f->nrrun > new_need) + return new_need + 1; + + return new_need; +} + +static u64 round_to_nw_start(void) +{ + unsigned long step = msecs_to_jiffies(rq_avg_period_ms); + u64 jif = get_jiffies_64(); + + do_div(jif, step); + return (jif + 1) * step; +} + +static void rq_avg_timer_func(unsigned long not_used) +{ + update_running_avg(true); + mod_timer(&rq_avg_timer, round_to_nw_start()); +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(struct cpu_data *f, unsigned int need_cpus) +{ + return min(max(f->min_cpus, need_cpus), f->max_cpus); +} + +static bool eval_need(struct cpu_data *f) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + s64 now; + + if (unlikely(!f->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + thres_idx = f->online_cpus ? f->online_cpus - 1 : 0; + list_for_each_entry(c, &f->lru, sib) { + if (c->busy >= f->busy_up_thres[thres_idx]) + c->is_busy = true; + else if (c->busy < f->busy_down_thres[thres_idx]) + c->is_busy = false; + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(f, need_cpus); + need_flag = apply_limits(f, need_cpus) != apply_limits(f, f->need_cpus); + last_need = f->need_cpus; + + now = ktime_to_ms(ktime_get()); + + if (need_cpus == last_need) { + f->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + if (need_cpus > last_need) { + ret = 1; + } else if (need_cpus < last_need) { + s64 elapsed = now - f->need_ts; + + if (elapsed >= f->offline_delay_ms) { + ret = 1; + } else { + mod_timer(&f->timer, jiffies + + msecs_to_jiffies(f->offline_delay_ms)); + } + } + + if (ret) { + f->need_ts = now; + f->need_cpus = need_cpus; + } + + trace_core_ctl_eval_need(f->cpu, last_need, need_cpus, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cpu_data *f) +{ + if (eval_need(f)) + wake_up_hotplug_thread(f); +} + +static int core_ctl_set_busy(unsigned int cpu, unsigned int busy) +{ + struct cpu_data *c = &per_cpu(cpu_state, cpu); + struct cpu_data *f; + unsigned int old_is_busy = c->is_busy; + + if (!c->inited) + return 0; + f = &per_cpu(cpu_state, c->first_cpu); + + update_running_avg(false); + if (c->busy == busy && !f->nrrun_changed) + return 0; + c->busy = busy; + f->nrrun_changed = false; + + apply_need(f); + trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy); + return 0; +} + +/* ========================= core count enforcement ==================== */ + +/* + * If current thread is hotplug thread, don't attempt to wake up + * itself or other hotplug threads because it will deadlock. Instead, + * schedule a timer to fire in next timer tick and wake up the thread. 
+ */ +static void wake_up_hotplug_thread(struct cpu_data *state) +{ + unsigned long flags; + int cpu; + struct cpu_data *pcpu; + bool no_wakeup = false; + + for_each_possible_cpu(cpu) { + pcpu = &per_cpu(cpu_state, cpu); + if (cpu != pcpu->first_cpu) + continue; + if (pcpu->hotplug_thread == current) { + no_wakeup = true; + break; + } + } + + spin_lock_irqsave(&state->pending_lock, flags); + state->pending = true; + spin_unlock_irqrestore(&state->pending_lock, flags); + + if (no_wakeup) { + spin_lock_irqsave(&state_lock, flags); + mod_timer(&state->timer, jiffies); + spin_unlock_irqrestore(&state_lock, flags); + } else { + wake_up_process(state->hotplug_thread); + } +} + +static void core_ctl_timer_func(unsigned long cpu) +{ + struct cpu_data *state = &per_cpu(cpu_state, cpu); + unsigned long flags; + + if (eval_need(state)) { + spin_lock_irqsave(&state->pending_lock, flags); + state->pending = true; + spin_unlock_irqrestore(&state->pending_lock, flags); + wake_up_process(state->hotplug_thread); + } + +} + +static int core_ctl_online_core(unsigned int cpu) +{ + int ret; + struct device *dev; + + lock_device_hotplug(); + dev = get_cpu_device(cpu); + if (!dev) { + pr_err("%s: failed to get cpu%d device\n", __func__, cpu); + ret = -ENODEV; + } else { + ret = device_online(dev); + } + unlock_device_hotplug(); + return ret; +} + +static int core_ctl_offline_core(unsigned int cpu) +{ + int ret; + struct device *dev; + + lock_device_hotplug(); + dev = get_cpu_device(cpu); + if (!dev) { + pr_err("%s: failed to get cpu%d device\n", __func__, cpu); + ret = -ENODEV; + } else { + ret = device_offline(dev); + } + unlock_device_hotplug(); + return ret; +} + +static void __ref do_hotplug(struct cpu_data *f) +{ + unsigned int need; + struct cpu_data *c, *tmp; + + need = apply_limits(f, f->need_cpus); + pr_debug("Trying to adjust group %u to %u\n", f->first_cpu, need); + + if (f->online_cpus > need) { + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (!c->online) + continue; + + if (f->online_cpus == need) + break; + + /* Don't offline busy CPUs. */ + if (c->is_busy) + continue; + + pr_debug("Trying to Offline CPU%u\n", c->cpu); + if (core_ctl_offline_core(c->cpu)) + pr_debug("Unable to Offline CPU%u\n", c->cpu); + } + + /* + * If the number of online CPUs is within the limits, then + * don't force any busy CPUs offline. 
+ */ + if (f->online_cpus <= f->max_cpus) + return; + + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (!c->online) + continue; + + if (f->online_cpus <= f->max_cpus) + break; + + pr_debug("Trying to Offline CPU%u\n", c->cpu); + if (core_ctl_offline_core(c->cpu)) + pr_debug("Unable to Offline CPU%u\n", c->cpu); + } + } else if (f->online_cpus < need) { + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (c->online || c->rejected || c->not_preferred) + continue; + if (f->online_cpus == need) + break; + + pr_debug("Trying to Online CPU%u\n", c->cpu); + if (core_ctl_online_core(c->cpu)) + pr_debug("Unable to Online CPU%u\n", c->cpu); + } + + if (f->online_cpus == need) + return; + + + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (c->online || c->rejected || !c->not_preferred) + continue; + if (f->online_cpus == need) + break; + + pr_debug("Trying to Online CPU%u\n", c->cpu); + if (core_ctl_online_core(c->cpu)) + pr_debug("Unable to Online CPU%u\n", c->cpu); + } + + } +} + +static int __ref try_hotplug(void *data) +{ + struct cpu_data *f = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&f->pending_lock, flags); + if (!f->pending) { + spin_unlock_irqrestore(&f->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&f->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + f->pending = false; + spin_unlock_irqrestore(&f->pending_lock, flags); + + do_hotplug(f); + } + + return 0; +} + +static int __ref cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + uint32_t cpu = (uintptr_t)hcpu; + struct cpu_data *state = &per_cpu(cpu_state, cpu); + struct cpu_data *f; + int ret = NOTIFY_OK; + unsigned long flags; + + /* Don't affect suspend resume */ + if (action & CPU_TASKS_FROZEN) + return NOTIFY_OK; + + if (unlikely(!state->inited)) + return NOTIFY_OK; + + f = &per_cpu(cpu_state, state->first_cpu); + + switch (action) { + case CPU_UP_PREPARE: + + /* If online state of CPU somehow got out of sync, fix it. */ + if (state->online) { + f->online_cpus--; + state->online = false; + pr_warn("CPU%d offline when state is online\n", cpu); + } + + if (state->rejected) { + state->rejected = false; + f->avail_cpus++; + } + + /* + * If a CPU is in the process of coming up, mark it as online + * so that there's no race with hotplug thread bringing up more + * CPUs than necessary. + */ + if (apply_limits(f, f->need_cpus) <= f->online_cpus) { + pr_debug("Prevent CPU%d onlining\n", cpu); + ret = NOTIFY_BAD; + } else { + state->online = true; + f->online_cpus++; + } + break; + + case CPU_ONLINE: + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + spin_lock_irqsave(&state_lock, flags); + list_del(&state->sib); + list_add_tail(&state->sib, &f->lru); + spin_unlock_irqrestore(&state_lock, flags); + break; + + case CPU_DEAD: + /* Move a CPU to the end of the LRU when it goes offline. */ + spin_lock_irqsave(&state_lock, flags); + list_del(&state->sib); + list_add_tail(&state->sib, &f->lru); + spin_unlock_irqrestore(&state_lock, flags); + + /* Fall through */ + + case CPU_UP_CANCELED: + + /* If online state of CPU somehow got out of sync, fix it. 
*/ + if (!state->online) { + f->online_cpus++; + pr_warn("CPU%d online when state is offline\n", cpu); + } + + if (!state->rejected && action == CPU_UP_CANCELED) { + state->rejected = true; + f->avail_cpus--; + } + + state->online = false; + state->busy = 0; + f->online_cpus--; + break; + } + + if (f->online_cpus < apply_limits(f, f->need_cpus) + && f->online_cpus < f->avail_cpus + && action == CPU_DEAD) + wake_up_hotplug_thread(f); + + return ret; +} + +static struct notifier_block __refdata cpu_notifier = { + .notifier_call = cpu_callback, +}; + +/* ============================ init code ============================== */ + +static int group_init(struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cpu_data *f = &per_cpu(cpu_state, first_cpu); + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + if (likely(f->inited)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + f->num_cpus = cpumask_weight(mask); + if (f->num_cpus > MAX_CPUS_PER_GROUP) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + f->min_cpus = 1; + f->max_cpus = f->num_cpus; + f->need_cpus = f->num_cpus; + f->avail_cpus = f->num_cpus; + f->offline_delay_ms = 100; + f->task_thres = UINT_MAX; + f->nrrun = f->num_cpus; + INIT_LIST_HEAD(&f->lru); + init_timer(&f->timer); + spin_lock_init(&f->pending_lock); + f->timer.function = core_ctl_timer_func; + f->timer.data = first_cpu; + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cpu = cpu; + state->first_cpu = first_cpu; + + if (cpu_online(cpu)) { + f->online_cpus++; + state->online = true; + } + + list_add_tail(&state->sib, &f->lru); + } + + f->hotplug_thread = kthread_run(try_hotplug, (void *) f, + "core_ctl/%d", first_cpu); + sched_setscheduler_nocheck(f->hotplug_thread, SCHED_FIFO, ¶m); + + for_each_cpu(cpu, mask) { + state = &per_cpu(cpu_state, cpu); + state->inited = true; + } + + kobject_init(&f->kobj, &ktype_core_ctl); + return kobject_add(&f->kobj, &dev->kobj, "core_ctl"); +} + +static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + + switch (val) { + case CPUFREQ_CREATE_POLICY: + group_init(policy->related_cpus); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_pol_nb = { + .notifier_call = cpufreq_policy_cb, +}; + +static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_govinfo *info = data; + + switch (val) { + case CPUFREQ_LOAD_CHANGE: + core_ctl_set_busy(info->cpu, info->load); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_gov_nb = { + .notifier_call = cpufreq_gov_cb, +}; + +static int __init core_ctl_init(void) +{ + struct cpufreq_policy *policy; + unsigned int cpu; + + register_cpu_notifier(&cpu_notifier); + cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER); + cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER); + init_timer_deferrable(&rq_avg_timer); + rq_avg_timer.function = rq_avg_timer_func; + + get_online_cpus(); + for_each_online_cpu(cpu) { + policy = cpufreq_cpu_get(cpu); + if (policy) { + group_init(policy->related_cpus); + cpufreq_cpu_put(policy); + } + } + put_online_cpus(); + mod_timer(&rq_avg_timer, round_to_nw_start()); + return 0; +} + 
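The group sizing policy added above is easier to follow outside the patch context, so here is a minimal userspace model of the eval_need()/apply_need() decision: per-CPU busy hysteresis via an up/down threshold pair, immediate up-sizing, and down-sizing deferred by offline_delay_ms. Everything in this sketch (struct grp, model_eval_need(), the single threshold pair) is invented for illustration, and it deliberately omits the runqueue-average and task_thres inputs; only the decision flow mirrors the kernel code.

/*
 * Illustrative model, not part of the patch: per-CPU busy hysteresis plus a
 * deferred down-sizing window. Names are invented for the example.
 */
#include <stdbool.h>
#include <stdio.h>

struct grp {
        unsigned int num_cpus, min_cpus, max_cpus;
        unsigned int busy_up_thres, busy_down_thres;  /* one pair for brevity */
        unsigned int need_cpus;
        long long need_ts, offline_delay_ms;
        bool is_busy[8];
};

static unsigned int clamp_need(struct grp *g, unsigned int need)
{
        need = need < g->min_cpus ? g->min_cpus : need;
        return need > g->max_cpus ? g->max_cpus : need;
}

/* Returns the enforced core count for this sample taken at time 'now_ms'. */
static unsigned int model_eval_need(struct grp *g, const unsigned int busy[],
                                    long long now_ms)
{
        unsigned int cpu, need = 0;

        for (cpu = 0; cpu < g->num_cpus; cpu++) {
                if (busy[cpu] >= g->busy_up_thres)
                        g->is_busy[cpu] = true;
                else if (busy[cpu] < g->busy_down_thres)
                        g->is_busy[cpu] = false;
                need += g->is_busy[cpu];
        }

        if (need >= g->need_cpus) {
                /* Scaling up (or holding steady) takes effect immediately. */
                g->need_cpus = need;
                g->need_ts = now_ms;
        } else if (now_ms - g->need_ts >= g->offline_delay_ms) {
                /* Scaling down only once the delay window has elapsed. */
                g->need_cpus = need;
                g->need_ts = now_ms;
        }
        return clamp_need(g, g->need_cpus);
}

int main(void)
{
        struct grp g = { .num_cpus = 4, .min_cpus = 1, .max_cpus = 4,
                         .busy_up_thres = 60, .busy_down_thres = 30,
                         .need_cpus = 4, .offline_delay_ms = 100 };
        unsigned int light[4] = { 10, 10, 10, 10 };

        printf("t=0ms   need=%u\n", model_eval_need(&g, light, 0));   /* still 4 */
        printf("t=150ms need=%u\n", model_eval_need(&g, light, 150)); /* drops to 1 */
        return 0;
}

The asymmetry modeled here (instant up-sizing, delayed down-sizing) is what keeps short busy bursts from bouncing cores on and off; the kernel code additionally arms a timer so the deferred shrink is re-evaluated even without a new load sample.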
+late_initcall(core_ctl_init); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e893b0fcac6b..83da13b5f6b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2941,6 +2941,8 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, struct cpumask search_cpus; cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus); + cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask); + if (env->ignore_prev_cpu) cpumask_clear_cpu(env->prev_cpu, &search_cpus); @@ -3009,7 +3011,8 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) prev_cpu = env->prev_cpu; if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) || - unlikely(!cpu_active(prev_cpu))) + unlikely(!cpu_active(prev_cpu)) || + cpu_isolated(prev_cpu)) return false; if (task->ravg.mark_start - task->last_cpu_selected_ts >= @@ -7354,6 +7357,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); + if (cpumask_test_cpu(cpu, cpu_isolated_mask)) + continue; /* * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the @@ -7381,7 +7386,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + cpumask_t *cpus = sched_group_cpus(group); + + /* Revisit this later. This won't work for MT domain */ + if (!cpu_isolated(cpumask_first(cpus))) + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } @@ -7521,6 +7530,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, power_cost(i, 0), cpu_temp(i)); + if (cpu_isolated(i)) + continue; + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -7548,17 +7560,27 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + sgs->group_no_capacity = 1; + sgs->group_type = group_other; + sgs->group_weight = group->group_weight; + } else { + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / + sgs->group_capacity; - if (sgs->sum_nr_running) - sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->group_weight = group->group_weight; - sgs->group_weight = group->group_weight; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(group, sgs, env); + } - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs, env); + if (sgs->sum_nr_running) + sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; } #ifdef CONFIG_SCHED_HMP @@ -8601,6 +8623,9 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + idle_enter_fair(this_rq); /* @@ -8908,16 +8933,21 @@ static void nohz_balancer_kick(int type) return; } +void nohz_balance_clear_nohz_mask(int cpu) +{ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } +} + static inline void nohz_balance_exit_idle(int cpu) { if 
(unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* * Completely isolated CPUs don't ever set, so we must test. */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + nohz_balance_clear_nohz_mask(cpu); clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -8974,7 +9004,7 @@ void nohz_balance_enter_idle(int cpu) /* * If we're a completely isolated CPU, we don't play. */ - if (on_null_domain(cpu_rq(cpu))) + if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu)) return; cpumask_set_cpu(cpu, nohz.idle_cpus_mask); @@ -9003,7 +9033,13 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + cpumask_t avail_mask; + unsigned int available_cpus; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -9342,8 +9378,10 @@ void trigger_load_balance(struct rq *rq) { int type = NOHZ_KICK_ANY; - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. + */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 5002619961ce..a921498dbf09 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -2828,10 +2828,10 @@ void set_window_start(struct rq *rq) rq->curr->ravg.mark_start = rq->window_start; } -void migrate_sync_cpu(int cpu) +void migrate_sync_cpu(int cpu, int new_cpu) { if (cpu == sync_cpu) - sync_cpu = smp_processor_id(); + sync_cpu = new_cpu; } static void reset_all_task_stats(void) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index cfec881491ef..ba4403e910d8 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -265,8 +265,12 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { - /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + /* + * Try to pull RT tasks here if we lower this rq's prio and cpu is not + * isolated + */ + return rq->rt.highest_prio.curr > prev->prio && + !cpu_isolated(cpu_of(rq)); } static inline int rt_overloaded(struct rq *rq) @@ -1694,6 +1698,8 @@ static int find_lowest_rq_hmp(struct task_struct *task) for_each_sched_cluster(cluster) { cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); + cpumask_andnot(&candidate_mask, &candidate_mask, + cpu_isolated_mask); if (cpumask_empty(&candidate_mask)) continue; @@ -2282,7 +2288,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. 
*/ - if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || + cpu_isolated(cpu_of(rq))) return; queue_pull_task(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec7721112b05..41abb4dabeb7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1069,7 +1069,7 @@ extern void clear_boost_kick(int cpu); extern void clear_hmp_request(int cpu); extern void mark_task_starting(struct task_struct *p); extern void set_window_start(struct rq *rq); -extern void migrate_sync_cpu(int cpu); +extern void migrate_sync_cpu(int cpu, int new_cpu); extern void update_cluster_topology(void); extern void set_task_last_wake(struct task_struct *p, u64 wallclock); extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); @@ -1424,7 +1424,7 @@ static inline void clear_boost_kick(int cpu) { } static inline void clear_hmp_request(int cpu) { } static inline void mark_task_starting(struct task_struct *p) { } static inline void set_window_start(struct rq *rq) { } -static inline void migrate_sync_cpu(int cpu) { } +static inline void migrate_sync_cpu(int cpu, int new_cpu) {} static inline void update_cluster_topology(void) { } static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) { } static inline void set_task_last_switch_out(struct task_struct *p, @@ -1953,6 +1953,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); +extern void nohz_balance_clear_nohz_mask(int cpu); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); diff --git a/kernel/smp.c b/kernel/smp.c index abdc48cd79a3..b2ec21c5c9d6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -766,8 +766,8 @@ void wake_up_all_idle_cpus(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; - - wake_up_if_idle(cpu); + if (!cpu_isolated(cpu)) + wake_up_if_idle(cpu); } preempt_enable(); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa909f9fd559..1b0117198a08 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -880,7 +880,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + timer->state |= HRTIMER_STATE_ENQUEUED; return timerqueue_add(&base->active, &timer->node); } @@ -900,11 +900,9 @@ static void __remove_hrtimer(struct hrtimer *timer, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; - timer->state = newstate; - if (!(state & HRTIMER_STATE_ENQUEUED)) - return; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); @@ -921,6 +919,13 @@ static void __remove_hrtimer(struct hrtimer *timer, if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); #endif + +out: + /* + * We need to preserve PINNED state here, otherwise we may end up + * migrating pinned hrtimers as well. 
+ */ + timer->state = newstate | (timer->state & HRTIMER_STATE_PINNED); } /* @@ -949,6 +954,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest state = HRTIMER_STATE_INACTIVE; __remove_hrtimer(timer, base, state, reprogram); + timer->state &= ~HRTIMER_STATE_PINNED; return 1; } return 0; @@ -1002,6 +1008,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, timer_stats_hrtimer_set_start_info(timer); + /* Update pinned state */ + timer->state &= ~HRTIMER_STATE_PINNED; + timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT; + leftmost = enqueue_hrtimer(timer, new_base); if (!leftmost) goto unlock; @@ -1176,8 +1186,8 @@ bool hrtimer_active(const struct hrtimer *timer) cpu_base = READ_ONCE(timer->base->cpu_base); seq = raw_read_seqcount_begin(&cpu_base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || - cpu_base->running == timer) + if (((timer->state & ~HRTIMER_STATE_PINNED) != + HRTIMER_STATE_INACTIVE) || cpu_base->running == timer) return true; } while (read_seqcount_retry(&cpu_base->seq, seq) || @@ -1614,13 +1624,17 @@ static void init_hrtimers_cpu(int cpu) hrtimer_init_hres(cpu_base); } -#ifdef CONFIG_HOTPLUG_CPU - +#if defined(CONFIG_HOTPLUG_CPU) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) + struct hrtimer_clock_base *new_base, + bool remove_pinned) { struct hrtimer *timer; struct timerqueue_node *node; + struct timerqueue_head pinned; + int is_pinned; + + timerqueue_init_head(&pinned); while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); @@ -1633,6 +1647,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + + is_pinned = timer->state & HRTIMER_STATE_PINNED; + if (!remove_pinned && is_pinned) { + timerqueue_add(&pinned, &timer->node); + continue; + } + timer->base = new_base; /* * Enqueue the timers on the new cpu. 
This does not @@ -1644,17 +1665,23 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, */ enqueue_hrtimer(timer, new_base); } + + /* Re-queue pinned timers for non-hotplug usecase */ + while ((node = timerqueue_getnext(&pinned))) { + timer = container_of(node, struct hrtimer, node); + + timerqueue_del(&pinned, &timer->node); + enqueue_hrtimer(timer, old_base); + } } -static void migrate_hrtimers(int scpu) +static void __migrate_hrtimers(int scpu, bool remove_pinned) { struct hrtimer_cpu_base *old_base, *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); - - local_irq_disable(); + local_irq_save(flags); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); /* @@ -1666,7 +1693,7 @@ static void migrate_hrtimers(int scpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + &new_base->clock_base[i], remove_pinned); } raw_spin_unlock(&old_base->lock); @@ -1674,7 +1701,20 @@ static void migrate_hrtimers(int scpu) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); +} + +static void migrate_hrtimers(int scpu) +{ + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + __migrate_hrtimers(scpu, true); +} + +void hrtimer_quiesce_cpu(void *cpup) +{ + __migrate_hrtimers(*(int *)cpup, false); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 51896272fcde..0efb3916f5a4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1620,56 +1620,86 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -#ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +#if defined(CONFIG_HOTPLUG_CPU) +static void migrate_timer_list(struct tvec_base *new_base, + struct hlist_head *head, bool remove_pinned) { struct timer_list *timer; int cpu = new_base->cpu; + struct hlist_node *n; + int is_pinned; - while (!hlist_empty(head)) { - timer = hlist_entry(head->first, struct timer_list, entry); - /* We ignore the accounting on the dying cpu */ - detach_timer(timer, false); + hlist_for_each_entry_safe(timer, n, head, entry) { + is_pinned = timer->flags & TIMER_PINNED_ON_CPU; + if (!remove_pinned && is_pinned) + continue; + + detach_if_pending(timer, get_timer_base(timer->flags), false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } -static void migrate_timers(int cpu) +static void __migrate_timers(int cpu, bool wait, bool remove_pinned) { struct tvec_base *old_base; struct tvec_base *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(cpu)); old_base = per_cpu_ptr(&tvec_bases, cpu); new_base = get_cpu_ptr(&tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
*/ - spin_lock_irq(&new_base->lock); + spin_lock_irqsave(&new_base->lock, flags); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - BUG_ON(old_base->running_timer); + if (wait) { + /* Ensure timers are done running before continuing */ + while (old_base->running_timer) { + spin_unlock(&old_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); + cpu_relax(); + spin_lock_irqsave(&new_base->lock, flags); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + } + } else { + BUG_ON(old_base->running_timer); + } for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); + migrate_timer_list(new_base, old_base->tv1.vec + i, + remove_pinned); for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); + migrate_timer_list(new_base, old_base->tv2.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv3.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv4.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv5.vec + i, + remove_pinned); } - old_base->active_timers = 0; - old_base->all_timers = 0; - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); put_cpu_ptr(&tvec_bases); } +/* Migrate timers from 'cpu' to this_cpu */ +static void migrate_timers(int cpu) +{ + BUG_ON(cpu_online(cpu)); + __migrate_timers(cpu, false, true); +} + +void timer_quiesce_cpu(void *cpup) +{ + __migrate_timers(*(int *)cpup, true, false); +} + static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9270e1ac6460..49fa2e6eea98 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,5 +15,3 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); -EXPORT_TRACEPOINT_SYMBOL(core_ctl_set_busy); -EXPORT_TRACEPOINT_SYMBOL(core_ctl_eval_need); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 029da92fb712..7f21591c8ec5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -13,6 +13,7 @@ #include <linux/mm.h> #include <linux/cpu.h> +#include <linux/device.h> #include <linux/nmi.h> #include <linux/init.h> #include <linux/module.h> @@ -95,6 +96,7 @@ static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(unsigned int, watchdog_en); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); @@ -586,9 +588,17 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) sched_setscheduler(current, policy, ¶m); } -static void watchdog_enable(unsigned int cpu) +/* Must be called with hotplug lock (lock_device_hotplug()) held. 
*/ +void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = raw_cpu_ptr(&watchdog_en); + + lock_device_hotplug_assert(); + + if (*enabled) + return; + *enabled = 1; /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -606,9 +616,17 @@ static void watchdog_enable(unsigned int cpu) __touch_watchdog(); } -static void watchdog_disable(unsigned int cpu) +/* Must be called with hotplug lock (lock_device_hotplug()) held. */ +void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = raw_cpu_ptr(&watchdog_en); + + lock_device_hotplug_assert(); + + if (!*enabled) + return; + *enabled = 0; watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); diff --git a/mm/vmstat.c b/mm/vmstat.c index ca75eeecbad1..77b8eabd5446 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1390,7 +1390,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats(true)) { + if (refresh_cpu_vm_stats(true) && !cpu_isolated(smp_processor_id())) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1402,7 +1402,8 @@ static void vmstat_update(struct work_struct *w) } else { /* * We did not update any counters so the app may be in - * a mode where it does not cause counter updates. + * a mode where it does not cause counter updates or the cpu + * was isolated. * We may be uselessly running vmstat_update. * Defer the checking for differentials to the * shepherd thread on a different processor. @@ -1469,7 +1470,7 @@ static void vmstat_shepherd(struct work_struct *w) get_online_cpus(); /* Check processors whose vmstat worker threads have been disabled */ for_each_cpu(cpu, cpu_stat_off) - if (need_update(cpu) && + if (!cpu_isolated(cpu) && need_update(cpu) && cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) queue_delayed_work_on(cpu, vmstat_wq, |
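To summarize the hrtimer portion of the series: the timer state byte now carries a PINNED bit alongside ENQUEUED, __remove_hrtimer() preserves that bit, hrtimer_active() masks it out, and hrtimer_quiesce_cpu() migrates only the unpinned timers. Below is a small standalone model of that bookkeeping; all macro and function names are invented for the example, and it glosses over locking and the per-clock bases, so it is a sketch of the idea rather than the kernel implementation.

/*
 * Illustrative model, not from the patch: the state byte carries two
 * independent facts -- "enqueued" and "pinned to this CPU" -- and dequeueing
 * must clear only the former so a later quiesce pass can still tell pinned
 * timers apart.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define STATE_INACTIVE  0x00
#define STATE_ENQUEUED  0x01
#define PINNED_SHIFT    1
#define STATE_PINNED    (1 << PINNED_SHIFT)

struct model_timer {
        unsigned char state;
};

/* Mirrors the start/enqueue path: record pinned-ness, then OR in ENQUEUED. */
static void model_enqueue(struct model_timer *t, bool pinned)
{
        t->state &= ~STATE_PINNED;
        t->state |= (unsigned char)(!!pinned) << PINNED_SHIFT;
        t->state |= STATE_ENQUEUED;
}

/* Mirrors __remove_hrtimer(): take the new state but preserve PINNED. */
static void model_remove(struct model_timer *t, unsigned char newstate)
{
        t->state = newstate | (t->state & STATE_PINNED);
}

/* Mirrors the activity test: ignore the PINNED bit when checking. */
static bool model_active(const struct model_timer *t)
{
        return (t->state & ~STATE_PINNED) != STATE_INACTIVE;
}

int main(void)
{
        struct model_timer t = { .state = STATE_INACTIVE };

        model_enqueue(&t, true);
        assert(model_active(&t));

        model_remove(&t, STATE_INACTIVE);
        assert(!model_active(&t));       /* no longer counted as active ...   */
        assert(t.state & STATE_PINNED);  /* ... yet still known to be pinned  */

        printf("final state: 0x%02x\n", t.state);
        return 0;
}

Running this prints final state: 0x02: the timer is inactive for all activity checks, but its pinned-ness survives removal, which is exactly what lets the non-hotplug quiesce path in the patch requeue pinned timers back onto the source CPU while migrating everything else.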
