summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinux Build Service Account <lnxbuild@localhost>2016-09-30 11:33:23 -0600
committerLinux Build Service Account <lnxbuild@localhost>2016-09-30 11:33:23 -0600
commit89ca2e2c45e6e48a2d47ebb5bf845bfec88e4319 (patch)
treea4a13dd84e201ffd7398caffd3043265621e1aa4 /kernel
parent8d9657e7e077f13fd3e4e8091d3f1c5ca81f6649 (diff)
parent46692be6dd06ce405cb09cf8338b1f0edbbfb295 (diff)
Promotion of kernel.lnx.4.4-160930.
CRs Change ID Subject -------------------------------------------------------------------------------------------------------------- 1054226 Ifefb2f1cf6c24af7bc46fc62797955b8c8ad5815 perf: Add cpu isolation awareness 1055668 Ibb8afffdc1e4780a48d085918cb6d6cf84cc0dba msm: camera: Export IR LED device to mm-qcamera-daemon 1065513 I45690b239c73f636538b864f0c4a7e539a02eedb input: touchscreen: Change dev_pm_ops for Goodix driver 1069068 Ie19fececd9d2bc6cd3328a6c63c956bcc9eed9a6 arm64: process: Reduce the no. of bytes of data around r 1064336 I18a27267eebdca2c87bf4bffc11a120822cdc7c2 msm: ipa3: hal: change FnR prints to low 1063261 Id38c8e21a853907c884bccd2978f2fd0a547a1ca wil6210: fix wiphy registration sequence 1067981 I3f78196927501f582c36d5815096581185d797b4 soc: qcom: glink: Fix uninitialzed return value. 1065513 I3a00fb46106f859128f0fa9b8c99b5d6ba24bc7b input: touchscreen: Code clean-up for Goodix driver 1054226 I2c62af441fb9e5ba9f29719853a63e4c8f2d031b defconfig: msm: Disable core control helper 1054000 I783b7df9c7e0253e5dc88bd60e0b5300e26fba56 drivers: soc: Enable APR driver to use audio notifier 1044635 1053246 I8c5b8ffc62d34a44bfb47ec4f11477d4320d30a8 misc: qcom: qdsp6v2: SSR recovery support for NT decoder 1001175 I7d1e4d5cc421b800d2f00b6d23f6ff19ba4c4c7d msm: pcie: configurable PERST propagation delay 1067178 I9951f061ad22cc91eba1c75aba3bdfbdde904cb9 ASoC: wcd934x: Add dapm ignore suspend for backend dais 1068464 I479e23db73a64e0fc1371e6b5abfaf1c8969954b ARM: dts: msm: correct PM8005 S1 min/max voltage for msm 1053827 Icee01f6ba95e469acac9eed6bf2fdbc83947f5a8 msm: kgsl: Revisit the GPU snapshot dumping 1068464 I8539ca3a3456b9562e7ff0e48fd7824c15cea68f regulator: cpr3-mmss-regulator: update reference voltage 1066695 I11e71639291479d544849d0f8672b9384fa34d0a ARM: dts: msm: Add SMP2P entries for MSMTRITON 1064336 Ie1f0f48ffc1fd67fc8a2074d3d334fb8cd29c99f msm: ipa3: change FnR prints to low 1054000 I15325c1385eaa0f0cca2c07130f2b4a997d98e1f drivers: soc: Add Audio 
Notifier, PDR, and SSR drivers 1071217 I37e6f76b60ef0085d102c5d98179b467f6b77dee ARM: dts: msm: Add bus name to venus pil for MSMCOBALT 1060610 I88b6c0748e6683b2f11b751840ab26e3ed397b70 ASoC: wcd9335: add handset speaker gain mixer control 1054000 Ief6e89b003aa1e2b02f33e21e3cb17f8731425ad ASoC: msmcobalt: Add Audio Notifier, PDR, and SSR to Kco 1054226 I62acddeb707fc7d5626580166b3466e63f45fd89 core_ctl_helper: Remove code since it is not used anymor 1056661 Ida0d2d010f7bd226d7e2221f63e64a1d7e5a9075 ARM: dts: msm: Remove 710Mhz frequency for A540v2 GPU. 1063261 I51e58438672a45d210df5db3ac813e656cb525df wil6210: change HALP logging category to IRQ 1062271 If462fe3d82f139d72547f82dc7eb564f83cb35bf ASoC: msm: initialize the params array before using it 1054226 I4f1514ba5bac2e259a1105fcafb31d6a92ddd249 sched/core_ctl: Integrate core control with cpu isolatio 1063261 Ibc1e6dc2994268a60384b7d9bd459abc3791a0c4 wil6210: fix protection of wil->scan_request 1066424 Ic6dd2d1e7f829630dc6eae5ff74fae04f7fc7f9b ARM: dts: msm: enable more FG interrupts for PMICOBALT 1066695 I82e992df1bfa1113843a0772ca8b88e48050dd8d ARM: dts: msm: Add glink_pkt devices for MSMTRITON 1069455 Ibae066276b099ffb78c72a890a689f83e4df56a9 thermal: tsens: Update sensor ID index 1054226 Ice1a9503666a2b720bdb324289ca55ceb33097cd cpumask: Add cpu isolation support 1054226 I96505aeb9d07a6fa3a2c28648ffa299e0cfa2e41 sched/core_ctl: Refactor cpu data 1054226 Ib911a0d34c250c4df020bdb265b92d2b8df8db93 timer: Do not require CPUSETS to be enabled for migratio 1054226 Ie4c6cb1496ae3490d81681f1ad51c8103caa0014 soc: qcom: watchdog_v2: Add support for cpu isolation 1068464 I3fbfa4c1fc5a6b4f30f8acaf659d2abcc05a7d16 ARM: dts: msm: modify VDD_GFX CPR ceiling voltages for m 1066695 I1aaeec4eea40fedbadf8b6008233dbd6ce5b3312 ARM: dts: msm: Add ipc-spinlock entry for MSMTRITON 1054226 Icc4d1c183e993b4b3c9b96ec9779c234e73ecab7 hrtimer: make sure PINNED flag is cleared after removing 1069455 
I63e2a0a29f6bbe787fa10170c28569cf692d2807 ARM: dts: msm: Update TSENS sensor ID for MSMCOBALT 1070187 Id2de8e2ac94476c1a4927f719f2987a31d692ab5 ARM: dts: msm: modify VDD_APC0/1 CPR voltages for msmcob 1065513 I0a9037eac6e30a6319919043dd2ef1c226663af9 input: touchscreen: Remove irq polling from Goodix drive 1065513 I5132854367330a9b47f678409cbe6a45f2b5abb3 msm: reap unused kernel files 1063261 I20bc8d4b2b58fc3235ec3fe778738055d7535276 wil6210: extract firmware capabilities from FW file 1069380 Ic332f87666e405edbf3511671828ba824af1f3a2 ARM: dts: msm: Fix smem rpm xprt entry for msmfalcon 1054226 I8624e0659b86b7b8fa425a3fafdb0784fe005124 timer: create timer_quiesce_cpu() to isolate CPU from ti 1056910 I09fcc019133f4d37b7be3287da8e0733e40fc0ac sched: constrain HMP scheduler tunable range with in bet 1063261 Ib5c55a37208d76505658635b12afc88552d5a220 wil6210: support rx key setting for all TIDs 1071204 Ib44789559c69e5808ed362cf9191486c93b2b66e msm: mink: Accept zero args for invoke command 1071752 I1dddc7df26caa1556e57128603afd32b2613ebde soc: qcom: pil: add PBL spare error status 1066569 I99ce1e5940506a5e65debfe822460c210a276b00 thermal: adc_tm: Initialize ADC parameters 1054000 I2ab027d4a6e2cb98df5358e24f6bebacd9aecca7 drivers: soc: Add Kconfigs for Audio Notifier, PDR, and 1002389 Id74650d5c4aaf9f84a56372f60ff5a40374e8f7b diag: Fix possible buffer corruption by proper typecasti 1054226 I5975f1e5d7a1947dc5ee1cf8a0c16ec88b0fc6fb defconfig: msmcortex: Enable core control 1065513 I3ade13181957d327ad9d0266b1999a4b0f2d8d1a input: touchscreen: Add threaded irq support to Goodix d 1069084 I916e37bd79a6645bdc186a78a65051ce1c4dd475 soc: qcom: Listen to SUBSYS_AFTER_SHUTDOWN notification 1054226 I401de0b52fa6d20573187265ee56edd543b1419e vmstat: Add cpu isolation awareness 1066422 Id9f28a0eeb2a904aca41eb46d0215d80287e0b88 qcom-charger: fg-util: add float decode function 1071363 I542fab59eadbea404c0001d25315172cc993488b ASoC: wcd934x: Avoid pop during dsd path tear down 1054226 
I51259ea41e3bd5cdba50b718201a6840174a7224 hrtimer: create hrtimer_quiesce_cpu() to isolate CPU fro 1054226 I5d849dfd29aa5bb594454473768d7db1da258028 sched/tick: Ensure timers does not get queued on isolate 1054226 I7b50778615541a64f9956573757c7f28748c4f69 irq: Make irq affinity function cpu isolation aware 1068946 I6943b7f5565ad95eddb9e3d30de5efbc47106e3d qpnp-smb2: support batteryless platforms 1054226 I83e9fbb800df259616a145d311b50627dc42a5ff pmqos: Enable cpu isolation awareness 1054226 I632f37874ef79887ee1202a028ef734f392d6ed0 hrtimer.h: prevent pinned timer state from breaking inac 1066695 I53657de1a41e727b29793f02c7f2c1a43db1c96c ARM: dts: msm: Add G-Link SSR entries for MSMTRITON 1055668 I63da161f90bce2c33d1e7e9d5822c8635e307fd5 msm: camera: Export IR CUT device to mm-qcamera-daemon 1054226 I88a728ee1d54aaa887fab52e5e40d1d4e4fc69ca watchdog: Add support for cpu isolation 1070189 I791941fbede4b136c3f24d15b7fb0b60dd5477e6 ASoC: wcd934x: Check for null pointer before access 1066695 I6ee171881943d8ab77445ede1c6ee714ed171d4d ARM: dts: msm: Add SMEM entry for MSMTRITON 1054226 I370e404001344e635a663822b07557abbe0f6f52 timer: Add function to migrate timers 1046649 I941f91eeba01f4e7aa5427056bc57875e7edf197 msm: kgsl: Add memory and periphery clock control for A5 1068888 I5637e52be59ea9504ea6ae317394bef0c28c7865 net: ipc_router: fix NULL pointer de-reference issue 1066695 I276db2a07870864fca046627a615a30bc4e3936e ARM: dts: msm: Add IPC Router devices for MSMTRITON 1067981 I82b08ff548a9abb0538a0ba24f699a99547ec7b7 soc: qcom: glink_ssr: Fix uninitialized variable 1066695 I1e59ec8028e128a764d3a79d446b5e8d650937b1 ARM: dts: msm: Add G-Link SMEM Transport entries for MSM 1054226 I24d6e91b6dff468c640c2fe3a37a7f31b6f0c79a timer: Ensure timers are not running before migrating 1054226 I65943d8e4a9eac1f9f5a40ad5aaf166679215f48 trace: Move core control trace events to scheduler 1065513 I280f2201c69838ad4da8eb94e9f10768f54ed457 input: touchscreen: Fix issues in suspend path 
1065513 I6c18e153ddf18667ca83d47df20c71bce6dbfa21 input: touchscreen: Remove redundant code from Goodix dr 1055668 I2e04fa47efd1454bb487eca67bd9ceaeab3e9edf msm: camera: Add a driver to control IR LED device 1054226 Ia78e701468ea3828195c2a15c9cf9fafd099804a core_ctrl: Move core control into kernel 1054226 If2d30000f068afc50db953940f4636ef6a089b24 sched/core: Add trace point for cpu isolation 1057562 I76c9a9e44755a4a77e6cffb1dc07f5b28c8b34b8 Migrate mpq demux driver from kernel 3.18 to 4.4 1036232 If6c273d9a86f9fc4bc841388b11b96c385dc64f4 net: cnss: Add support to get fw files for QCA SDIO targ 1068464 I2b0a8e5353c9bce25c965a8b6ead7494454466c9 ARM: dts: msm: modify VDD_GFX CPR voltage adjustments fo 1065513 Ic2b1b2562b63ccecdf15bdc64ad7e45996d196d3 input: touchscreen: Add debugfs support for suspend/resu 1070872 I6cbe4167ab9d980b75f4fefdd4add0d8e8adaef8 ARM: dts: msm: add slimbus7 and slimbus8 cpu dais for ms 1054226 If3b3770e547971809e789ea7c8033c48ec2aa92d hrtimer: update timer->state with 'pinned' information 1064336 I5cfadb3ee7cb339b89b0c428bae46d3802476eb9 msm: ipa3: increase SSR tag timeout 1055668 I30d1c4e6c40b8e58a70f06db9e05231b4c7f676f msm: camera: Add a driver to control IR CUT device 1054226 I0bbddb56238c2958c5987877c5bfc3e79afa67cc sched: add cpu isolation support 1051762 Ife6146d28c8bc834a79e861959eca03e58e12d5e ASoC: msm: qdsp6v2: Change device switch handling 1063261 I95c14c0fe7a33c078eb7d9aa44dd97a64f9b0fae wil6210: align to latest auto generated wmi.h 1063261 I169e0c94edf5df31336af1ede36900ec337f4314 wil6210: align to latest auto generated wmi.h 1071938 I7ab180e06ececf8136903ee04565b8b4a2bf3524 icnss: update logs for QMI rejected messages 1057065 Ie6234ae30ad47a063982e5cc50f4ecedf1f61de2 msm: pcie: verify EP is accessible before conf restore 993625 I2c99df5de44a6fd924ce7f5921db0e1cf3ba5d11 msm: pil: Adding function name and buffer size informati 1065983 I12154b0aa315fde6dd92267d4c8f4a78a6f0236f ASoC: msmcobalt: send ANC config for WCD9340 codec 
1071464 Ic3ef2229fa8552301e09dfb912e79e044a81324f usb: dwc3: fix overriding core clock rate to default max 1054000 Ib88a71e2fdb2b58fd5f87a65cb7d3253884f2d97 ASoC: msmcobalt: Enable msmcobalt to use audio notifier 1063261 Ifb92501aab14843309fed8e1214a867b2ccccfce wil6210: Fix driver down flow 1065983 I8c83f6305dbc0a40b67bf2ffd53d37a0abdcf953 ASoC: wcd934x: enable rate converter clock for AANC 1054226 I07702bb5b738c1c75c49a2ca4cb08be0231ccb12 smp: Do not wake up all idle CPUs 1063261 Id17271823d167677a323dd1f52c7de4c6025b56e wil6210: prevent usage of incorrect TX hwtail 1063261 I79f8522ae84dd209cb98c3bbc52cfaeb199dd342 wil6210: fix stop p2p device handling 1057562 Ia50bd897f6bf4c0ea7adc27d53a657090a09e229 Migrate demux driver from kernel 3.18 to 4.4 1066563 I0e21c5966e0072eab826c92fc332c54e11cb0b23 thermal: tsens: Update readl call in msm_tsens_get_temp( 1054226 I6a13e8dda99130ca794e5b6f51600f4c57a3e921 drivers/base: cpu: Add node for cpu isolation Change-Id: If82c9559e44520d270bc164fcb86b382b71301ff CRs-Fixed: 1036232, 1069455, 1071204, 1001175, 1053827, 1057562, 1060610, 1068946, 1071363, 1071938, 1068888, 1069380, 1069068, 1065983, 1055668, 1066695, 1067981, 1063261, 1064336, 1067178, 1066422, 1066424, 1065513, 1066569, 1070189, 1057065, 1068464, 1071217, 1066563, 1069084, 1070187, 993625, 1053246, 1070872, 1051762, 1046649, 1056661, 1062271, 1002389, 1071752, 1056910, 1054000, 1044635, 1054226, 1071464
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpu.c17
-rw-r--r--kernel/events/core.c3
-rw-r--r--kernel/irq/cpuhotplug.c11
-rw-r--r--kernel/power/qos.c7
-rw-r--r--kernel/sched/Makefile1
-rw-r--r--kernel/sched/core.c295
-rw-r--r--kernel/sched/core_ctl.c1055
-rw-r--r--kernel/sched/core_ctl.h24
-rw-r--r--kernel/sched/fair.c74
-rw-r--r--kernel/sched/hmp.c33
-rw-r--r--kernel/sched/rt.c13
-rw-r--r--kernel/sched/sched.h5
-rw-r--r--kernel/smp.c4
-rw-r--r--kernel/sysctl.c23
-rw-r--r--kernel/time/hrtimer.c74
-rw-r--r--kernel/time/timer.c68
-rw-r--r--kernel/trace/power-traces.c2
-rw-r--r--kernel/watchdog.c22
18 files changed, 1622 insertions, 109 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1cfd381642da..3c97f5b88a07 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -768,6 +768,10 @@ static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
EXPORT_SYMBOL(cpu_active_mask);
+static DECLARE_BITMAP(cpu_isolated_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_isolated_mask = to_cpumask(cpu_isolated_bits);
+EXPORT_SYMBOL(cpu_isolated_mask);
+
void set_cpu_possible(unsigned int cpu, bool possible)
{
if (possible)
@@ -802,6 +806,14 @@ void set_cpu_active(unsigned int cpu, bool active)
cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
}
+/*
+ * Set or clear @cpu's bit in the bitmap that backs cpu_isolated_mask.
+ */
+void set_cpu_isolated(unsigned int cpu, bool isolated)
+{
+	struct cpumask *isolated_cpus = to_cpumask(cpu_isolated_bits);
+
+	if (!isolated) {
+		cpumask_clear_cpu(cpu, isolated_cpus);
+		return;
+	}
+
+	cpumask_set_cpu(cpu, isolated_cpus);
+}
+
void init_cpu_present(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_present_bits), src);
@@ -817,6 +829,11 @@ void init_cpu_online(const struct cpumask *src)
cpumask_copy(to_cpumask(cpu_online_bits), src);
}
+/* Overwrite the whole isolated-CPU bitmap with a copy of @src. */
+void init_cpu_isolated(const struct cpumask *src)
+{
+	struct cpumask *isolated_cpus = to_cpumask(cpu_isolated_bits);
+
+	cpumask_copy(isolated_cpus, src);
+}
+
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d6ec580584b6..5beb88f11671 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3386,7 +3386,8 @@ static int perf_event_read(struct perf_event *event, bool group)
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (event->state == PERF_EVENT_STATE_ACTIVE &&
+ !cpu_isolated(event->oncpu)) {
struct perf_read_data data = {
.event = event,
.group = group,
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 011f8c4c63da..104432f3d311 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -11,6 +11,7 @@
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/irq.h>
+#include <linux/cpumask.h>
#include "internals.h"
@@ -20,6 +21,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
const struct cpumask *affinity = d->common->affinity;
struct irq_chip *c;
bool ret = false;
+ struct cpumask available_cpus;
/*
* If this is a per-CPU interrupt, or the affinity does not
@@ -29,8 +31,15 @@ static bool migrate_one_irq(struct irq_desc *desc)
!cpumask_test_cpu(smp_processor_id(), affinity))
return false;
+ cpumask_copy(&available_cpus, affinity);
+ cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask);
+ affinity = &available_cpus;
+
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- affinity = cpu_online_mask;
+ cpumask_andnot(&available_cpus, cpu_online_mask,
+ cpu_isolated_mask);
+ if (cpumask_empty(affinity))
+ affinity = cpu_online_mask;
ret = true;
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 8ecc7b3f7dd9..69c32c42080f 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -45,6 +45,7 @@
#include <linux/seq_file.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
+#include <linux/cpumask.h>
#include <linux/uaccess.h>
#include <linux/export.h>
@@ -447,6 +448,9 @@ EXPORT_SYMBOL_GPL(pm_qos_request);
int pm_qos_request_for_cpu(int pm_qos_class, int cpu)
{
+ /* An isolated CPU reports INT_MAX instead of its per-cpu target. */
+ if (cpu_isolated(cpu))
+ return INT_MAX;
+
return pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu];
}
EXPORT_SYMBOL(pm_qos_request_for_cpu);
@@ -469,6 +473,9 @@ int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask)
val = c->default_value;
for_each_cpu(cpu, mask) {
+ if (cpu_isolated(cpu))
+ continue;
+
switch (c->type) {
case PM_QOS_MIN:
if (c->target_per_cpu[cpu] < val)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 1f159743ebfc..508b65690288 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7474463b9835..609aa2e588d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
+#include <linux/irq.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -84,6 +85,7 @@
#endif
#include "sched.h"
+#include "core_ctl.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
@@ -1229,6 +1231,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
struct rq *rq;
unsigned int dest_cpu;
int ret = 0;
+ cpumask_t allowed_mask;
rq = task_rq_lock(p, &flags);
@@ -1244,16 +1247,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
+
+ dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask);
if (dest_cpu >= nr_cpu_ids) {
- ret = -EINVAL;
- goto out;
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (dest_cpu >= nr_cpu_ids) {
+ ret = -EINVAL;
+ goto out;
+ }
+ cpumask_copy(&allowed_mask, new_mask);
}
do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
+ if (cpumask_test_cpu(task_cpu(p), &allowed_mask))
goto out;
if (task_running(rq, p) || p->state == TASK_WAKING) {
@@ -1577,12 +1586,13 @@ EXPORT_SYMBOL_GPL(kick_process);
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
-static int select_fallback_rq(int cpu, struct task_struct *p)
+static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
{
int nid = cpu_to_node(cpu);
const struct cpumask *nodemask = NULL;
enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
+ int isolated_candidate = -1;
/*
* If the node that the cpu is on has been offlined, cpu_to_node()
@@ -1598,6 +1608,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
continue;
if (!cpu_active(dest_cpu))
continue;
+ if (cpu_isolated(dest_cpu))
+ continue;
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return dest_cpu;
}
@@ -1610,6 +1622,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
continue;
if (!cpu_active(dest_cpu))
continue;
+ if (cpu_isolated(dest_cpu)) {
+ if (allow_iso)
+ isolated_candidate = dest_cpu;
+ continue;
+ }
+ goto out;
+ }
+
+ if (isolated_candidate != -1) {
+ dest_cpu = isolated_candidate;
goto out;
}
@@ -1655,6 +1677,8 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
+ bool allow_isolated = (p->flags & PF_KTHREAD);
+
lockdep_assert_held(&p->pi_lock);
if (p->nr_cpus_allowed > 1)
@@ -1671,8 +1695,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
* not worry about this generic constraint ]
*/
if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
- !cpu_online(cpu)))
- cpu = select_fallback_rq(task_cpu(p), p);
+ !cpu_online(cpu)) ||
+ (cpu_isolated(cpu) && !allow_isolated))
+ cpu = select_fallback_rq(task_cpu(p), p, allow_isolated);
return cpu;
}
@@ -2956,7 +2981,7 @@ void sched_exec(void)
if (dest_cpu == smp_processor_id())
goto unlock;
- if (likely(cpu_active(dest_cpu))) {
+ if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) {
struct migration_arg arg = { p, dest_cpu };
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -3066,6 +3091,8 @@ void scheduler_tick(void)
if (curr->sched_class == &fair_sched_class)
check_for_migration(rq, curr);
+
+ core_ctl_check(wallclock);
}
#ifdef CONFIG_NO_HZ_FULL
@@ -5414,18 +5441,22 @@ static struct task_struct fake_task = {
};
/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
+ * Migrate all tasks from the rq (pinned kthreads are skipped unless
+ * migrate_pinned_tasks is set); sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
*
* Called with rq->lock held even though we'er in stop_machine() and
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
int dest_cpu;
+ unsigned int num_pinned_kthreads = 1; /* this thread */
+ cpumask_t avail_cpus;
+
+ cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
/*
* Fudge the rq selection such that the below task selection loop
@@ -5447,10 +5478,12 @@ static void migrate_tasks(struct rq *dead_rq)
for (;;) {
/*
- * There's this thread running, bail when that's the only
- * remaining thread.
+ * There's this thread running + pinned threads, bail when
+ * those are the only remaining threads.
*/
- if (rq->nr_running == 1)
+ if ((migrate_pinned_tasks && rq->nr_running == 1) ||
+ (!migrate_pinned_tasks &&
+ rq->nr_running == num_pinned_kthreads))
break;
/*
@@ -5461,6 +5494,13 @@ static void migrate_tasks(struct rq *dead_rq)
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
+ if (!migrate_pinned_tasks && next->flags & PF_KTHREAD &&
+ !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) {
+ lockdep_unpin_lock(&rq->lock);
+ num_pinned_kthreads += 1;
+ continue;
+ }
+
/*
* Rules for changing task_struct::cpus_allowed are holding
* both pi_lock and rq->lock, such that holding either
@@ -5486,7 +5526,7 @@ static void migrate_tasks(struct rq *dead_rq)
}
/* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+ dest_cpu = select_fallback_rq(dead_rq->cpu, next, false);
rq = __migrate_task(rq, next, dest_cpu);
if (rq != dead_rq) {
@@ -5502,6 +5542,222 @@ static void migrate_tasks(struct rq *dead_rq)
rq->stop = stop;
}
+
+static void set_rq_online(struct rq *rq);
+static void set_rq_offline(struct rq *rq);
+
+/*
+ * Stopper callback that performs the isolation work on the CPU it runs on;
+ * sched_isolate_cpu() queues it on the target CPU through stop_cpus().
+ *
+ * @data: unused.
+ *
+ * Always returns 0.
+ */
+int do_isolation_work_cpu_stop(void *data)
+{
+ unsigned long flags;
+ unsigned int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ watchdog_disable(cpu);
+
+ irq_migrate_all_off_this_cpu();
+
+ sched_ttwu_pending();
+ /* Update our root-domain */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+
+ /*
+ * migrate_pinned_tasks == false: kthreads whose cpus_allowed has no
+ * online, non-isolated CPU are left on this runqueue.
+ */
+ migrate_tasks(rq, false);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ /*
+ * We might have been in tickless state. Clear NOHZ flags to avoid
+ * us being kicked for helping out with balancing
+ */
+ nohz_balance_clear_nohz_mask(cpu);
+ return 0;
+}
+
+/*
+ * Stopper callback for unisolation: re-enables the watchdog on the CPU it
+ * runs on. @data is unused; always returns 0.
+ */
+int do_unisolation_work_cpu_stop(void *data)
+{
+	unsigned int this_cpu = smp_processor_id();
+
+	watchdog_enable(this_cpu);
+
+	return 0;
+}
+
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd);
+
+/*
+ * Re-run init_sched_groups_capacity() for every sched domain of @cpu, and
+ * additionally for each domain's group balance CPU when that differs from
+ * @cpu. The walk is done with sched_domains_mutex held and inside an RCU
+ * read-side section.
+ */
+static void sched_update_group_capacities(int cpu)
+{
+ struct sched_domain *sd;
+
+ mutex_lock(&sched_domains_mutex);
+ rcu_read_lock();
+
+ for_each_domain(cpu, sd) {
+ int balance_cpu = group_balance_cpu(sd->groups);
+
+ init_sched_groups_capacity(cpu, sd);
+ /*
+ * Need to ensure this is also called with balancing
+ * cpu.
+ */
+ if (cpu != balance_cpu)
+ init_sched_groups_capacity(balance_cpu, sd);
+ }
+
+ rcu_read_unlock();
+ mutex_unlock(&sched_domains_mutex);
+}
+
+static unsigned int cpu_isolation_vote[NR_CPUS];
+
+/*
+ * Count the CPUs in @mask that are isolated. When @include_offline is
+ * true, CPUs in @mask that are not online are counted as well.
+ */
+int sched_isolate_count(const cpumask_t *mask, bool include_offline)
+{
+	cpumask_t unavailable = CPU_MASK_NONE;
+
+	if (!include_offline) {
+		cpumask_and(&unavailable, mask, cpu_isolated_mask);
+		return cpumask_weight(&unavailable);
+	}
+
+	/* (not online OR isolated) AND mask */
+	cpumask_complement(&unavailable, cpu_online_mask);
+	cpumask_or(&unavailable, &unavailable, cpu_isolated_mask);
+	cpumask_and(&unavailable, &unavailable, mask);
+
+	return cpumask_weight(&unavailable);
+}
+
+/*
+ * 1) CPU is isolated and cpu is offlined:
+ *	Unisolate the core.
+ * 2) CPU is not isolated and CPU is offlined:
+ *	No action taken.
+ * 3) CPU is offline and request to isolate
+ *	Request ignored.
+ * 4) CPU is offline and isolated:
+ *	Not a possible state.
+ * 5) CPU is online and request to isolate
+ *	Normal case: Isolate the CPU
+ * 6) CPU is not isolated and comes back online
+ *	Nothing to do
+ *
+ * Note: The client calling sched_isolate_cpu() is responsible for ONLY
+ * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
+ * Client is also responsible for unisolating when a core goes offline
+ * (after CPU is marked offline).
+ *
+ * Requests are counted per CPU in cpu_isolation_vote[]: only the first
+ * request for a CPU performs the isolation work, later requests just add
+ * a vote.
+ *
+ * Returns 0 on success (including when @cpu already held a vote), or
+ * -EINVAL when @cpu is offline or when only one online, non-isolated CPU
+ * is left.
+ */
+int sched_isolate_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	cpumask_t avail_cpus;
+	int ret_code = 0;
+	/*
+	 * Must be initialised: it is handed to trace_sched_isolate() at
+	 * "out" even when the tracepoint was disabled on entry and
+	 * sched_clock() was therefore never sampled.
+	 */
+	u64 start_time = 0;
+
+	if (trace_sched_isolate_enabled())
+		start_time = sched_clock();
+
+	lock_device_hotplug();
+
+	cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
+
+	/* We cannot isolate ALL cpus in the system */
+	if (cpumask_weight(&avail_cpus) == 1) {
+		ret_code = -EINVAL;
+		goto out;
+	}
+
+	if (!cpu_online(cpu)) {
+		ret_code = -EINVAL;
+		goto out;
+	}
+
+	/* Already isolated by an earlier request: just record the vote. */
+	if (++cpu_isolation_vote[cpu] > 1)
+		goto out;
+
+	set_cpu_isolated(cpu, true);
+	cpumask_clear_cpu(cpu, &avail_cpus);
+
+	/* Migrate timers */
+	smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1);
+	smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1);
+
+	migrate_sync_cpu(cpu, cpumask_first(&avail_cpus));
+	stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
+
+	clear_hmp_request(cpu);
+	calc_load_migrate(rq);
+	update_max_interval();
+	sched_update_group_capacities(cpu);
+
+out:
+	unlock_device_hotplug();
+	trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
+			    start_time, 1);
+	return ret_code;
+}
+
+/*
+ * Note: The client calling sched_isolate_cpu() is responsible for ONLY
+ * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
+ * Client is also responsible for unisolating when a core goes offline
+ * (after CPU is marked offline).
+ *
+ * Drops one isolation vote for @cpu; the CPU is only taken out of
+ * cpu_isolated_mask when the last vote is dropped. The caller is expected
+ * to hold the device hotplug lock already (checked through
+ * lock_device_hotplug_assert()); sched_unisolate_cpu() is the variant that
+ * takes the lock itself.
+ *
+ * Returns 0 on success, or -EINVAL when @cpu has no outstanding vote.
+ */
+int sched_unisolate_cpu_unlocked(int cpu)
+{
+	int ret_code = 0;
+	struct rq *rq = cpu_rq(cpu);
+	/*
+	 * Must be initialised: it is handed to trace_sched_isolate() at
+	 * "out" even when the tracepoint was disabled on entry and
+	 * sched_clock() was therefore never sampled.
+	 */
+	u64 start_time = 0;
+
+	if (trace_sched_isolate_enabled())
+		start_time = sched_clock();
+
+	lock_device_hotplug_assert();
+
+	if (!cpu_isolation_vote[cpu]) {
+		ret_code = -EINVAL;
+		goto out;
+	}
+
+	/* Other votes remain: the CPU stays isolated. */
+	if (--cpu_isolation_vote[cpu])
+		goto out;
+
+	if (cpu_online(cpu)) {
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		rq->age_stamp = sched_clock_cpu(cpu);
+		if (rq->rd) {
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+			set_rq_online(rq);
+		}
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	set_cpu_isolated(cpu, false);
+	update_max_interval();
+	sched_update_group_capacities(cpu);
+
+	if (cpu_online(cpu)) {
+		stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0);
+
+		/* Kick CPU to immediately do load balancing */
+		if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+			smp_send_reschedule(cpu);
+	}
+
+out:
+	trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
+			    start_time, 0);
+	return ret_code;
+}
+
+/*
+ * Locked wrapper: takes the device hotplug lock around
+ * sched_unisolate_cpu_unlocked() and passes its return value through.
+ */
+int sched_unisolate_cpu(int cpu)
+{
+	int err;
+
+	lock_device_hotplug();
+	err = sched_unisolate_cpu_unlocked(cpu);
+	unlock_device_hotplug();
+
+	return err;
+}
+
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5748,13 +6004,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
- migrate_sync_cpu(cpu);
+ migrate_sync_cpu(cpu, smp_processor_id());
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- migrate_tasks(rq);
+ migrate_tasks(rq, true);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
break;
@@ -6509,11 +6765,14 @@ build_sched_groups(struct sched_domain *sd, int cpu)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
+ cpumask_t avail_mask;
WARN_ON(!sg);
do {
- sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ cpumask_andnot(&avail_mask, sched_group_cpus(sg),
+ cpu_isolated_mask);
+ sg->group_weight = cpumask_weight(&avail_mask);
sg = sg->next;
} while (sg != sd->groups);
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
new file mode 100644
index 000000000000..d81886da7ca2
--- /dev/null
+++ b/kernel/sched/core_ctl.c
@@ -0,0 +1,1055 @@
+/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+
+#include <trace/events/sched.h>
+
+/*
+ * Static sizing limits. cluster_init() rejects a cluster with more than
+ * MAX_CPUS_PER_CLUSTER CPUs and refuses to create more than MAX_CLUSTERS
+ * clusters. The per-cluster threshold arrays and the four-value sscanf()
+ * formats in the sysfs store handlers are sized for MAX_CPUS_PER_CLUSTER.
+ */
+#define MAX_CPUS_PER_CLUSTER 4
+#define MAX_CLUSTERS 2
+
+/*
+ * Per-cluster core-control state. cluster_init() fills in one entry of
+ * cluster_state[] for each CPU mask it is handed.
+ */
+struct cluster_data {
+ /* Set by cluster_init() once the control thread has been started. */
+ bool inited;
+ /* Lower and upper bounds applied to the computed need by apply_limits(). */
+ unsigned int min_cpus;
+ unsigned int max_cpus;
+ /* Minimum ms since last_isolate_ts before eval_need() asks for fewer CPUs. */
+ unsigned int offline_delay_ms;
+ /* Busy thresholds; eval_need() indexes them by (active CPU count - 1). */
+ unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER];
+ unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER];
+ /* Cached result of get_active_cpu_count(). */
+ unsigned int active_cpus;
+ /* cpumask_weight() of cpu_mask. */
+ unsigned int num_cpus;
+ cpumask_t cpu_mask;
+ /* Most recent clamped need stored by eval_need(). */
+ unsigned int need_cpus;
+ /* apply_task_need() requests every CPU once nrrun reaches this value. */
+ unsigned int task_thres;
+ /* ktime (ms) of the most recent successful isolation in try_to_isolate(). */
+ s64 last_isolate_ts;
+ /* This cluster's cpu_data entries, linked through cpu_data.sib. */
+ struct list_head lru;
+ /* Wake-up request for core_ctl_thread; protected by pending_lock. */
+ bool pending;
+ spinlock_t pending_lock;
+ /* Selects big_avg instead of avg in update_running_avg(). */
+ bool is_big_cluster;
+ /* Rounded running-task average sampled by update_running_avg(). */
+ int nrrun;
+ /* Set by update_running_avg(false); cleared by core_ctl_set_busy(). */
+ bool nrrun_changed;
+ /* Kthread running try_core_ctl() for this cluster. */
+ struct task_struct *core_ctl_thread;
+ /* cpumask_first() of cpu_mask; used to identify the cluster. */
+ unsigned int first_cpu;
+ /* While set, eval_need() requests max_cpus. */
+ bool boost;
+ /* sysfs object added as "core_ctl" under the first CPU's device. */
+ struct kobject kobj;
+};
+
+/* Per-CPU core-control state; one instance per CPU in the cpu_state per-cpu variable. */
+struct cpu_data {
+ /* Seeded from cpu_online() in cluster_init(), then tracked by cpu_callback(). */
+ bool online;
+ /* Hysteresis flag maintained by eval_need() from busy and the thresholds. */
+ bool is_busy;
+ /* Last load value stored by core_ctl_set_busy(); reset to 0 when the CPU dies. */
+ unsigned int busy;
+ unsigned int cpu;
+ /* When set, the CPU is skipped on the first pass of try_to_unisolate(). */
+ bool not_preferred;
+ /* Owning cluster; NULL until cluster_init() has covered this CPU. */
+ struct cluster_data *cluster;
+ /* Link in cluster->lru. */
+ struct list_head sib;
+ /* Set when try_to_isolate() isolated this CPU; cleared when it is unisolated. */
+ bool isolated_by_us;
+};
+
+/* Per-CPU state, the cluster table, and the number of table slots handed out. */
+static DEFINE_PER_CPU(struct cpu_data, cpu_state);
+static struct cluster_data cluster_state[MAX_CLUSTERS];
+static unsigned int num_clusters;
+
+/*
+ * Walk cluster_state[idx .. num_clusters-1]. The macro advances @idx itself,
+ * so @idx must be a modifiable lvalue; callers in this file initialise it to
+ * 0 before the loop. Slots whose ->inited is false are still visited.
+ */
+#define for_each_cluster(cluster, idx) \
+ for ((cluster) = &cluster_state[idx]; (idx) < num_clusters;\
+ (idx)++, (cluster) = &cluster_state[idx])
+
+/* Taken around LRU list updates, need evaluation and the sampling timestamps. */
+static DEFINE_SPINLOCK(state_lock);
+static void apply_need(struct cluster_data *state);
+static void wake_up_core_ctl_thread(struct cluster_data *state);
+/* Set at the end of core_ctl_init(); core_ctl_check() is a no-op until then. */
+static bool initialized;
+
+static unsigned int get_active_cpu_count(const struct cluster_data *cluster);
+
+/* ========================= sysfs interface =========================== */
+
+/*
+ * sysfs store for "min_cpus": parse one unsigned value, cap it at the
+ * current max_cpus and wake the control thread.
+ * Returns @count on success, -EINVAL if no number could be parsed.
+ */
+static ssize_t store_min_cpus(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int requested;
+
+	if (sscanf(buf, "%u\n", &requested) != 1)
+		return -EINVAL;
+
+	state->min_cpus = min(requested, state->max_cpus);
+	wake_up_core_ctl_thread(state);
+
+	return count;
+}
+
+/* sysfs show for "min_cpus": print the current lower bound. */
+static ssize_t show_min_cpus(const struct cluster_data *state, char *buf)
+{
+	const unsigned int value = state->min_cpus;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", value);
+}
+
+/*
+ * sysfs store for "max_cpus": parse one unsigned value, cap it at the
+ * cluster size, pull min_cpus down if it would exceed the new ceiling and
+ * wake the control thread.
+ * Returns @count on success, -EINVAL if no number could be parsed.
+ */
+static ssize_t store_max_cpus(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int requested;
+
+	if (sscanf(buf, "%u\n", &requested) != 1)
+		return -EINVAL;
+
+	/* The ceiling can never exceed the number of CPUs in the cluster. */
+	state->max_cpus = min(requested, state->num_cpus);
+	/* Keep the floor at or below the ceiling. */
+	state->min_cpus = min(state->min_cpus, state->max_cpus);
+	wake_up_core_ctl_thread(state);
+
+	return count;
+}
+
+/* sysfs show for "max_cpus": print the current upper bound. */
+static ssize_t show_max_cpus(const struct cluster_data *state, char *buf)
+{
+	const unsigned int value = state->max_cpus;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", value);
+}
+
+/*
+ * sysfs store for "offline_delay_ms": record the new delay and re-evaluate
+ * the cluster's need.
+ * Returns @count on success, -EINVAL if no number could be parsed.
+ */
+static ssize_t store_offline_delay_ms(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int delay_ms;
+
+	if (sscanf(buf, "%u\n", &delay_ms) != 1)
+		return -EINVAL;
+
+	state->offline_delay_ms = delay_ms;
+	apply_need(state);
+
+	return count;
+}
+
+/* sysfs show for "task_thres": print the running-task threshold. */
+static ssize_t show_task_thres(const struct cluster_data *state, char *buf)
+{
+	const unsigned int value = state->task_thres;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", value);
+}
+
+/*
+ * sysfs store for "task_thres": accept a threshold that is at least the
+ * cluster size, then re-evaluate the cluster's need.
+ * Returns @count on success, -EINVAL on a parse failure or a value below
+ * num_cpus.
+ */
+static ssize_t store_task_thres(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int threshold;
+
+	if (sscanf(buf, "%u\n", &threshold) != 1)
+		return -EINVAL;
+
+	if (threshold < state->num_cpus)
+		return -EINVAL;
+
+	state->task_thres = threshold;
+	apply_need(state);
+
+	return count;
+}
+
+/* sysfs show for "offline_delay_ms": print the configured delay. */
+static ssize_t show_offline_delay_ms(const struct cluster_data *state,
+					char *buf)
+{
+	const unsigned int value = state->offline_delay_ms;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", value);
+}
+
+/*
+ * sysfs store for "busy_up_thres". Accepts either a single value, applied
+ * to every slot, or exactly num_cpus values, applied slot by slot; any other
+ * count is rejected with -EINVAL. Re-evaluates the need afterwards.
+ */
+static ssize_t store_busy_up_thres(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int parsed[MAX_CPUS_PER_CLUSTER];
+	int nr, slot;
+
+	nr = sscanf(buf, "%u %u %u %u\n",
+			&parsed[0], &parsed[1], &parsed[2], &parsed[3]);
+	if (nr != 1 && nr != state->num_cpus)
+		return -EINVAL;
+
+	for (slot = 0; slot < state->num_cpus; slot++)
+		state->busy_up_thres[slot] = (nr == 1) ? parsed[0]
+							: parsed[slot];
+
+	apply_need(state);
+	return count;
+}
+
+/* sysfs show for "busy_up_thres": one space-terminated value per CPU slot, then a newline. */
+static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf)
+{
+	int slot;
+	int len = 0;
+
+	for (slot = 0; slot < state->num_cpus; slot++) {
+		len += snprintf(buf + len, PAGE_SIZE - len, "%u ",
+				state->busy_up_thres[slot]);
+	}
+	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
+
+	return len;
+}
+
+/*
+ * sysfs store for "busy_down_thres". Accepts either a single value, applied
+ * to every slot, or exactly num_cpus values, applied slot by slot; any other
+ * count is rejected with -EINVAL. Re-evaluates the need afterwards.
+ */
+static ssize_t store_busy_down_thres(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int parsed[MAX_CPUS_PER_CLUSTER];
+	int nr, slot;
+
+	nr = sscanf(buf, "%u %u %u %u\n",
+			&parsed[0], &parsed[1], &parsed[2], &parsed[3]);
+	if (nr != 1 && nr != state->num_cpus)
+		return -EINVAL;
+
+	for (slot = 0; slot < state->num_cpus; slot++)
+		state->busy_down_thres[slot] = (nr == 1) ? parsed[0]
+							  : parsed[slot];
+
+	apply_need(state);
+	return count;
+}
+
+/* sysfs show for "busy_down_thres": one space-terminated value per CPU slot, then a newline. */
+static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf)
+{
+	int slot;
+	int len = 0;
+
+	for (slot = 0; slot < state->num_cpus; slot++) {
+		len += snprintf(buf + len, PAGE_SIZE - len, "%u ",
+				state->busy_down_thres[slot]);
+	}
+	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
+
+	return len;
+}
+
+/*
+ * sysfs store for "is_big_cluster": any non-zero value marks the cluster as
+ * big, zero clears the flag.
+ * Returns @count on success, -EINVAL if no number could be parsed.
+ */
+static ssize_t store_is_big_cluster(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int flag;
+
+	if (sscanf(buf, "%u\n", &flag) != 1)
+		return -EINVAL;
+
+	state->is_big_cluster = (flag != 0);
+	return count;
+}
+
+/* sysfs show for "is_big_cluster": prints the flag as 0 or 1. */
+static ssize_t show_is_big_cluster(const struct cluster_data *state, char *buf)
+{
+	const bool big = state->is_big_cluster;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", big);
+}
+
+/*
+ * sysfs show for "cpus": one line per CPU of the cluster, in LRU-list order,
+ * with the online state tracked in cpu_data. The list is walked under
+ * state_lock.
+ */
+static ssize_t show_cpus(const struct cluster_data *state, char *buf)
+{
+	struct cpu_data *entry;
+	unsigned long irq_flags;
+	ssize_t len = 0;
+
+	spin_lock_irqsave(&state_lock, irq_flags);
+	list_for_each_entry(entry, &state->lru, sib) {
+		const char *status = entry->online ? "Online" : "Offline";
+
+		len += snprintf(buf + len, PAGE_SIZE - len,
+				"CPU%u (%s)\n", entry->cpu, status);
+	}
+	spin_unlock_irqrestore(&state_lock, irq_flags);
+
+	return len;
+}
+
+/* sysfs show for "need_cpus": print the most recently stored need. */
+static ssize_t show_need_cpus(const struct cluster_data *state, char *buf)
+{
+	const unsigned int value = state->need_cpus;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", value);
+}
+
+/* sysfs show for "active_cpus": print the cached active-CPU count. */
+static ssize_t show_active_cpus(const struct cluster_data *state, char *buf)
+{
+	const unsigned int value = state->active_cpus;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", value);
+}
+
+/*
+ * sysfs show for "global_state": dump the per-CPU and per-cluster state of
+ * every possible CPU that belongs to an initialised cluster. @state is not
+ * consulted; the dump always covers all clusters.
+ */
+static ssize_t show_global_state(const struct cluster_data *state, char *buf)
+{
+	ssize_t len = 0;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		const struct cpu_data *c = &per_cpu(cpu_state, cpu);
+		const struct cluster_data *cluster = c->cluster;
+
+		/* Skip CPUs no cluster has claimed or whose cluster is not ready. */
+		if (!cluster || !cluster->inited)
+			continue;
+
+		len += snprintf(buf + len, PAGE_SIZE - len,
+				"CPU%u\n"
+				"\tCPU: %u\n"
+				"\tOnline: %u\n"
+				"\tActive: %u\n"
+				"\tFirst CPU: %u\n"
+				"\tBusy%%: %u\n"
+				"\tIs busy: %u\n"
+				"\tNr running: %u\n"
+				"\tActive CPUs: %u\n"
+				"\tNeed CPUs: %u\n"
+				"\tBoost: %u\n",
+				cpu,
+				c->cpu,
+				c->online,
+				!cpu_isolated(c->cpu),
+				cluster->first_cpu,
+				c->busy,
+				c->is_busy,
+				cluster->nrrun,
+				get_active_cpu_count(cluster),
+				cluster->need_cpus,
+				(unsigned int) cluster->boost);
+	}
+
+	return len;
+}
+
+/*
+ * sysfs store for "not_preferred".
+ *
+ * Accepts either a single value, applied to every CPU of the cluster, or
+ * exactly num_cpus values. Values are assigned to CPUs in the current order
+ * of the cluster's LRU list (the order show_not_preferred() prints), not in
+ * CPU-number order. A non-zero value marks the CPU as not preferred.
+ *
+ * Previously a single-value write on a multi-CPU cluster passed validation
+ * but the loop still consumed val[1..], which sscanf() had never written;
+ * the single value is now broadcast, matching the busy threshold stores.
+ *
+ * Returns @count on success, -EINVAL if the number of parsed values is
+ * neither 1 nor num_cpus.
+ */
+static ssize_t store_not_preferred(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	struct cpu_data *c;
+	unsigned int i = 0;
+	unsigned int val[MAX_CPUS_PER_CLUSTER];
+	unsigned long flags;
+	int ret;
+
+	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry(c, &state->lru, sib) {
+		/* Never index past the parse buffer, whatever the list holds. */
+		if (i >= MAX_CPUS_PER_CLUSTER)
+			break;
+		c->not_preferred = (ret == 1) ? val[0] : val[i];
+		i++;
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return count;
+}
+
+/*
+ * sysfs show for "not_preferred": one line per CPU, in LRU-list order, with
+ * the CPU number and its not_preferred flag. Walks the list under state_lock.
+ */
+static ssize_t show_not_preferred(const struct cluster_data *state, char *buf)
+{
+	struct cpu_data *entry;
+	unsigned long irq_flags;
+	ssize_t len = 0;
+
+	spin_lock_irqsave(&state_lock, irq_flags);
+	list_for_each_entry(entry, &state->lru, sib) {
+		len += snprintf(buf + len, PAGE_SIZE - len,
+				"\tCPU:%d %u\n", entry->cpu,
+				entry->not_preferred);
+	}
+	spin_unlock_irqrestore(&state_lock, irq_flags);
+
+	return len;
+}
+
+
+/*
+ * A sysfs attribute bound to a cluster. show()/store() below recover the
+ * cluster from the kobject and forward to these typed callbacks; a NULL
+ * callback makes the corresponding operation return -EIO.
+ */
+struct core_ctl_attr {
+ struct attribute attr;
+ ssize_t (*show)(const struct cluster_data *, char *);
+ ssize_t (*store)(struct cluster_data *, const char *, size_t count);
+};
+
+/* Declare a read-only (0444) attribute backed by show_<name>(). */
+#define core_ctl_attr_ro(_name) \
+static struct core_ctl_attr _name = \
+__ATTR(_name, 0444, show_##_name, NULL)
+
+/* Declare a read-write (0644) attribute backed by show_<name>() and store_<name>(). */
+#define core_ctl_attr_rw(_name) \
+static struct core_ctl_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+core_ctl_attr_rw(min_cpus);
+core_ctl_attr_rw(max_cpus);
+core_ctl_attr_rw(offline_delay_ms);
+core_ctl_attr_rw(busy_up_thres);
+core_ctl_attr_rw(busy_down_thres);
+core_ctl_attr_rw(task_thres);
+core_ctl_attr_rw(is_big_cluster);
+core_ctl_attr_ro(cpus);
+core_ctl_attr_ro(need_cpus);
+core_ctl_attr_ro(active_cpus);
+core_ctl_attr_ro(global_state);
+core_ctl_attr_rw(not_preferred);
+
+/* NULL-terminated attribute table installed through ktype_core_ctl. */
+static struct attribute *default_attrs[] = {
+ &min_cpus.attr,
+ &max_cpus.attr,
+ &offline_delay_ms.attr,
+ &busy_up_thres.attr,
+ &busy_down_thres.attr,
+ &task_thres.attr,
+ &is_big_cluster.attr,
+ &cpus.attr,
+ &need_cpus.attr,
+ &active_cpus.attr,
+ &global_state.attr,
+ &not_preferred.attr,
+ NULL
+};
+
+/* Map an embedded kobject / attribute back to its containing structure. */
+#define to_cluster_data(k) container_of(k, struct cluster_data, kobj)
+#define to_attr(a) container_of(a, struct core_ctl_attr, attr)
+/*
+ * Generic sysfs show entry point: resolve the cluster and the typed
+ * attribute, then delegate. Returns -EIO when the attribute has no show
+ * callback.
+ */
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct core_ctl_attr *cattr = to_attr(attr);
+	struct cluster_data *data = to_cluster_data(kobj);
+
+	if (!cattr->show)
+		return -EIO;
+
+	return cattr->show(data, buf);
+}
+
+/*
+ * Generic sysfs store entry point: resolve the cluster and the typed
+ * attribute, then delegate. Returns -EIO when the attribute has no store
+ * callback.
+ */
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+			const char *buf, size_t count)
+{
+	struct core_ctl_attr *cattr = to_attr(attr);
+	struct cluster_data *data = to_cluster_data(kobj);
+
+	if (!cattr->store)
+		return -EIO;
+
+	return cattr->store(data, buf, count);
+}
+
+/* Route every attribute access through the generic show()/store() above. */
+static const struct sysfs_ops sysfs_ops = {
+ .show = show,
+ .store = store,
+};
+
+/*
+ * kobject type used by cluster_init() for each cluster's "core_ctl" object.
+ * No .release callback is provided.
+ */
+static struct kobj_type ktype_core_ctl = {
+ .sysfs_ops = &sysfs_ops,
+ .default_attrs = default_attrs,
+};
+
+/* ==================== runqueue based core count =================== */
+
+/*
+ * Sampling parameters for the run-queue average.
+ * RQ_AVG_TOLERANCE is subtracted from the period (ms) both in the rate limit
+ * of update_running_avg() and when core_ctl_init() derives the check
+ * interval. NR_RUNNING_TOLERANCE is the rounding bias added before the
+ * averages are divided by 100.
+ */
+#define RQ_AVG_TOLERANCE 2
+#define RQ_AVG_DEFAULT_MS 20
+#define NR_RUNNING_TOLERANCE 5
+static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS;
+
+/* Time (ms) of the last accepted sample; updated under state_lock. */
+static s64 rq_avg_timestamp_ms;
+
+/*
+ * Refresh every initialised cluster's nrrun from sched_get_nr_running_avg().
+ *
+ * Rate limited: returns without sampling if fewer than
+ * (rq_avg_period_ms - RQ_AVG_TOLERANCE) ms have passed since the last
+ * accepted sample. The timestamp check and the sample are done under
+ * state_lock; the per-cluster update below runs after the lock is dropped.
+ *
+ * @trigger_update: when true, a cluster whose nrrun changed is re-evaluated
+ * immediately through apply_need(); when false only nrrun_changed is set.
+ *
+ * iowait_avg is fetched but not used here.
+ */
+static void update_running_avg(bool trigger_update)
+{
+ int avg, iowait_avg, big_avg, old_nrrun;
+ s64 now;
+ unsigned long flags;
+ struct cluster_data *cluster;
+ unsigned int index = 0;
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ now = ktime_to_ms(ktime_get());
+ if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) {
+ spin_unlock_irqrestore(&state_lock, flags);
+ return;
+ }
+ rq_avg_timestamp_ms = now;
+ sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg);
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ /*
+ * Round up to the next integer if the average nr running tasks
+ * is within NR_RUNNING_TOLERANCE/100 of the next integer.
+ * If normal rounding up is used, it will allow a transient task
+ * to trigger online event. By the time core is onlined, the task
+ * has finished.
+ * Rounding to closest suffers same problem because scheduler
+ * might only provide running stats per jiffy, and a transient
+ * task could skew the number for one jiffy. If core control
+ * samples every 2 jiffies, it will observe 0.5 additional running
+ * average which rounds up to 1 task.
+ */
+ avg = (avg + NR_RUNNING_TOLERANCE) / 100;
+ big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100;
+
+ for_each_cluster(cluster, index) {
+ if (!cluster->inited)
+ continue;
+ old_nrrun = cluster->nrrun;
+ /*
+ * Big cluster only need to take care of big tasks, but if
+ * there are not enough big cores, big tasks need to be run
+ * on little as well. Thus for little's runqueue stat, it
+ * has to use overall runqueue average, or derive what big
+ * tasks would have to be run on little. The latter approach
+ * is not easy to get given core control reacts much slower
+ * than scheduler, and can't predict scheduler's behavior.
+ */
+ cluster->nrrun = cluster->is_big_cluster ? big_avg : avg;
+ if (cluster->nrrun != old_nrrun) {
+ if (trigger_update)
+ apply_need(cluster);
+ else
+ cluster->nrrun_changed = true;
+ }
+ }
+ return;
+}
+
+/*
+ * Adjust a load-derived CPU count using the run-queue average:
+ *  - nrrun at or above task_thres: ask for every CPU in the cluster;
+ *  - nrrun above the proposed count: ask for one more CPU;
+ *  - otherwise keep the proposed count.
+ */
+static unsigned int apply_task_need(const struct cluster_data *cluster,
+					unsigned int new_need)
+{
+	unsigned int wanted = new_need;
+
+	if (cluster->nrrun >= cluster->task_thres)
+		wanted = cluster->num_cpus;
+	else if (cluster->nrrun > new_need)
+		wanted = new_need + 1;
+
+	return wanted;
+}
+
+/* ======================= load based core count ====================== */
+
+/* Raise @need_cpus to at least min_cpus, then cap the result at max_cpus. */
+static unsigned int apply_limits(const struct cluster_data *cluster,
+					unsigned int need_cpus)
+{
+	unsigned int floored = max(cluster->min_cpus, need_cpus);
+
+	return min(floored, cluster->max_cpus);
+}
+
+/*
+ * Cluster size minus the value sched_isolate_count() reports for the
+ * cluster's mask when called with its second argument set to true.
+ */
+static unsigned int get_active_cpu_count(const struct cluster_data *cluster)
+{
+	unsigned int inactive = sched_isolate_count(&cluster->cpu_mask, true);
+
+	return cluster->num_cpus - inactive;
+}
+
+/* A CPU is active when it is tracked as online and is not isolated. */
+static bool is_active(const struct cpu_data *state)
+{
+	if (!state->online)
+		return false;
+
+	return !cpu_isolated(state->cpu);
+}
+
+/*
+ * Can the cluster move from active_cpus towards @need?
+ * Shrinking is always possible. Growing is possible only when
+ * sched_isolate_count() (second argument false) reports a non-zero count for
+ * the cluster's mask. Equal counts need no adjustment.
+ */
+static bool adjustment_possible(const struct cluster_data *cluster,
+					unsigned int need)
+{
+	if (need < cluster->active_cpus)
+		return true;
+
+	if (need > cluster->active_cpus)
+		return sched_isolate_count(&cluster->cpu_mask, false) != 0;
+
+	return false;
+}
+
+/*
+ * Recompute how many CPUs @cluster needs and decide whether the control
+ * thread should be woken.
+ *
+ * Under state_lock:
+ *  - with boost set the raw need is max_cpus; otherwise each CPU's is_busy
+ *    flag is updated with hysteresis against the busy thresholds selected by
+ *    the current active count, the busy CPUs are counted, and the count is
+ *    adjusted by apply_task_need();
+ *  - the raw need is clamped by apply_limits() and stored in need_cpus.
+ *
+ * The wake decision is taken on the clamped need, the same value that
+ * adjustment_possible() validated and that do_core_ctl() will enforce.
+ * Deciding on the unclamped value could skip the offline delay when shrinking
+ * towards max_cpus, apply it when growing towards min_cpus, or report "no
+ * change" when the raw need happened to equal active_cpus while the clamped
+ * need did not.
+ *
+ * Returns true when an adjustment is possible and either more CPUs are
+ * needed, or fewer are needed and at least offline_delay_ms has elapsed
+ * since the last isolation. Returns false for an uninitialised cluster.
+ */
+static bool eval_need(struct cluster_data *cluster)
+{
+	unsigned long flags;
+	struct cpu_data *c;
+	unsigned int need_cpus = 0, last_need, thres_idx;
+	bool ret = false;
+	unsigned int active_cpus;
+	unsigned int new_need;
+
+	if (unlikely(!cluster->inited))
+		return false;
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	if (cluster->boost) {
+		need_cpus = cluster->max_cpus;
+	} else {
+		active_cpus = get_active_cpu_count(cluster);
+		/* Thresholds are indexed by active count - 1; 0 active uses slot 0. */
+		thres_idx = active_cpus ? active_cpus - 1 : 0;
+		list_for_each_entry(c, &cluster->lru, sib) {
+			/* Between the two thresholds is_busy keeps its old value. */
+			if (c->busy >= cluster->busy_up_thres[thres_idx])
+				c->is_busy = true;
+			else if (c->busy < cluster->busy_down_thres[thres_idx])
+				c->is_busy = false;
+			need_cpus += c->is_busy;
+		}
+		need_cpus = apply_task_need(cluster, need_cpus);
+	}
+	new_need = apply_limits(cluster, need_cpus);
+
+	last_need = cluster->need_cpus;
+	cluster->need_cpus = new_need;
+
+	if (!adjustment_possible(cluster, new_need)) {
+		spin_unlock_irqrestore(&state_lock, flags);
+		return false;
+	}
+
+	if (new_need > cluster->active_cpus) {
+		/* Bringing CPUs back is never delayed. */
+		ret = true;
+	} else if (new_need < cluster->active_cpus) {
+		s64 now = ktime_to_ms(ktime_get());
+		s64 elapsed = now - cluster->last_isolate_ts;
+
+		ret = elapsed >= cluster->offline_delay_ms;
+	}
+
+	trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need,
+				 ret);
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return ret;
+}
+
+/* Re-evaluate @cluster and wake its control thread if eval_need() asks for it. */
+static void apply_need(struct cluster_data *cluster)
+{
+	if (!eval_need(cluster))
+		return;
+
+	wake_up_core_ctl_thread(cluster);
+}
+
+/*
+ * Record a new load value for @cpu and re-evaluate its cluster.
+ *
+ * Called from cpufreq_gov_cb() on CPUFREQ_LOAD_CHANGE. Does nothing for a
+ * CPU without an initialised cluster. The run-queue average is refreshed
+ * first (without triggering an evaluation); the cluster is then re-evaluated
+ * only if either the load value or the cluster's nrrun changed.
+ *
+ * Always returns 0.
+ */
+static int core_ctl_set_busy(unsigned int cpu, unsigned int busy)
+{
+ struct cpu_data *c = &per_cpu(cpu_state, cpu);
+ struct cluster_data *cluster = c->cluster;
+ /* Snapshot taken before apply_need() can flip is_busy; used for tracing. */
+ unsigned int old_is_busy = c->is_busy;
+
+ if (!cluster || !cluster->inited)
+ return 0;
+
+ update_running_avg(false);
+ if (c->busy == busy && !cluster->nrrun_changed)
+ return 0;
+ c->busy = busy;
+ cluster->nrrun_changed = false;
+
+ apply_need(cluster);
+ trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy);
+ return 0;
+}
+
+/* ========================= core count enforcement ==================== */
+
+/*
+ * Flag work for the cluster's control thread and wake it. The pending flag
+ * is set under pending_lock; the wake-up itself is issued after the lock has
+ * been released.
+ */
+static void wake_up_core_ctl_thread(struct cluster_data *cluster)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&cluster->pending_lock, irq_flags);
+	cluster->pending = true;
+	spin_unlock_irqrestore(&cluster->pending_lock, irq_flags);
+
+	wake_up_process_no_notif(cluster->core_ctl_thread);
+}
+
+/* Wallclock of the last accepted check and the minimum spacing between checks. */
+static u64 core_ctl_check_timestamp;
+static u64 core_ctl_check_interval;
+
+/*
+ * Rate limiter for core_ctl_check(). Returns true, and records @wallclock
+ * as the new reference point, when at least core_ctl_check_interval has
+ * elapsed since the previous accepted check. Serialised by state_lock.
+ */
+static bool do_check(u64 wallclock)
+{
+	unsigned long irq_flags;
+	bool due;
+
+	spin_lock_irqsave(&state_lock, irq_flags);
+	due = (wallclock - core_ctl_check_timestamp) >= core_ctl_check_interval;
+	if (due)
+		core_ctl_check_timestamp = wallclock;
+	spin_unlock_irqrestore(&state_lock, irq_flags);
+
+	return due;
+}
+
+/*
+ * Set or clear boost on every big cluster whose boost state differs from
+ * @boost, re-evaluating each cluster that changed.
+ */
+void core_ctl_set_boost(bool boost)
+{
+	struct cluster_data *cluster;
+	unsigned int index = 0;
+
+	for_each_cluster(cluster, index) {
+		if (!cluster->is_big_cluster || cluster->boost == boost)
+			continue;
+
+		cluster->boost = boost;
+		apply_need(cluster);
+	}
+}
+
+/*
+ * Periodic entry point. Once core control is initialised and the check
+ * interval has elapsed, refresh the run-queue average and re-evaluate every
+ * cluster, waking the control thread of each one that needs adjusting.
+ */
+void core_ctl_check(u64 wallclock)
+{
+	struct cluster_data *cluster;
+	unsigned int index = 0;
+
+	if (unlikely(!initialized))
+		return;
+
+	if (!do_check(wallclock))
+		return;
+
+	update_running_avg(true);
+
+	for_each_cluster(cluster, index)
+		apply_need(cluster);
+}
+
+/*
+ * Move @cpu_data to the tail of its cluster's LRU list, under state_lock.
+ * list_move_tail() performs the unlink and re-link in one call instead of
+ * an open-coded list_del() followed by list_add_tail().
+ */
+static void move_cpu_lru(struct cpu_data *cpu_data)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_move_tail(&cpu_data->sib, &cpu_data->cluster->lru);
+	spin_unlock_irqrestore(&state_lock, flags);
+}
+
+/*
+ * Isolate CPUs of @cluster until active_cpus equals @need.
+ *
+ * Pass 1 walks the LRU list from the head and isolates active CPUs that are
+ * not marked is_busy. If active_cpus still exceeds max_cpus afterwards,
+ * pass 2 isolates active CPUs regardless of is_busy until active_cpus is at
+ * or below max_cpus.
+ *
+ * Every successfully isolated CPU is flagged isolated_by_us, moved to the
+ * LRU tail, and refreshes last_isolate_ts. active_cpus is recomputed after
+ * each attempt, successful or not.
+ *
+ * NOTE(review): the list is walked without state_lock while move_cpu_lru()
+ * relinks the current entry; presumably that is why the _safe iterator is
+ * used - confirm no other context can modify the list concurrently.
+ */
+static void try_to_isolate(struct cluster_data *cluster, unsigned int need)
+{
+ struct cpu_data *c, *tmp;
+
+ list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+ if (!is_active(c))
+ continue;
+ if (cluster->active_cpus == need)
+ break;
+ /* Don't offline busy CPUs. */
+ if (c->is_busy)
+ continue;
+
+ pr_debug("Trying to isolate CPU%u\n", c->cpu);
+ if (!sched_isolate_cpu(c->cpu)) {
+ c->isolated_by_us = true;
+ move_cpu_lru(c);
+ cluster->last_isolate_ts = ktime_to_ms(ktime_get());
+ } else {
+ pr_debug("Unable to isolate CPU%u\n", c->cpu);
+ }
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ }
+
+ /*
+ * If the number of active CPUs is within the limits, then
+ * don't force isolation of any busy CPUs.
+ */
+ if (cluster->active_cpus <= cluster->max_cpus)
+ return;
+
+ list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+ if (!is_active(c))
+ continue;
+ if (cluster->active_cpus <= cluster->max_cpus)
+ break;
+
+ pr_debug("Trying to isolate CPU%u\n", c->cpu);
+ if (!sched_isolate_cpu(c->cpu)) {
+ c->isolated_by_us = true;
+ move_cpu_lru(c);
+ cluster->last_isolate_ts = ktime_to_ms(ktime_get());
+ } else {
+ pr_debug("Unable to isolate CPU%u\n", c->cpu);
+ }
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ }
+}
+
+/*
+ * Unisolate CPUs of @cluster until active_cpus equals @need.
+ *
+ * Only CPUs flagged isolated_by_us are considered. CPUs that are already
+ * online and not isolated are skipped, and so are not_preferred CPUs unless
+ * @force is true. A successfully unisolated CPU has isolated_by_us cleared
+ * and is moved to the LRU tail. active_cpus is recomputed after each
+ * attempt, successful or not.
+ */
+static void __try_to_unisolate(struct cluster_data *cluster,
+ unsigned int need, bool force)
+{
+ struct cpu_data *c, *tmp;
+
+ list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+ if (!c->isolated_by_us)
+ continue;
+ if ((c->online && !cpu_isolated(c->cpu)) ||
+ (!force && c->not_preferred))
+ continue;
+ if (cluster->active_cpus == need)
+ break;
+
+ pr_debug("Trying to unisolate CPU%u\n", c->cpu);
+ if (!sched_unisolate_cpu(c->cpu)) {
+ c->isolated_by_us = false;
+ move_cpu_lru(c);
+ } else {
+ pr_debug("Unable to unisolate CPU%u\n", c->cpu);
+ }
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ }
+}
+
+/*
+ * Bring CPUs back until active_cpus equals @need: first using only CPUs
+ * that are not marked not_preferred, then, if the target was not reached,
+ * allowing not_preferred CPUs as well.
+ */
+static void try_to_unisolate(struct cluster_data *cluster, unsigned int need)
+{
+	__try_to_unisolate(cluster, need, false);
+
+	if (cluster->active_cpus != need)
+		__try_to_unisolate(cluster, need, true);
+}
+
+/*
+ * Enforce the cluster's stored need: clamp it, and if an adjustment is
+ * possible isolate or unisolate CPUs to move active_cpus towards it.
+ */
+static void __ref do_core_ctl(struct cluster_data *cluster)
+{
+	unsigned int target = apply_limits(cluster, cluster->need_cpus);
+
+	if (!adjustment_possible(cluster, target))
+		return;
+
+	pr_debug("Trying to adjust group %u from %u to %u\n",
+		 cluster->first_cpu, cluster->active_cpus, target);
+
+	if (cluster->active_cpus > target)
+		try_to_isolate(cluster, target);
+	else if (cluster->active_cpus < target)
+		try_to_unisolate(cluster, target);
+}
+
+/*
+ * Body of the per-cluster control kthread started by cluster_init().
+ *
+ * Each iteration marks the task TASK_INTERRUPTIBLE, then checks the pending
+ * flag under pending_lock. If nothing is pending it drops the lock and
+ * sleeps in schedule(); after waking it exits the loop when
+ * kthread_should_stop() is true, otherwise it retakes the lock. With the
+ * lock held it returns to TASK_RUNNING, clears pending, releases the lock
+ * and runs do_core_ctl().
+ *
+ * Returns 0 when the loop exits.
+ */
+static int __ref try_core_ctl(void *data)
+{
+ struct cluster_data *cluster = data;
+ unsigned long flags;
+
+ while (1) {
+ /* State is set before testing pending, ahead of the schedule() below. */
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock_irqsave(&cluster->pending_lock, flags);
+ if (!cluster->pending) {
+ spin_unlock_irqrestore(&cluster->pending_lock, flags);
+ schedule();
+ if (kthread_should_stop())
+ break;
+ spin_lock_irqsave(&cluster->pending_lock, flags);
+ }
+ set_current_state(TASK_RUNNING);
+ cluster->pending = false;
+ spin_unlock_irqrestore(&cluster->pending_lock, flags);
+
+ do_core_ctl(cluster);
+ }
+
+ return 0;
+}
+
+/*
+ * CPU hotplug notifier: keeps cpu_data.online, the cached active count and
+ * the LRU order in step with hotplug events.
+ *
+ * Events carrying CPU_TASKS_FROZEN and events for CPUs without an
+ * initialised cluster are ignored. After handling an event the clamped need
+ * is compared with the new active count and the control thread is woken if
+ * an adjustment is possible.
+ *
+ * Always returns NOTIFY_OK.
+ */
+static int __ref cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ uint32_t cpu = (uintptr_t)hcpu;
+ struct cpu_data *state = &per_cpu(cpu_state, cpu);
+ struct cluster_data *cluster = state->cluster;
+ unsigned int need;
+ int ret = NOTIFY_OK;
+
+ /* Don't affect suspend resume */
+ if (action & CPU_TASKS_FROZEN)
+ return NOTIFY_OK;
+
+ if (unlikely(!cluster || !cluster->inited))
+ return NOTIFY_OK;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+
+ /* If online state of CPU somehow got out of sync, fix it. */
+ if (state->online) {
+ state->online = false;
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ pr_warn("CPU%d offline when state is online\n", cpu);
+ }
+ break;
+
+ case CPU_ONLINE:
+
+ state->online = true;
+ cluster->active_cpus = get_active_cpu_count(cluster);
+
+ /*
+ * Moving to the end of the list should only happen in
+ * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an
+ * infinite list traversal when thermal (or other entities)
+ * reject trying to online CPUs.
+ */
+ move_cpu_lru(state);
+ break;
+
+ case CPU_DEAD:
+ /*
+ * We don't want to have a CPU both offline and isolated.
+ * So unisolate a CPU that went down if it was isolated by us.
+ */
+ if (state->isolated_by_us) {
+ /* The _unlocked variant is called here; sched_unisolate_cpu() takes the hotplug lock itself. */
+ sched_unisolate_cpu_unlocked(cpu);
+ state->isolated_by_us = false;
+ }
+
+ /* Move a CPU to the end of the LRU when it goes offline. */
+ move_cpu_lru(state);
+
+ /* Fall through */
+
+ case CPU_UP_CANCELED:
+
+ /* If online state of CPU somehow got out of sync, fix it. */
+ if (!state->online)
+ pr_warn("CPU%d online when state is offline\n", cpu);
+
+ state->online = false;
+ state->busy = 0;
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ break;
+ }
+
+ need = apply_limits(cluster, cluster->need_cpus);
+ if (adjustment_possible(cluster, need))
+ wake_up_core_ctl_thread(cluster);
+
+ return ret;
+}
+
+/* Hotplug notifier block registered by core_ctl_init(). */
+static struct notifier_block __refdata cpu_notifier = {
+ .notifier_call = cpu_callback,
+};
+
+/* ============================ init code ============================== */
+
+/*
+ * Look up an already claimed cluster slot by its first CPU.
+ * Returns the matching cluster, or NULL when none of the num_clusters slots
+ * in use has that first_cpu.
+ */
+static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu)
+{
+	unsigned int idx = 0;
+
+	while (idx < num_clusters) {
+		struct cluster_data *candidate = &cluster_state[idx];
+
+		if (candidate->first_cpu == first_cpu)
+			return candidate;
+		++idx;
+	}
+
+	return NULL;
+}
+
+/*
+ * Create core-control state for the CPUs in @mask.
+ *
+ * Returns 0 if a cluster with the same first CPU already exists or on
+ * success, -ENODEV if the first CPU has no device, -EINVAL if the cluster
+ * table is full or the mask holds more than MAX_CPUS_PER_CLUSTER CPUs, or
+ * the error from kthread_run()/kobject_add().
+ *
+ * The cluster size is validated before a cluster_state[] slot is claimed.
+ * Previously num_clusters was bumped first, so an oversized cluster
+ * permanently consumed a slot (with first_cpu left at 0) and could keep a
+ * later, valid cluster from being created.
+ *
+ * NOTE(review): a kthread_run() or kobject_add() failure still leaves the
+ * slot claimed; a kthread_run() failure leaves ->inited false, but ->inited
+ * is set before kobject_add() and stays true if that call fails.
+ */
+static int cluster_init(const struct cpumask *mask)
+{
+	struct device *dev;
+	unsigned int first_cpu = cpumask_first(mask);
+	unsigned int num_cpus = cpumask_weight(mask);
+	struct cluster_data *cluster;
+	struct cpu_data *state;
+	unsigned int cpu;
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+
+	if (find_cluster_by_first_cpu(first_cpu))
+		return 0;
+
+	dev = get_cpu_device(first_cpu);
+	if (!dev)
+		return -ENODEV;
+
+	pr_info("Creating CPU group %d\n", first_cpu);
+
+	if (num_clusters == MAX_CLUSTERS) {
+		pr_err("Unsupported number of clusters. Only %u supported\n",
+								MAX_CLUSTERS);
+		return -EINVAL;
+	}
+	if (num_cpus > MAX_CPUS_PER_CLUSTER) {
+		pr_err("HW configuration not supported\n");
+		return -EINVAL;
+	}
+
+	/* All cheap validation passed: claim the next free slot. */
+	cluster = &cluster_state[num_clusters];
+	++num_clusters;
+
+	cpumask_copy(&cluster->cpu_mask, mask);
+	cluster->num_cpus = num_cpus;
+	cluster->first_cpu = first_cpu;
+	cluster->min_cpus = 1;
+	cluster->max_cpus = cluster->num_cpus;
+	cluster->need_cpus = cluster->num_cpus;
+	cluster->offline_delay_ms = 100;
+	cluster->task_thres = UINT_MAX;
+	cluster->nrrun = cluster->num_cpus;
+	INIT_LIST_HEAD(&cluster->lru);
+	spin_lock_init(&cluster->pending_lock);
+
+	for_each_cpu(cpu, mask) {
+		pr_info("Init CPU%u state\n", cpu);
+
+		state = &per_cpu(cpu_state, cpu);
+		state->cluster = cluster;
+		state->cpu = cpu;
+		if (cpu_online(cpu))
+			state->online = true;
+		list_add_tail(&state->sib, &cluster->lru);
+	}
+	cluster->active_cpus = get_active_cpu_count(cluster);
+
+	cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster,
+					"core_ctl/%d", first_cpu);
+	if (IS_ERR(cluster->core_ctl_thread))
+		return PTR_ERR(cluster->core_ctl_thread);
+
+	/* The control thread runs as SCHED_FIFO at priority MAX_RT_PRIO-1. */
+	sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO,
+				   &param);
+
+	cluster->inited = true;
+
+	kobject_init(&cluster->kobj, &ktype_core_ctl);
+	return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl");
+}
+
+/*
+ * cpufreq policy notifier: when a policy is created, set up core control
+ * for its related CPUs. A failure is only logged. Always returns NOTIFY_OK.
+ */
+static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct cpufreq_policy *policy = data;
+
+	if (val == CPUFREQ_CREATE_POLICY) {
+		int err = cluster_init(policy->related_cpus);
+
+		if (err)
+			pr_warn("unable to create core ctl group: %d\n", err);
+	}
+
+	return NOTIFY_OK;
+}
+
+/* Registered on CPUFREQ_POLICY_NOTIFIER by core_ctl_init(). */
+static struct notifier_block cpufreq_pol_nb = {
+	.notifier_call = cpufreq_policy_cb,
+};
+
+/*
+ * cpufreq governor-info notifier: forward load-change reports to
+ * core_ctl_set_busy(). Always returns NOTIFY_OK.
+ */
+static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct cpufreq_govinfo *info = data;
+
+	if (val == CPUFREQ_LOAD_CHANGE)
+		core_ctl_set_busy(info->cpu, info->load);
+
+	return NOTIFY_OK;
+}
+
+/* Registered on CPUFREQ_GOVINFO_NOTIFIER by core_ctl_init(). */
+static struct notifier_block cpufreq_gov_nb = {
+	.notifier_call = cpufreq_gov_cb,
+};
+
+/*
+ * Late initcall: derive the check interval, register the hotplug and
+ * cpufreq notifiers, then, under the device-hotplug lock, create a cluster
+ * for the cpufreq policy of every online CPU. Cluster creation failures are
+ * logged and otherwise ignored. Always returns 0.
+ */
+static int __init core_ctl_init(void)
+{
+	unsigned int cpu;
+
+	core_ctl_check_interval = (rq_avg_period_ms - RQ_AVG_TOLERANCE)
+					* NSEC_PER_MSEC;
+
+	register_cpu_notifier(&cpu_notifier);
+	cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
+	cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
+
+	lock_device_hotplug();
+	for_each_online_cpu(cpu) {
+		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+		int err;
+
+		if (!policy)
+			continue;
+
+		err = cluster_init(policy->related_cpus);
+		if (err)
+			pr_warn("unable to create core ctl group: %d\n", err);
+		cpufreq_cpu_put(policy);
+	}
+	unlock_device_hotplug();
+
+	/* From here on core_ctl_check() does real work. */
+	initialized = true;
+	return 0;
+}
+
+late_initcall(core_ctl_init);
diff --git a/kernel/sched/core_ctl.h b/kernel/sched/core_ctl.h
new file mode 100644
index 000000000000..3b0c12acb9c0
--- /dev/null
+++ b/kernel/sched/core_ctl.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __CORE_CTL_H
+#define __CORE_CTL_H
+
+/*
+ * Core control entry points. Without CONFIG_SCHED_CORE_CTL both calls
+ * compile to empty inline stubs, so callers need no #ifdef of their own.
+ */
+#ifdef CONFIG_SCHED_CORE_CTL
+void core_ctl_check(u64 wallclock);
+void core_ctl_set_boost(bool boost);
+#else
+static inline void core_ctl_check(u64 wallclock) {}
+static inline void core_ctl_set_boost(bool boost) {}
+#endif
+#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e893b0fcac6b..83da13b5f6b8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2941,6 +2941,8 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c,
struct cpumask search_cpus;
cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus);
+ cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask);
+
if (env->ignore_prev_cpu)
cpumask_clear_cpu(env->prev_cpu, &search_cpus);
@@ -3009,7 +3011,8 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
prev_cpu = env->prev_cpu;
if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) ||
- unlikely(!cpu_active(prev_cpu)))
+ unlikely(!cpu_active(prev_cpu)) ||
+ cpu_isolated(prev_cpu))
return false;
if (task->ravg.mark_start - task->last_cpu_selected_ts >=
@@ -7354,6 +7357,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
+ if (cpumask_test_cpu(cpu, cpu_isolated_mask))
+ continue;
/*
* build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
@@ -7381,7 +7386,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity += group->sgc->capacity;
+ cpumask_t *cpus = sched_group_cpus(group);
+
+ /* Revisit this later. This won't work for MT domain */
+ if (!cpu_isolated(cpumask_first(cpus)))
+ capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
@@ -7521,6 +7530,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
power_cost(i, 0),
cpu_temp(i));
+ if (cpu_isolated(i))
+ continue;
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -7548,17 +7560,27 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->idle_cpus++;
}
- /* Adjust by relative CPU capacity of the group */
- sgs->group_capacity = group->sgc->capacity;
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+ /* Isolated CPU has no weight */
+ if (!group->group_weight) {
+ sgs->group_capacity = 0;
+ sgs->avg_load = 0;
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_other;
+ sgs->group_weight = group->group_weight;
+ } else {
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
- if (sgs->sum_nr_running)
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+ sgs->group_weight = group->group_weight;
- sgs->group_weight = group->group_weight;
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(group, sgs, env);
+ }
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(group, sgs, env);
+ if (sgs->sum_nr_running)
+ sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
}
#ifdef CONFIG_SCHED_HMP
@@ -8601,6 +8623,9 @@ static int idle_balance(struct rq *this_rq)
int pulled_task = 0;
u64 curr_cost = 0;
+ if (cpu_isolated(this_cpu))
+ return 0;
+
idle_enter_fair(this_rq);
/*
@@ -8908,16 +8933,21 @@ static void nohz_balancer_kick(int type)
return;
}
+/*
+ * Remove @cpu from the nohz idle mask and drop the nohz CPU count, but only
+ * if the CPU is currently present in the mask.
+ */
+void nohz_balance_clear_nohz_mask(int cpu)
+{
+	if (unlikely(!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)))
+		return;
+
+	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+	atomic_dec(&nohz.nr_cpus);
+}
+
static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
/*
* Completely isolated CPUs don't ever set, so we must test.
*/
- if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
- }
+ nohz_balance_clear_nohz_mask(cpu);
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
}
@@ -8974,7 +9004,7 @@ void nohz_balance_enter_idle(int cpu)
/*
* If we're a completely isolated CPU, we don't play.
*/
- if (on_null_domain(cpu_rq(cpu)))
+ if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
return;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
@@ -9003,7 +9033,13 @@ static DEFINE_SPINLOCK(balancing);
*/
void update_max_interval(void)
{
- max_load_balance_interval = HZ*num_online_cpus()/10;
+ cpumask_t avail_mask;
+ unsigned int available_cpus;
+
+ cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
+ available_cpus = cpumask_weight(&avail_mask);
+
+ max_load_balance_interval = HZ*available_cpus/10;
}
/*
@@ -9342,8 +9378,10 @@ void trigger_load_balance(struct rq *rq)
{
int type = NOHZ_KICK_ANY;
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
+ /* Don't need to rebalance while attached to a NULL domain or
+ * while this cpu is isolated.
+ */
+ if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
return;
if (time_after_eq(jiffies, rq->next_balance))
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 5002619961ce..6e1757aa1541 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -19,6 +19,7 @@
#include <linux/syscore_ops.h>
#include "sched.h"
+#include "core_ctl.h"
#include <trace/events/sched.h>
@@ -1090,6 +1091,8 @@ int sched_set_boost(int enable)
if (!old_refcount && boost_refcount)
boost_kick_cpus();
+ if (boost_refcount <= 1)
+ core_ctl_set_boost(boost_refcount == 1);
trace_sched_set_boost(boost_refcount);
spin_unlock_irqrestore(&boost_lock, flags);
@@ -1499,28 +1502,10 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
if (write && (old_val == *data))
goto done;
- /*
- * Special handling for sched_freq_aggregate_threshold_pct
- * which can be greater than 100. Use 1000 as an upper bound
- * value which works for all practical use cases.
- */
- if (data == &sysctl_sched_freq_aggregate_threshold_pct) {
- if (*data > 1000) {
- *data = old_val;
- ret = -EINVAL;
- goto done;
- }
- } else if (data != &sysctl_sched_select_prev_cpu_us) {
- /*
- * all tunables other than sched_select_prev_cpu_us are
- * in percentage.
- */
- if (sysctl_sched_downmigrate_pct >
- sysctl_sched_upmigrate_pct || *data > 100) {
- *data = old_val;
- ret = -EINVAL;
- goto done;
- }
+ if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) {
+ *data = old_val;
+ ret = -EINVAL;
+ goto done;
}
/*
@@ -2828,10 +2813,10 @@ void set_window_start(struct rq *rq)
rq->curr->ravg.mark_start = rq->window_start;
}
-void migrate_sync_cpu(int cpu)
+void migrate_sync_cpu(int cpu, int new_cpu)
{
if (cpu == sync_cpu)
- sync_cpu = smp_processor_id();
+ sync_cpu = new_cpu;
}
static void reset_all_task_stats(void)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index cfec881491ef..ba4403e910d8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -265,8 +265,12 @@ static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
- /* Try to pull RT tasks here if we lower this rq's prio */
- return rq->rt.highest_prio.curr > prev->prio;
+ /*
+ * Try to pull RT tasks here if we lower this rq's prio and cpu is not
+ * isolated
+ */
+ return rq->rt.highest_prio.curr > prev->prio &&
+ !cpu_isolated(cpu_of(rq));
}
static inline int rt_overloaded(struct rq *rq)
@@ -1694,6 +1698,8 @@ static int find_lowest_rq_hmp(struct task_struct *task)
for_each_sched_cluster(cluster) {
cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
+ cpumask_andnot(&candidate_mask, &candidate_mask,
+ cpu_isolated_mask);
if (cpumask_empty(&candidate_mask))
continue;
@@ -2282,7 +2288,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+ if (!task_on_rq_queued(p) || rq->rt.rt_nr_running ||
+ cpu_isolated(cpu_of(rq)))
return;
queue_pull_task(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec7721112b05..41abb4dabeb7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1069,7 +1069,7 @@ extern void clear_boost_kick(int cpu);
extern void clear_hmp_request(int cpu);
extern void mark_task_starting(struct task_struct *p);
extern void set_window_start(struct rq *rq);
-extern void migrate_sync_cpu(int cpu);
+extern void migrate_sync_cpu(int cpu, int new_cpu);
extern void update_cluster_topology(void);
extern void set_task_last_wake(struct task_struct *p, u64 wallclock);
extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock);
@@ -1424,7 +1424,7 @@ static inline void clear_boost_kick(int cpu) { }
static inline void clear_hmp_request(int cpu) { }
static inline void mark_task_starting(struct task_struct *p) { }
static inline void set_window_start(struct rq *rq) { }
-static inline void migrate_sync_cpu(int cpu) { }
+static inline void migrate_sync_cpu(int cpu, int new_cpu) {}
static inline void update_cluster_topology(void) { }
static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) { }
static inline void set_task_last_switch_out(struct task_struct *p,
@@ -1953,6 +1953,7 @@ extern const struct sched_class idle_sched_class;
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
+extern void nohz_balance_clear_nohz_mask(int cpu);
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
diff --git a/kernel/smp.c b/kernel/smp.c
index abdc48cd79a3..b2ec21c5c9d6 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -766,8 +766,8 @@ void wake_up_all_idle_cpus(void)
for_each_online_cpu(cpu) {
if (cpu == smp_processor_id())
continue;
-
- wake_up_if_idle(cpu);
+ if (!cpu_isolated(cpu))
+ wake_up_if_idle(cpu);
}
preempt_enable();
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 07fef40d1274..dad3324e7372 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,9 @@ static int one_hundred = 100;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
+#ifdef CONFIG_SCHED_HMP
+static int one_thousand = 1000;
+#endif
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -336,6 +339,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
},
{
.procname = "sched_spill_nr_run",
@@ -351,6 +356,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
},
{
.procname = "sched_downmigrate",
@@ -358,6 +365,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
},
{
.procname = "sched_init_task_load",
@@ -365,6 +374,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
},
{
.procname = "sched_select_prev_cpu_us",
@@ -372,6 +383,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
},
{
.procname = "sched_enable_colocation",
@@ -397,6 +409,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
},
{
.procname = "sched_big_waker_task_load",
@@ -404,6 +418,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
},
{
.procname = "sched_enable_thread_grouping",
@@ -440,6 +456,13 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ /*
+ * Special handling for sched_freq_aggregate_threshold_pct
+ * which can be greater than 100. Use 1000 as an upper bound
+ * value which works for all practical use cases.
+ */
+ .extra2 = &one_thousand,
},
{
.procname = "sched_boost",
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fa909f9fd559..1b0117198a08 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -880,7 +880,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
base->cpu_base->active_bases |= 1 << base->index;
- timer->state = HRTIMER_STATE_ENQUEUED;
+ timer->state |= HRTIMER_STATE_ENQUEUED;
return timerqueue_add(&base->active, &timer->node);
}
@@ -900,11 +900,9 @@ static void __remove_hrtimer(struct hrtimer *timer,
u8 newstate, int reprogram)
{
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- u8 state = timer->state;
- timer->state = newstate;
- if (!(state & HRTIMER_STATE_ENQUEUED))
- return;
+ if (!(timer->state & HRTIMER_STATE_ENQUEUED))
+ goto out;
if (!timerqueue_del(&base->active, &timer->node))
cpu_base->active_bases &= ~(1 << base->index);
@@ -921,6 +919,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
if (reprogram && timer == cpu_base->next_timer)
hrtimer_force_reprogram(cpu_base, 1);
#endif
+
+out:
+ /*
+ * We need to preserve PINNED state here, otherwise we may end up
+ * migrating pinned hrtimers as well.
+ */
+ timer->state = newstate | (timer->state & HRTIMER_STATE_PINNED);
}
/*
@@ -949,6 +954,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
state = HRTIMER_STATE_INACTIVE;
__remove_hrtimer(timer, base, state, reprogram);
+ timer->state &= ~HRTIMER_STATE_PINNED;
return 1;
}
return 0;
@@ -1002,6 +1008,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
timer_stats_hrtimer_set_start_info(timer);
+ /* Update pinned state */
+ timer->state &= ~HRTIMER_STATE_PINNED;
+ timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT;
+
leftmost = enqueue_hrtimer(timer, new_base);
if (!leftmost)
goto unlock;
@@ -1176,8 +1186,8 @@ bool hrtimer_active(const struct hrtimer *timer)
cpu_base = READ_ONCE(timer->base->cpu_base);
seq = raw_read_seqcount_begin(&cpu_base->seq);
- if (timer->state != HRTIMER_STATE_INACTIVE ||
- cpu_base->running == timer)
+ if (((timer->state & ~HRTIMER_STATE_PINNED) !=
+ HRTIMER_STATE_INACTIVE) || cpu_base->running == timer)
return true;
} while (read_seqcount_retry(&cpu_base->seq, seq) ||
@@ -1614,13 +1624,17 @@ static void init_hrtimers_cpu(int cpu)
hrtimer_init_hres(cpu_base);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
+#if defined(CONFIG_HOTPLUG_CPU)
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
- struct hrtimer_clock_base *new_base)
+ struct hrtimer_clock_base *new_base,
+ bool remove_pinned)
{
struct hrtimer *timer;
struct timerqueue_node *node;
+ struct timerqueue_head pinned;
+ int is_pinned;
+
+ timerqueue_init_head(&pinned);
while ((node = timerqueue_getnext(&old_base->active))) {
timer = container_of(node, struct hrtimer, node);
@@ -1633,6 +1647,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* under us on another CPU
*/
__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+
+ is_pinned = timer->state & HRTIMER_STATE_PINNED;
+ if (!remove_pinned && is_pinned) {
+ timerqueue_add(&pinned, &timer->node);
+ continue;
+ }
+
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -1644,17 +1665,23 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
*/
enqueue_hrtimer(timer, new_base);
}
+
+ /* Re-queue pinned timers on the old base for the non-hotplug use case */
+ while ((node = timerqueue_getnext(&pinned))) {
+ timer = container_of(node, struct hrtimer, node);
+
+ timerqueue_del(&pinned, &timer->node);
+ enqueue_hrtimer(timer, old_base);
+ }
}
-static void migrate_hrtimers(int scpu)
+static void __migrate_hrtimers(int scpu, bool remove_pinned)
{
struct hrtimer_cpu_base *old_base, *new_base;
+ unsigned long flags;
int i;
- BUG_ON(cpu_online(scpu));
- tick_cancel_sched_timer(scpu);
-
- local_irq_disable();
+ local_irq_save(flags);
old_base = &per_cpu(hrtimer_bases, scpu);
new_base = this_cpu_ptr(&hrtimer_bases);
/*
@@ -1666,7 +1693,7 @@ static void migrate_hrtimers(int scpu)
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(&old_base->clock_base[i],
- &new_base->clock_base[i]);
+ &new_base->clock_base[i], remove_pinned);
}
raw_spin_unlock(&old_base->lock);
@@ -1674,7 +1701,20 @@ static void migrate_hrtimers(int scpu)
/* Check, if we got expired work to do */
__hrtimer_peek_ahead_timers();
- local_irq_enable();
+ local_irq_restore(flags);
+}
+
+static void migrate_hrtimers(int scpu)
+{
+ BUG_ON(cpu_online(scpu));
+ tick_cancel_sched_timer(scpu);
+
+ __migrate_hrtimers(scpu, true);
+}
+
+void hrtimer_quiesce_cpu(void *cpup)
+{
+ __migrate_hrtimers(*(int *)cpup, false);
}
#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 51896272fcde..0efb3916f5a4 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1620,56 +1620,86 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
-#ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
+#if defined(CONFIG_HOTPLUG_CPU)
+static void migrate_timer_list(struct tvec_base *new_base,
+ struct hlist_head *head, bool remove_pinned)
{
struct timer_list *timer;
int cpu = new_base->cpu;
+ struct hlist_node *n;
+ int is_pinned;
- while (!hlist_empty(head)) {
- timer = hlist_entry(head->first, struct timer_list, entry);
- /* We ignore the accounting on the dying cpu */
- detach_timer(timer, false);
+ hlist_for_each_entry_safe(timer, n, head, entry) {
+ is_pinned = timer->flags & TIMER_PINNED_ON_CPU;
+ if (!remove_pinned && is_pinned)
+ continue;
+
+ detach_if_pending(timer, get_timer_base(timer->flags), false);
timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
internal_add_timer(new_base, timer);
}
}
-static void migrate_timers(int cpu)
+static void __migrate_timers(int cpu, bool wait, bool remove_pinned)
{
struct tvec_base *old_base;
struct tvec_base *new_base;
+ unsigned long flags;
int i;
- BUG_ON(cpu_online(cpu));
old_base = per_cpu_ptr(&tvec_bases, cpu);
new_base = get_cpu_ptr(&tvec_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
*/
- spin_lock_irq(&new_base->lock);
+ spin_lock_irqsave(&new_base->lock, flags);
spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
- BUG_ON(old_base->running_timer);
+ if (wait) {
+ /* Ensure timers are done running before continuing */
+ while (old_base->running_timer) {
+ spin_unlock(&old_base->lock);
+ spin_unlock_irqrestore(&new_base->lock, flags);
+ cpu_relax();
+ spin_lock_irqsave(&new_base->lock, flags);
+ spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+ }
+ } else {
+ BUG_ON(old_base->running_timer);
+ }
for (i = 0; i < TVR_SIZE; i++)
- migrate_timer_list(new_base, old_base->tv1.vec + i);
+ migrate_timer_list(new_base, old_base->tv1.vec + i,
+ remove_pinned);
for (i = 0; i < TVN_SIZE; i++) {
- migrate_timer_list(new_base, old_base->tv2.vec + i);
- migrate_timer_list(new_base, old_base->tv3.vec + i);
- migrate_timer_list(new_base, old_base->tv4.vec + i);
- migrate_timer_list(new_base, old_base->tv5.vec + i);
+ migrate_timer_list(new_base, old_base->tv2.vec + i,
+ remove_pinned);
+ migrate_timer_list(new_base, old_base->tv3.vec + i,
+ remove_pinned);
+ migrate_timer_list(new_base, old_base->tv4.vec + i,
+ remove_pinned);
+ migrate_timer_list(new_base, old_base->tv5.vec + i,
+ remove_pinned);
}
- old_base->active_timers = 0;
- old_base->all_timers = 0;
-
spin_unlock(&old_base->lock);
- spin_unlock_irq(&new_base->lock);
+ spin_unlock_irqrestore(&new_base->lock, flags);
put_cpu_ptr(&tvec_bases);
}
+/* Migrate timers from 'cpu' to this_cpu */
+static void migrate_timers(int cpu)
+{
+ BUG_ON(cpu_online(cpu));
+ __migrate_timers(cpu, false, true);
+}
+
+void timer_quiesce_cpu(void *cpup)
+{
+ __migrate_timers(*(int *)cpup, true, false);
+}
+
static int timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 9270e1ac6460..49fa2e6eea98 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -15,5 +15,3 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
-EXPORT_TRACEPOINT_SYMBOL(core_ctl_set_busy);
-EXPORT_TRACEPOINT_SYMBOL(core_ctl_eval_need);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 029da92fb712..7f21591c8ec5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -13,6 +13,7 @@
#include <linux/mm.h>
#include <linux/cpu.h>
+#include <linux/device.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -95,6 +96,7 @@ static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
+static DEFINE_PER_CPU(unsigned int, watchdog_en);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
@@ -586,9 +588,17 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
sched_setscheduler(current, policy, &param);
}
-static void watchdog_enable(unsigned int cpu)
+/* Must be called with hotplug lock (lock_device_hotplug()) held. */
+void watchdog_enable(unsigned int cpu)
{
struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ unsigned int *enabled = raw_cpu_ptr(&watchdog_en);
+
+ lock_device_hotplug_assert();
+
+ if (*enabled)
+ return;
+ *enabled = 1;
/* kick off the timer for the hardlockup detector */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -606,9 +616,17 @@ static void watchdog_enable(unsigned int cpu)
__touch_watchdog();
}
-static void watchdog_disable(unsigned int cpu)
+/* Must be called with hotplug lock (lock_device_hotplug()) held. */
+void watchdog_disable(unsigned int cpu)
{
struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ unsigned int *enabled = raw_cpu_ptr(&watchdog_en);
+
+ lock_device_hotplug_assert();
+
+ if (!*enabled)
+ return;
+ *enabled = 0;
watchdog_set_prio(SCHED_NORMAL, 0);
hrtimer_cancel(hrtimer);