diff options
| author | Linux Build Service Account <lnxbuild@localhost> | 2016-09-30 11:33:23 -0600 |
|---|---|---|
| committer | Linux Build Service Account <lnxbuild@localhost> | 2016-09-30 11:33:23 -0600 |
| commit | 89ca2e2c45e6e48a2d47ebb5bf845bfec88e4319 (patch) | |
| tree | a4a13dd84e201ffd7398caffd3043265621e1aa4 /kernel | |
| parent | 8d9657e7e077f13fd3e4e8091d3f1c5ca81f6649 (diff) | |
| parent | 46692be6dd06ce405cb09cf8338b1f0edbbfb295 (diff) | |
Promotion of kernel.lnx.4.4-160930.
CRs Change ID Subject
--------------------------------------------------------------------------------------------------------------
1054226 Ifefb2f1cf6c24af7bc46fc62797955b8c8ad5815 perf: Add cpu isolation awareness
1055668 Ibb8afffdc1e4780a48d085918cb6d6cf84cc0dba msm: camera: Export IR LED device to mm-qcamera-daemon
1065513 I45690b239c73f636538b864f0c4a7e539a02eedb input: touchscreen: Change dev_pm_ops for Goodix driver
1069068 Ie19fececd9d2bc6cd3328a6c63c956bcc9eed9a6 arm64: process: Reduce the no. of bytes of data around r
1064336 I18a27267eebdca2c87bf4bffc11a120822cdc7c2 msm: ipa3: hal: change FnR prints to low
1063261 Id38c8e21a853907c884bccd2978f2fd0a547a1ca wil6210: fix wiphy registration sequence
1067981 I3f78196927501f582c36d5815096581185d797b4 soc: qcom: glink: Fix uninitialzed return value.
1065513 I3a00fb46106f859128f0fa9b8c99b5d6ba24bc7b input: touchscreen: Code clean-up for Goodix driver
1054226 I2c62af441fb9e5ba9f29719853a63e4c8f2d031b defconfig: msm: Disable core control helper
1054000 I783b7df9c7e0253e5dc88bd60e0b5300e26fba56 drivers: soc: Enable APR driver to use audio notifier
1044635 1053246 I8c5b8ffc62d34a44bfb47ec4f11477d4320d30a8 misc: qcom: qdsp6v2: SSR recovery support for NT decoder
1001175 I7d1e4d5cc421b800d2f00b6d23f6ff19ba4c4c7d msm: pcie: configurable PERST propagation delay
1067178 I9951f061ad22cc91eba1c75aba3bdfbdde904cb9 ASoC: wcd934x: Add dapm ignore suspend for backend dais
1068464 I479e23db73a64e0fc1371e6b5abfaf1c8969954b ARM: dts: msm: correct PM8005 S1 min/max voltage for msm
1053827 Icee01f6ba95e469acac9eed6bf2fdbc83947f5a8 msm: kgsl: Revisit the GPU snapshot dumping
1068464 I8539ca3a3456b9562e7ff0e48fd7824c15cea68f regulator: cpr3-mmss-regulator: update reference voltage
1066695 I11e71639291479d544849d0f8672b9384fa34d0a ARM: dts: msm: Add SMP2P entries for MSMTRITON
1064336 Ie1f0f48ffc1fd67fc8a2074d3d334fb8cd29c99f msm: ipa3: change FnR prints to low
1054000 I15325c1385eaa0f0cca2c07130f2b4a997d98e1f drivers: soc: Add Audio Notifier, PDR, and SSR drivers
1071217 I37e6f76b60ef0085d102c5d98179b467f6b77dee ARM: dts: msm: Add bus name to venus pil for MSMCOBALT
1060610 I88b6c0748e6683b2f11b751840ab26e3ed397b70 ASoC: wcd9335: add handset speaker gain mixer control
1054000 Ief6e89b003aa1e2b02f33e21e3cb17f8731425ad ASoC: msmcobalt: Add Audio Notifier, PDR, and SSR to Kco
1054226 I62acddeb707fc7d5626580166b3466e63f45fd89 core_ctl_helper: Remove code since it is not used anymor
1056661 Ida0d2d010f7bd226d7e2221f63e64a1d7e5a9075 ARM: dts: msm: Remove 710Mhz frequency for A540v2 GPU.
1063261 I51e58438672a45d210df5db3ac813e656cb525df wil6210: change HALP logging category to IRQ
1062271 If462fe3d82f139d72547f82dc7eb564f83cb35bf ASoC: msm: initialize the params array before using it
1054226 I4f1514ba5bac2e259a1105fcafb31d6a92ddd249 sched/core_ctl: Integrate core control with cpu isolatio
1063261 Ibc1e6dc2994268a60384b7d9bd459abc3791a0c4 wil6210: fix protection of wil->scan_request
1066424 Ic6dd2d1e7f829630dc6eae5ff74fae04f7fc7f9b ARM: dts: msm: enable more FG interrupts for PMICOBALT
1066695 I82e992df1bfa1113843a0772ca8b88e48050dd8d ARM: dts: msm: Add glink_pkt devices for MSMTRITON
1069455 Ibae066276b099ffb78c72a890a689f83e4df56a9 thermal: tsens: Update sensor ID index
1054226 Ice1a9503666a2b720bdb324289ca55ceb33097cd cpumask: Add cpu isolation support
1054226 I96505aeb9d07a6fa3a2c28648ffa299e0cfa2e41 sched/core_ctl: Refactor cpu data
1054226 Ib911a0d34c250c4df020bdb265b92d2b8df8db93 timer: Do not require CPUSETS to be enabled for migratio
1054226 Ie4c6cb1496ae3490d81681f1ad51c8103caa0014 soc: qcom: watchdog_v2: Add support for cpu isolation
1068464 I3fbfa4c1fc5a6b4f30f8acaf659d2abcc05a7d16 ARM: dts: msm: modify VDD_GFX CPR ceiling voltages for m
1066695 I1aaeec4eea40fedbadf8b6008233dbd6ce5b3312 ARM: dts: msm: Add ipc-spinlock entry for MSMTRITON
1054226 Icc4d1c183e993b4b3c9b96ec9779c234e73ecab7 hrtimer: make sure PINNED flag is cleared after removing
1069455 I63e2a0a29f6bbe787fa10170c28569cf692d2807 ARM: dts: msm: Update TSENS sensor ID for MSMCOBALT
1070187 Id2de8e2ac94476c1a4927f719f2987a31d692ab5 ARM: dts: msm: modify VDD_APC0/1 CPR voltages for msmcob
1065513 I0a9037eac6e30a6319919043dd2ef1c226663af9 input: touchscreen: Remove irq polling from Goodix drive
1065513 I5132854367330a9b47f678409cbe6a45f2b5abb3 msm: reap unused kernel files
1063261 I20bc8d4b2b58fc3235ec3fe778738055d7535276 wil6210: extract firmware capabilities from FW file
1069380 Ic332f87666e405edbf3511671828ba824af1f3a2 ARM: dts: msm: Fix smem rpm xprt entry for msmfalcon
1054226 I8624e0659b86b7b8fa425a3fafdb0784fe005124 timer: create timer_quiesce_cpu() to isolate CPU from ti
1056910 I09fcc019133f4d37b7be3287da8e0733e40fc0ac sched: constrain HMP scheduler tunable range with in bet
1063261 Ib5c55a37208d76505658635b12afc88552d5a220 wil6210: support rx key setting for all TIDs
1071204 Ib44789559c69e5808ed362cf9191486c93b2b66e msm: mink: Accept zero args for invoke command
1071752 I1dddc7df26caa1556e57128603afd32b2613ebde soc: qcom: pil: add PBL spare error status
1066569 I99ce1e5940506a5e65debfe822460c210a276b00 thermal: adc_tm: Initialize ADC parameters
1054000 I2ab027d4a6e2cb98df5358e24f6bebacd9aecca7 drivers: soc: Add Kconfigs for Audio Notifier, PDR, and
1002389 Id74650d5c4aaf9f84a56372f60ff5a40374e8f7b diag: Fix possible buffer corruption by proper typecasti
1054226 I5975f1e5d7a1947dc5ee1cf8a0c16ec88b0fc6fb defconfig: msmcortex: Enable core control
1065513 I3ade13181957d327ad9d0266b1999a4b0f2d8d1a input: touchscreen: Add threaded irq support to Goodix d
1069084 I916e37bd79a6645bdc186a78a65051ce1c4dd475 soc: qcom: Listen to SUBSYS_AFTER_SHUTDOWN notification
1054226 I401de0b52fa6d20573187265ee56edd543b1419e vmstat: Add cpu isolation awareness
1066422 Id9f28a0eeb2a904aca41eb46d0215d80287e0b88 qcom-charger: fg-util: add float decode function
1071363 I542fab59eadbea404c0001d25315172cc993488b ASoC: wcd934x: Avoid pop during dsd path tear down
1054226 I51259ea41e3bd5cdba50b718201a6840174a7224 hrtimer: create hrtimer_quiesce_cpu() to isolate CPU fro
1054226 I5d849dfd29aa5bb594454473768d7db1da258028 sched/tick: Ensure timers does not get queued on isolate
1054226 I7b50778615541a64f9956573757c7f28748c4f69 irq: Make irq affinity function cpu isolation aware
1068946 I6943b7f5565ad95eddb9e3d30de5efbc47106e3d qpnp-smb2: support batteryless platforms
1054226 I83e9fbb800df259616a145d311b50627dc42a5ff pmqos: Enable cpu isolation awareness
1054226 I632f37874ef79887ee1202a028ef734f392d6ed0 hrtimer.h: prevent pinned timer state from breaking inac
1066695 I53657de1a41e727b29793f02c7f2c1a43db1c96c ARM: dts: msm: Add G-Link SSR entries for MSMTRITON
1055668 I63da161f90bce2c33d1e7e9d5822c8635e307fd5 msm: camera: Export IR CUT device to mm-qcamera-daemon
1054226 I88a728ee1d54aaa887fab52e5e40d1d4e4fc69ca watchdog: Add support for cpu isolation
1070189 I791941fbede4b136c3f24d15b7fb0b60dd5477e6 ASoC: wcd934x: Check for null pointer before access
1066695 I6ee171881943d8ab77445ede1c6ee714ed171d4d ARM: dts: msm: Add SMEM entry for MSMTRITON
1054226 I370e404001344e635a663822b07557abbe0f6f52 timer: Add function to migrate timers
1046649 I941f91eeba01f4e7aa5427056bc57875e7edf197 msm: kgsl: Add memory and periphery clock control for A5
1068888 I5637e52be59ea9504ea6ae317394bef0c28c7865 net: ipc_router: fix NULL pointer de-reference issue
1066695 I276db2a07870864fca046627a615a30bc4e3936e ARM: dts: msm: Add IPC Router devices for MSMTRITON
1067981 I82b08ff548a9abb0538a0ba24f699a99547ec7b7 soc: qcom: glink_ssr: Fix uninitialized variable
1066695 I1e59ec8028e128a764d3a79d446b5e8d650937b1 ARM: dts: msm: Add G-Link SMEM Transport entries for MSM
1054226 I24d6e91b6dff468c640c2fe3a37a7f31b6f0c79a timer: Ensure timers are not running before migrating
1054226 I65943d8e4a9eac1f9f5a40ad5aaf166679215f48 trace: Move core control trace events to scheduler
1065513 I280f2201c69838ad4da8eb94e9f10768f54ed457 input: touchscreen: Fix issues in suspend path
1065513 I6c18e153ddf18667ca83d47df20c71bce6dbfa21 input: touchscreen: Remove redundant code from Goodix dr
1055668 I2e04fa47efd1454bb487eca67bd9ceaeab3e9edf msm: camera: Add a driver to control IR LED device
1054226 Ia78e701468ea3828195c2a15c9cf9fafd099804a core_ctrl: Move core control into kernel
1054226 If2d30000f068afc50db953940f4636ef6a089b24 sched/core: Add trace point for cpu isolation
1057562 I76c9a9e44755a4a77e6cffb1dc07f5b28c8b34b8 Migrate mpq demux driver from kernel 3.18 to 4.4
1036232 If6c273d9a86f9fc4bc841388b11b96c385dc64f4 net: cnss: Add support to get fw files for QCA SDIO targ
1068464 I2b0a8e5353c9bce25c965a8b6ead7494454466c9 ARM: dts: msm: modify VDD_GFX CPR voltage adjustments fo
1065513 Ic2b1b2562b63ccecdf15bdc64ad7e45996d196d3 input: touchscreen: Add debugfs support for suspend/resu
1070872 I6cbe4167ab9d980b75f4fefdd4add0d8e8adaef8 ARM: dts: msm: add slimbus7 and slimbus8 cpu dais for ms
1054226 If3b3770e547971809e789ea7c8033c48ec2aa92d hrtimer: update timer->state with 'pinned' information
1064336 I5cfadb3ee7cb339b89b0c428bae46d3802476eb9 msm: ipa3: increase SSR tag timeout
1055668 I30d1c4e6c40b8e58a70f06db9e05231b4c7f676f msm: camera: Add a driver to control IR CUT device
1054226 I0bbddb56238c2958c5987877c5bfc3e79afa67cc sched: add cpu isolation support
1051762 Ife6146d28c8bc834a79e861959eca03e58e12d5e ASoC: msm: qdsp6v2: Change device switch handling
1063261 I95c14c0fe7a33c078eb7d9aa44dd97a64f9b0fae wil6210: align to latest auto generated wmi.h
1063261 I169e0c94edf5df31336af1ede36900ec337f4314 wil6210: align to latest auto generated wmi.h
1071938 I7ab180e06ececf8136903ee04565b8b4a2bf3524 icnss: update logs for QMI rejected messages
1057065 Ie6234ae30ad47a063982e5cc50f4ecedf1f61de2 msm: pcie: verify EP is accessible before conf restore
993625 I2c99df5de44a6fd924ce7f5921db0e1cf3ba5d11 msm: pil: Adding function name and buffer size informati
1065983 I12154b0aa315fde6dd92267d4c8f4a78a6f0236f ASoC: msmcobalt: send ANC config for WCD9340 codec
1071464 Ic3ef2229fa8552301e09dfb912e79e044a81324f usb: dwc3: fix overriding core clock rate to default max
1054000 Ib88a71e2fdb2b58fd5f87a65cb7d3253884f2d97 ASoC: msmcobalt: Enable msmcobalt to use audio notifier
1063261 Ifb92501aab14843309fed8e1214a867b2ccccfce wil6210: Fix driver down flow
1065983 I8c83f6305dbc0a40b67bf2ffd53d37a0abdcf953 ASoC: wcd934x: enable rate converter clock for AANC
1054226 I07702bb5b738c1c75c49a2ca4cb08be0231ccb12 smp: Do not wake up all idle CPUs
1063261 Id17271823d167677a323dd1f52c7de4c6025b56e wil6210: prevent usage of incorrect TX hwtail
1063261 I79f8522ae84dd209cb98c3bbc52cfaeb199dd342 wil6210: fix stop p2p device handling
1057562 Ia50bd897f6bf4c0ea7adc27d53a657090a09e229 Migrate demux driver from kernel 3.18 to 4.4
1066563 I0e21c5966e0072eab826c92fc332c54e11cb0b23 thermal: tsens: Update readl call in msm_tsens_get_temp(
1054226 I6a13e8dda99130ca794e5b6f51600f4c57a3e921 drivers/base: cpu: Add node for cpu isolation
Change-Id: If82c9559e44520d270bc164fcb86b382b71301ff
CRs-Fixed: 1036232, 1069455, 1071204, 1001175, 1053827, 1057562, 1060610, 1068946, 1071363, 1071938, 1068888, 1069380, 1069068, 1065983, 1055668, 1066695, 1067981, 1063261, 1064336, 1067178, 1066422, 1066424, 1065513, 1066569, 1070189, 1057065, 1068464, 1071217, 1066563, 1069084, 1070187, 993625, 1053246, 1070872, 1051762, 1046649, 1056661, 1062271, 1002389, 1071752, 1056910, 1054000, 1044635, 1054226, 1071464
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cpu.c | 17 | ||||
| -rw-r--r-- | kernel/events/core.c | 3 | ||||
| -rw-r--r-- | kernel/irq/cpuhotplug.c | 11 | ||||
| -rw-r--r-- | kernel/power/qos.c | 7 | ||||
| -rw-r--r-- | kernel/sched/Makefile | 1 | ||||
| -rw-r--r-- | kernel/sched/core.c | 295 | ||||
| -rw-r--r-- | kernel/sched/core_ctl.c | 1055 | ||||
| -rw-r--r-- | kernel/sched/core_ctl.h | 24 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 74 | ||||
| -rw-r--r-- | kernel/sched/hmp.c | 33 | ||||
| -rw-r--r-- | kernel/sched/rt.c | 13 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 5 | ||||
| -rw-r--r-- | kernel/smp.c | 4 | ||||
| -rw-r--r-- | kernel/sysctl.c | 23 | ||||
| -rw-r--r-- | kernel/time/hrtimer.c | 74 | ||||
| -rw-r--r-- | kernel/time/timer.c | 68 | ||||
| -rw-r--r-- | kernel/trace/power-traces.c | 2 | ||||
| -rw-r--r-- | kernel/watchdog.c | 22 |
18 files changed, 1622 insertions, 109 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c index 1cfd381642da..3c97f5b88a07 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -768,6 +768,10 @@ static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); EXPORT_SYMBOL(cpu_active_mask); +static DECLARE_BITMAP(cpu_isolated_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_isolated_mask = to_cpumask(cpu_isolated_bits); +EXPORT_SYMBOL(cpu_isolated_mask); + void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) @@ -802,6 +806,14 @@ void set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); } +void set_cpu_isolated(unsigned int cpu, bool isolated) +{ + if (isolated) + cpumask_set_cpu(cpu, to_cpumask(cpu_isolated_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_isolated_bits)); +} + void init_cpu_present(const struct cpumask *src) { cpumask_copy(to_cpumask(cpu_present_bits), src); @@ -817,6 +829,11 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(to_cpumask(cpu_online_bits), src); } +void init_cpu_isolated(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_isolated_bits), src); +} + static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) diff --git a/kernel/events/core.c b/kernel/events/core.c index d6ec580584b6..5beb88f11671 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3386,7 +3386,8 @@ static int perf_event_read(struct perf_event *event, bool group) * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { + if (event->state == PERF_EVENT_STATE_ACTIVE && + !cpu_isolated(event->oncpu)) { struct perf_read_data data = { .event = event, .group = group, diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 011f8c4c63da..104432f3d311 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -11,6 +11,7 @@ #include <linux/interrupt.h> #include <linux/ratelimit.h> #include <linux/irq.h> +#include <linux/cpumask.h> #include "internals.h" @@ -20,6 +21,7 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity = d->common->affinity; struct irq_chip *c; bool ret = false; + struct cpumask available_cpus; /* * If this is a per-CPU interrupt, or the affinity does not @@ -29,8 +31,15 @@ static bool migrate_one_irq(struct irq_desc *desc) !cpumask_test_cpu(smp_processor_id(), affinity)) return false; + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - affinity = cpu_online_mask; + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_empty(affinity)) + affinity = cpu_online_mask; ret = true; } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 8ecc7b3f7dd9..69c32c42080f 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -45,6 +45,7 @@ #include <linux/seq_file.h> #include <linux/irq.h> #include <linux/irqdesc.h> +#include <linux/cpumask.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -447,6 +448,9 @@ EXPORT_SYMBOL_GPL(pm_qos_request); int pm_qos_request_for_cpu(int pm_qos_class, int cpu) { + if (cpu_isolated(cpu)) + return INT_MAX; + return pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu]; } EXPORT_SYMBOL(pm_qos_request_for_cpu); @@ -469,6 +473,9 @@ int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask) val = c->default_value; for_each_cpu(cpu, mask) { + if (cpu_isolated(cpu)) + continue; + switch (c->type) { case PM_QOS_MIN: if (c->target_per_cpu[cpu] < val) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 1f159743ebfc..508b65690288 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,3 +20,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7474463b9835..609aa2e588d7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> +#include <linux/irq.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -84,6 +85,7 @@ #endif #include "sched.h" +#include "core_ctl.h" #include "../workqueue_internal.h" #include "../smpboot.h" @@ -1229,6 +1231,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq *rq; unsigned int dest_cpu; int ret = 0; + cpumask_t allowed_mask; rq = task_rq_lock(p, &flags); @@ -1244,16 +1247,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + + dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask); if (dest_cpu >= nr_cpu_ids) { - ret = -EINVAL; - goto out; + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { + ret = -EINVAL; + goto out; + } + cpumask_copy(&allowed_mask, new_mask); } do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) goto out; if (task_running(rq, p) || p->state == TASK_WAKING) { @@ -1577,12 +1586,13 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ -static int select_fallback_rq(int cpu, struct task_struct *p) +static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; int dest_cpu; + int isolated_candidate = -1; /* * If the node that the cpu is on has been offlined, cpu_to_node() @@ -1598,6 +1608,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; } @@ -1610,6 +1622,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) { + if (allow_iso) + isolated_candidate = dest_cpu; + continue; + } + goto out; + } + + if (isolated_candidate != -1) { + dest_cpu = isolated_candidate; goto out; } @@ -1655,6 +1677,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { + bool allow_isolated = (p->flags & PF_KTHREAD); + lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) @@ -1671,8 +1695,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || - !cpu_online(cpu))) - cpu = select_fallback_rq(task_cpu(p), p); + !cpu_online(cpu)) || + (cpu_isolated(cpu) && !allow_isolated)) + cpu = select_fallback_rq(task_cpu(p), p, allow_isolated); return cpu; } @@ -2956,7 +2981,7 @@ void sched_exec(void) if (dest_cpu == smp_processor_id()) goto unlock; - if (likely(cpu_active(dest_cpu))) { + if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) { struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -3066,6 +3091,8 @@ void scheduler_tick(void) if (curr->sched_class == &fair_sched_class) check_for_migration(rq, curr); + + core_ctl_check(wallclock); } #ifdef CONFIG_NO_HZ_FULL @@ -5414,18 +5441,22 @@ static struct task_struct fake_task = { }; /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Migrate all tasks (not pinned if pinned argument say so) from the rq, + * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq(). * * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; + unsigned int num_pinned_kthreads = 1; /* this thread */ + cpumask_t avail_cpus; + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); /* * Fudge the rq selection such that the below task selection loop @@ -5447,10 +5478,12 @@ static void migrate_tasks(struct rq *dead_rq) for (;;) { /* - * There's this thread running, bail when that's the only - * remaining thread. + * There's this thread running + pinned threads, bail when + * that's the only remaining threads. */ - if (rq->nr_running == 1) + if ((migrate_pinned_tasks && rq->nr_running == 1) || + (!migrate_pinned_tasks && + rq->nr_running == num_pinned_kthreads)) break; /* @@ -5461,6 +5494,13 @@ static void migrate_tasks(struct rq *dead_rq) BUG_ON(!next); next->sched_class->put_prev_task(rq, next); + if (!migrate_pinned_tasks && next->flags & PF_KTHREAD && + !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) { + lockdep_unpin_lock(&rq->lock); + num_pinned_kthreads += 1; + continue; + } + /* * Rules for changing task_struct::cpus_allowed are holding * both pi_lock and rq->lock, such that holding either @@ -5486,7 +5526,7 @@ static void migrate_tasks(struct rq *dead_rq) } /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); + dest_cpu = select_fallback_rq(dead_rq->cpu, next, false); rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { @@ -5502,6 +5542,222 @@ static void migrate_tasks(struct rq *dead_rq) rq->stop = stop; } + +static void set_rq_online(struct rq *rq); +static void set_rq_offline(struct rq *rq); + +int do_isolation_work_cpu_stop(void *data) +{ + unsigned long flags; + unsigned int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + watchdog_disable(cpu); + + irq_migrate_all_off_this_cpu(); + + sched_ttwu_pending(); + /* Update our root-domain */ + raw_spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + + migrate_tasks(rq, false); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + /* + * We might have been in tickless state. Clear NOHZ flags to avoid + * us being kicked for helping out with balancing + */ + nohz_balance_clear_nohz_mask(cpu); + return 0; +} + +int do_unisolation_work_cpu_stop(void *data) +{ + watchdog_enable(smp_processor_id()); + return 0; +} + +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd); + +static void sched_update_group_capacities(int cpu) +{ + struct sched_domain *sd; + + mutex_lock(&sched_domains_mutex); + rcu_read_lock(); + + for_each_domain(cpu, sd) { + int balance_cpu = group_balance_cpu(sd->groups); + + init_sched_groups_capacity(cpu, sd); + /* + * Need to ensure this is also called with balancing + * cpu. + */ + if (cpu != balance_cpu) + init_sched_groups_capacity(balance_cpu, sd); + } + + rcu_read_unlock(); + mutex_unlock(&sched_domains_mutex); +} + +static unsigned int cpu_isolation_vote[NR_CPUS]; + +int sched_isolate_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + + if (include_offline) { + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, cpu_isolated_mask); + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_and(&count_mask, mask, cpu_isolated_mask); + } + + return cpumask_weight(&count_mask); +} + +/* + * 1) CPU is isolated and cpu is offlined: + * Unisolate the core. + * 2) CPU is not isolated and CPU is offlined: + * No action taken. + * 3) CPU is offline and request to isolate + * Request ignored. + * 4) CPU is offline and isolated: + * Not a possible state. + * 5) CPU is online and request to isolate + * Normal case: Isolate the CPU + * 6) CPU is not isolated and comes back online + * Nothing to do + * + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_isolate_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + cpumask_t avail_cpus; + int ret_code = 0; + u64 start_time; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + lock_device_hotplug(); + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); + + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + ret_code = -EINVAL; + goto out; + } + + if (!cpu_online(cpu)) { + ret_code = -EINVAL; + goto out; + } + + if (++cpu_isolation_vote[cpu] > 1) + goto out; + + set_cpu_isolated(cpu, true); + cpumask_clear_cpu(cpu, &avail_cpus); + + /* Migrate timers */ + smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); + smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + + migrate_sync_cpu(cpu, cpumask_first(&avail_cpus)); + stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + + clear_hmp_request(cpu); + calc_load_migrate(rq); + update_max_interval(); + sched_update_group_capacities(cpu); + +out: + unlock_device_hotplug(); + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 1); + return ret_code; +} + +/* + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_unisolate_cpu_unlocked(int cpu) +{ + int ret_code = 0; + struct rq *rq = cpu_rq(cpu); + u64 start_time; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + lock_device_hotplug_assert(); + + if (!cpu_isolation_vote[cpu]) { + ret_code = -EINVAL; + goto out; + } + + if (--cpu_isolation_vote[cpu]) + goto out; + + if (cpu_online(cpu)) { + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + rq->age_stamp = sched_clock_cpu(cpu); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + set_cpu_isolated(cpu, false); + update_max_interval(); + sched_update_group_capacities(cpu); + + if (cpu_online(cpu)) { + stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0); + + /* Kick CPU to immediately do load balancing */ + if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + smp_send_reschedule(cpu); + } + +out: + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 0); + return ret_code; +} + +int sched_unisolate_cpu(int cpu) +{ + int ret_code; + + lock_device_hotplug(); + ret_code = sched_unisolate_cpu_unlocked(cpu); + unlock_device_hotplug(); + return ret_code; +} + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5748,13 +6004,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); - migrate_sync_cpu(cpu); + migrate_sync_cpu(cpu, smp_processor_id()); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, true); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6509,11 +6765,14 @@ build_sched_groups(struct sched_domain *sd, int cpu) static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; + cpumask_t avail_mask; WARN_ON(!sg); do { - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + cpumask_andnot(&avail_mask, sched_group_cpus(sg), + cpu_isolated_mask); + sg->group_weight = cpumask_weight(&avail_mask); sg = sg->next; } while (sg != sd->groups); diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 000000000000..d81886da7ca2 --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1055 @@ +/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/kthread.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> + +#include <trace/events/sched.h> + +#define MAX_CPUS_PER_CLUSTER 4 +#define MAX_CLUSTERS 2 + +struct cluster_data { + bool inited; + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int active_cpus; + unsigned int num_cpus; + cpumask_t cpu_mask; + unsigned int need_cpus; + unsigned int task_thres; + s64 last_isolate_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool is_big_cluster; + int nrrun; + bool nrrun_changed; + struct task_struct *core_ctl_thread; + unsigned int first_cpu; + bool boost; + struct kobject kobj; +}; + +struct cpu_data { + bool online; + bool is_busy; + unsigned int busy; + unsigned int cpu; + bool not_preferred; + struct cluster_data *cluster; + struct list_head sib; + bool isolated_by_us; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static struct cluster_data cluster_state[MAX_CLUSTERS]; +static unsigned int num_clusters; + +#define for_each_cluster(cluster, idx) \ + for ((cluster) = &cluster_state[idx]; (idx) < num_clusters;\ + (idx)++, (cluster) = &cluster_state[idx]) + +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cluster_data *state); +static void wake_up_core_ctl_thread(struct cluster_data *state); +static bool initialized; + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_min_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_max_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static ssize_t show_task_thres(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(const struct cluster_data *state, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_is_big_cluster(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->is_big_cluster = val ? 1 : 0; + return count; +} + +static ssize_t show_is_big_cluster(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster); +} + +static ssize_t show_cpus(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry(c, &state->lru, sib) { + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u (%s)\n", c->cpu, + c->online ? "Online" : "Offline"); + } + spin_unlock_irqrestore(&state_lock, flags); + return count; +} + +static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_active_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->active_cpus); +} + +static ssize_t show_global_state(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + struct cluster_data *cluster; + ssize_t count = 0; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + c = &per_cpu(cpu_state, cpu); + if (!c->cluster) + continue; + + cluster = c->cluster; + if (!cluster || !cluster->inited) + continue; + + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU: %u\n", c->cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", c->online); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tActive: %u\n", + !cpu_isolated(c->cpu)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", + cluster->first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", cluster->nrrun); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", cluster->need_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBoost: %u\n", (unsigned int) cluster->boost); + } + + return count; +} + +static ssize_t store_not_preferred(struct cluster_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i; + unsigned int val[MAX_CPUS_PER_CLUSTER]; + unsigned long flags; + int ret; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + i = 0; + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry(c, &state->lru, sib) + c->not_preferred = val[i++]; + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + +static ssize_t show_not_preferred(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry(c, &state->lru, sib) + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU:%d %u\n", c->cpu, c->not_preferred); + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(const struct cluster_data *, char *); + ssize_t (*store)(struct cluster_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_rw(offline_delay_ms); +core_ctl_attr_rw(busy_up_thres); +core_ctl_attr_rw(busy_down_thres); +core_ctl_attr_rw(task_thres); +core_ctl_attr_rw(is_big_cluster); +core_ctl_attr_ro(cpus); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(active_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(not_preferred); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &offline_delay_ms.attr, + &busy_up_thres.attr, + &busy_down_thres.attr, + &task_thres.attr, + &is_big_cluster.attr, + &cpus.attr, + &need_cpus.attr, + &active_cpus.attr, + &global_state.attr, + ¬_preferred.attr, + NULL +}; + +#define to_cluster_data(k) container_of(k, struct cluster_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +#define RQ_AVG_TOLERANCE 2 +#define RQ_AVG_DEFAULT_MS 20 +#define NR_RUNNING_TOLERANCE 5 +static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS; + +static s64 rq_avg_timestamp_ms; + +static void update_running_avg(bool trigger_update) +{ + int avg, iowait_avg, big_avg, old_nrrun; + s64 now; + unsigned long flags; + struct cluster_data *cluster; + unsigned int index = 0; + + spin_lock_irqsave(&state_lock, flags); + + now = ktime_to_ms(ktime_get()); + if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) { + spin_unlock_irqrestore(&state_lock, flags); + return; + } + rq_avg_timestamp_ms = now; + sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg); + + spin_unlock_irqrestore(&state_lock, flags); + + /* + * Round up to the next integer if the average nr running tasks + * is within NR_RUNNING_TOLERANCE/100 of the next integer. + * If normal rounding up is used, it will allow a transient task + * to trigger online event. By the time core is onlined, the task + * has finished. + * Rounding to closest suffers same problem because scheduler + * might only provide running stats per jiffy, and a transient + * task could skew the number for one jiffy. If core control + * samples every 2 jiffies, it will observe 0.5 additional running + * average which rounds up to 1 task. + */ + avg = (avg + NR_RUNNING_TOLERANCE) / 100; + big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100; + + for_each_cluster(cluster, index) { + if (!cluster->inited) + continue; + old_nrrun = cluster->nrrun; + /* + * Big cluster only need to take care of big tasks, but if + * there are not enough big cores, big tasks need to be run + * on little as well. Thus for little's runqueue stat, it + * has to use overall runqueue average, or derive what big + * tasks would have to be run on little. The latter approach + * is not easy to get given core control reacts much slower + * than scheduler, and can't predict scheduler's behavior. + */ + cluster->nrrun = cluster->is_big_cluster ? big_avg : avg; + if (cluster->nrrun != old_nrrun) { + if (trigger_update) + apply_need(cluster); + else + cluster->nrrun_changed = true; + } + } + return; +} + +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(const struct cluster_data *cluster, + unsigned int new_need) +{ + /* unisolate all cores if there are enough tasks */ + if (cluster->nrrun >= cluster->task_thres) + return cluster->num_cpus; + + /* only unisolate more cores if there are tasks to run */ + if (cluster->nrrun > new_need) + return new_need + 1; + + return new_need; +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(const struct cluster_data *cluster, + unsigned int need_cpus) +{ + return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus); +} + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster) +{ + return cluster->num_cpus - + sched_isolate_count(&cluster->cpu_mask, true); +} + +static bool is_active(const struct cpu_data *state) +{ + return state->online && !cpu_isolated(state->cpu); +} + +static bool adjustment_possible(const struct cluster_data *cluster, + unsigned int need) +{ + return (need < cluster->active_cpus || (need > cluster->active_cpus && + sched_isolate_count(&cluster->cpu_mask, false))); +} + +static bool eval_need(struct cluster_data *cluster) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + unsigned int active_cpus; + unsigned int new_need; + + if (unlikely(!cluster->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + + if (cluster->boost) { + need_cpus = cluster->max_cpus; + } else { + active_cpus = get_active_cpu_count(cluster); + thres_idx = active_cpus ? active_cpus - 1 : 0; + list_for_each_entry(c, &cluster->lru, sib) { + if (c->busy >= cluster->busy_up_thres[thres_idx]) + c->is_busy = true; + else if (c->busy < cluster->busy_down_thres[thres_idx]) + c->is_busy = false; + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(cluster, need_cpus); + } + new_need = apply_limits(cluster, need_cpus); + need_flag = adjustment_possible(cluster, new_need); + + last_need = cluster->need_cpus; + cluster->need_cpus = new_need; + + if (!need_flag) { + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + if (need_cpus > cluster->active_cpus) { + ret = 1; + } else if (need_cpus < cluster->active_cpus) { + s64 now = ktime_to_ms(ktime_get()); + s64 elapsed = now - cluster->last_isolate_ts; + + ret = elapsed >= cluster->offline_delay_ms; + } + + trace_core_ctl_eval_need(cluster->first_cpu, last_need, need_cpus, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cluster_data *cluster) +{ + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); +} + +static int core_ctl_set_busy(unsigned int cpu, unsigned int busy) +{ + struct cpu_data *c = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = c->cluster; + unsigned int old_is_busy = c->is_busy; + + if (!cluster || !cluster->inited) + return 0; + + update_running_avg(false); + if (c->busy == busy && !cluster->nrrun_changed) + return 0; + c->busy = busy; + cluster->nrrun_changed = false; + + apply_need(cluster); + trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy); + return 0; +} + +/* ========================= core count enforcement ==================== */ + +static void wake_up_core_ctl_thread(struct cluster_data *cluster) +{ + unsigned long flags; + + spin_lock_irqsave(&cluster->pending_lock, flags); + cluster->pending = true; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + wake_up_process_no_notif(cluster->core_ctl_thread); +} + +static u64 core_ctl_check_timestamp; +static u64 core_ctl_check_interval; + +static bool do_check(u64 wallclock) +{ + bool do_check = false; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + if ((wallclock - core_ctl_check_timestamp) >= core_ctl_check_interval) { + core_ctl_check_timestamp = wallclock; + do_check = true; + } + spin_unlock_irqrestore(&state_lock, flags); + return do_check; +} + +void core_ctl_set_boost(bool boost) +{ + unsigned int index = 0; + struct cluster_data *cluster; + + for_each_cluster(cluster, index) { + if (cluster->is_big_cluster && cluster->boost != boost) { + cluster->boost = boost; + apply_need(cluster); + } + } +} + +void core_ctl_check(u64 wallclock) +{ + if (unlikely(!initialized)) + return; + + if (do_check(wallclock)) { + unsigned int index = 0; + struct cluster_data *cluster; + + update_running_avg(true); + + for_each_cluster(cluster, index) { + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); + } + } +} + +static void move_cpu_lru(struct cpu_data *cpu_data) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_del(&cpu_data->sib); + list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru); + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_isolate(struct cluster_data *cluster, unsigned int need) +{ + struct cpu_data *c, *tmp; + + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!is_active(c)) + continue; + if (cluster->active_cpus == need) + break; + /* Don't offline busy CPUs. */ + if (c->is_busy) + continue; + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + cluster->last_isolate_ts = ktime_to_ms(ktime_get()); + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + } + + /* + * If the number of active CPUs is within the limits, then + * don't force isolation of any busy CPUs. + */ + if (cluster->active_cpus <= cluster->max_cpus) + return; + + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!is_active(c)) + continue; + if (cluster->active_cpus <= cluster->max_cpus) + break; + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + cluster->last_isolate_ts = ktime_to_ms(ktime_get()); + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + } +} + +static void __try_to_unisolate(struct cluster_data *cluster, + unsigned int need, bool force) +{ + struct cpu_data *c, *tmp; + + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!c->isolated_by_us) + continue; + if ((c->online && !cpu_isolated(c->cpu)) || + (!force && c->not_preferred)) + continue; + if (cluster->active_cpus == need) + break; + + pr_debug("Trying to unisolate CPU%u\n", c->cpu); + if (!sched_unisolate_cpu(c->cpu)) { + c->isolated_by_us = false; + move_cpu_lru(c); + } else { + pr_debug("Unable to unisolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + } +} + +static void try_to_unisolate(struct cluster_data *cluster, unsigned int need) +{ + bool force_use_non_preferred = false; + + __try_to_unisolate(cluster, need, force_use_non_preferred); + + if (cluster->active_cpus == need) + return; + + force_use_non_preferred = true; + __try_to_unisolate(cluster, need, force_use_non_preferred); +} + +static void __ref do_core_ctl(struct cluster_data *cluster) +{ + unsigned int need; + + need = apply_limits(cluster, cluster->need_cpus); + + if (adjustment_possible(cluster, need)) { + pr_debug("Trying to adjust group %u from %u to %u\n", + cluster->first_cpu, cluster->active_cpus, need); + + if (cluster->active_cpus > need) + try_to_isolate(cluster, need); + else if (cluster->active_cpus < need) + try_to_unisolate(cluster, need); + } +} + +static int __ref try_core_ctl(void *data) +{ + struct cluster_data *cluster = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&cluster->pending_lock, flags); + if (!cluster->pending) { + spin_unlock_irqrestore(&cluster->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&cluster->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + cluster->pending = false; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + do_core_ctl(cluster); + } + + return 0; +} + +static int __ref cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + uint32_t cpu = (uintptr_t)hcpu; + struct cpu_data *state = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = state->cluster; + unsigned int need; + int ret = NOTIFY_OK; + + /* Don't affect suspend resume */ + if (action & CPU_TASKS_FROZEN) + return NOTIFY_OK; + + if (unlikely(!cluster || !cluster->inited)) + return NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + + /* If online state of CPU somehow got out of sync, fix it. */ + if (state->online) { + state->online = false; + cluster->active_cpus = get_active_cpu_count(cluster); + pr_warn("CPU%d offline when state is online\n", cpu); + } + break; + + case CPU_ONLINE: + + state->online = true; + cluster->active_cpus = get_active_cpu_count(cluster); + + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + move_cpu_lru(state); + break; + + case CPU_DEAD: + /* + * We don't want to have a CPU both offline and isolated. + * So unisolate a CPU that went down if it was isolated by us. + */ + if (state->isolated_by_us) { + sched_unisolate_cpu_unlocked(cpu); + state->isolated_by_us = false; + } + + /* Move a CPU to the end of the LRU when it goes offline. */ + move_cpu_lru(state); + + /* Fall through */ + + case CPU_UP_CANCELED: + + /* If online state of CPU somehow got out of sync, fix it. */ + if (!state->online) + pr_warn("CPU%d online when state is offline\n", cpu); + + state->online = false; + state->busy = 0; + cluster->active_cpus = get_active_cpu_count(cluster); + break; + } + + need = apply_limits(cluster, cluster->need_cpus); + if (adjustment_possible(cluster, need)) + wake_up_core_ctl_thread(cluster); + + return ret; +} + +static struct notifier_block __refdata cpu_notifier = { + .notifier_call = cpu_callback, +}; + +/* ============================ init code ============================== */ + +static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu) +{ + unsigned int i; + + for (i = 0; i < num_clusters; ++i) { + if (cluster_state[i].first_cpu == first_cpu) + return &cluster_state[i]; + } + + return NULL; +} + +static int cluster_init(const struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cluster_data *cluster; + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + if (find_cluster_by_first_cpu(first_cpu)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + if (num_clusters == MAX_CLUSTERS) { + pr_err("Unsupported number of clusters. Only %u supported\n", + MAX_CLUSTERS); + return -EINVAL; + } + cluster = &cluster_state[num_clusters]; + ++num_clusters; + + cpumask_copy(&cluster->cpu_mask, mask); + cluster->num_cpus = cpumask_weight(mask); + if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + cluster->first_cpu = first_cpu; + cluster->min_cpus = 1; + cluster->max_cpus = cluster->num_cpus; + cluster->need_cpus = cluster->num_cpus; + cluster->offline_delay_ms = 100; + cluster->task_thres = UINT_MAX; + cluster->nrrun = cluster->num_cpus; + INIT_LIST_HEAD(&cluster->lru); + spin_lock_init(&cluster->pending_lock); + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cluster = cluster; + state->cpu = cpu; + if (cpu_online(cpu)) + state->online = true; + list_add_tail(&state->sib, &cluster->lru); + } + cluster->active_cpus = get_active_cpu_count(cluster); + + cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster, + "core_ctl/%d", first_cpu); + if (IS_ERR(cluster->core_ctl_thread)) + return PTR_ERR(cluster->core_ctl_thread); + + sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO, + ¶m); + + cluster->inited = true; + + kobject_init(&cluster->kobj, &ktype_core_ctl); + return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl"); +} + +static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + int ret; + + switch (val) { + case CPUFREQ_CREATE_POLICY: + ret = cluster_init(policy->related_cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n", ret); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_pol_nb = { + .notifier_call = cpufreq_policy_cb, +}; + +static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_govinfo *info = data; + + switch (val) { + case CPUFREQ_LOAD_CHANGE: + core_ctl_set_busy(info->cpu, info->load); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_gov_nb = { + .notifier_call = cpufreq_gov_cb, +}; + +static int __init core_ctl_init(void) +{ + unsigned int cpu; + + core_ctl_check_interval = (rq_avg_period_ms - RQ_AVG_TOLERANCE) + * NSEC_PER_MSEC; + + register_cpu_notifier(&cpu_notifier); + cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER); + cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER); + + lock_device_hotplug(); + for_each_online_cpu(cpu) { + struct cpufreq_policy *policy; + int ret; + + policy = cpufreq_cpu_get(cpu); + if (policy) { + ret = cluster_init(policy->related_cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n" + , ret); + cpufreq_cpu_put(policy); + } + } + unlock_device_hotplug(); + initialized = true; + return 0; +} + +late_initcall(core_ctl_init); diff --git a/kernel/sched/core_ctl.h b/kernel/sched/core_ctl.h new file mode 100644 index 000000000000..3b0c12acb9c0 --- /dev/null +++ b/kernel/sched/core_ctl.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +#ifdef CONFIG_SCHED_CORE_CTL +void core_ctl_check(u64 wallclock); +void core_ctl_set_boost(bool boost); +#else +static inline void core_ctl_check(u64 wallclock) {} +static inline void core_ctl_set_boost(bool boost) {} +#endif +#endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e893b0fcac6b..83da13b5f6b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2941,6 +2941,8 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, struct cpumask search_cpus; cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus); + cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask); + if (env->ignore_prev_cpu) cpumask_clear_cpu(env->prev_cpu, &search_cpus); @@ -3009,7 +3011,8 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) prev_cpu = env->prev_cpu; if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) || - unlikely(!cpu_active(prev_cpu))) + unlikely(!cpu_active(prev_cpu)) || + cpu_isolated(prev_cpu)) return false; if (task->ravg.mark_start - task->last_cpu_selected_ts >= @@ -7354,6 +7357,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); + if (cpumask_test_cpu(cpu, cpu_isolated_mask)) + continue; /* * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the @@ -7381,7 +7386,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + cpumask_t *cpus = sched_group_cpus(group); + + /* Revisit this later. This won't work for MT domain */ + if (!cpu_isolated(cpumask_first(cpus))) + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } @@ -7521,6 +7530,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, power_cost(i, 0), cpu_temp(i)); + if (cpu_isolated(i)) + continue; + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -7548,17 +7560,27 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + sgs->group_no_capacity = 1; + sgs->group_type = group_other; + sgs->group_weight = group->group_weight; + } else { + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / + sgs->group_capacity; - if (sgs->sum_nr_running) - sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->group_weight = group->group_weight; - sgs->group_weight = group->group_weight; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(group, sgs, env); + } - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs, env); + if (sgs->sum_nr_running) + sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; } #ifdef CONFIG_SCHED_HMP @@ -8601,6 +8623,9 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + idle_enter_fair(this_rq); /* @@ -8908,16 +8933,21 @@ static void nohz_balancer_kick(int type) return; } +void nohz_balance_clear_nohz_mask(int cpu) +{ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } +} + static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* * Completely isolated CPUs don't ever set, so we must test. */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + nohz_balance_clear_nohz_mask(cpu); clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -8974,7 +9004,7 @@ void nohz_balance_enter_idle(int cpu) /* * If we're a completely isolated CPU, we don't play. */ - if (on_null_domain(cpu_rq(cpu))) + if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu)) return; cpumask_set_cpu(cpu, nohz.idle_cpus_mask); @@ -9003,7 +9033,13 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + cpumask_t avail_mask; + unsigned int available_cpus; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -9342,8 +9378,10 @@ void trigger_load_balance(struct rq *rq) { int type = NOHZ_KICK_ANY; - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. + */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 5002619961ce..6e1757aa1541 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -19,6 +19,7 @@ #include <linux/syscore_ops.h> #include "sched.h" +#include "core_ctl.h" #include <trace/events/sched.h> @@ -1090,6 +1091,8 @@ int sched_set_boost(int enable) if (!old_refcount && boost_refcount) boost_kick_cpus(); + if (boost_refcount <= 1) + core_ctl_set_boost(boost_refcount == 1); trace_sched_set_boost(boost_refcount); spin_unlock_irqrestore(&boost_lock, flags); @@ -1499,28 +1502,10 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write, if (write && (old_val == *data)) goto done; - /* - * Special handling for sched_freq_aggregate_threshold_pct - * which can be greater than 100. Use 1000 as an upper bound - * value which works for all practical use cases. - */ - if (data == &sysctl_sched_freq_aggregate_threshold_pct) { - if (*data > 1000) { - *data = old_val; - ret = -EINVAL; - goto done; - } - } else if (data != &sysctl_sched_select_prev_cpu_us) { - /* - * all tunables other than sched_select_prev_cpu_us are - * in percentage. - */ - if (sysctl_sched_downmigrate_pct > - sysctl_sched_upmigrate_pct || *data > 100) { - *data = old_val; - ret = -EINVAL; - goto done; - } + if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) { + *data = old_val; + ret = -EINVAL; + goto done; } /* @@ -2828,10 +2813,10 @@ void set_window_start(struct rq *rq) rq->curr->ravg.mark_start = rq->window_start; } -void migrate_sync_cpu(int cpu) +void migrate_sync_cpu(int cpu, int new_cpu) { if (cpu == sync_cpu) - sync_cpu = smp_processor_id(); + sync_cpu = new_cpu; } static void reset_all_task_stats(void) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index cfec881491ef..ba4403e910d8 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -265,8 +265,12 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { - /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + /* + * Try to pull RT tasks here if we lower this rq's prio and cpu is not + * isolated + */ + return rq->rt.highest_prio.curr > prev->prio && + !cpu_isolated(cpu_of(rq)); } static inline int rt_overloaded(struct rq *rq) @@ -1694,6 +1698,8 @@ static int find_lowest_rq_hmp(struct task_struct *task) for_each_sched_cluster(cluster) { cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); + cpumask_andnot(&candidate_mask, &candidate_mask, + cpu_isolated_mask); if (cpumask_empty(&candidate_mask)) continue; @@ -2282,7 +2288,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || + cpu_isolated(cpu_of(rq))) return; queue_pull_task(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec7721112b05..41abb4dabeb7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1069,7 +1069,7 @@ extern void clear_boost_kick(int cpu); extern void clear_hmp_request(int cpu); extern void mark_task_starting(struct task_struct *p); extern void set_window_start(struct rq *rq); -extern void migrate_sync_cpu(int cpu); +extern void migrate_sync_cpu(int cpu, int new_cpu); extern void update_cluster_topology(void); extern void set_task_last_wake(struct task_struct *p, u64 wallclock); extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); @@ -1424,7 +1424,7 @@ static inline void clear_boost_kick(int cpu) { } static inline void clear_hmp_request(int cpu) { } static inline void mark_task_starting(struct task_struct *p) { } static inline void set_window_start(struct rq *rq) { } -static inline void migrate_sync_cpu(int cpu) { } +static inline void migrate_sync_cpu(int cpu, int new_cpu) {} static inline void update_cluster_topology(void) { } static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) { } static inline void set_task_last_switch_out(struct task_struct *p, @@ -1953,6 +1953,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); +extern void nohz_balance_clear_nohz_mask(int cpu); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); diff --git a/kernel/smp.c b/kernel/smp.c index abdc48cd79a3..b2ec21c5c9d6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -766,8 +766,8 @@ void wake_up_all_idle_cpus(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; - - wake_up_if_idle(cpu); + if (!cpu_isolated(cpu)) + wake_up_if_idle(cpu); } preempt_enable(); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 07fef40d1274..dad3324e7372 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -130,6 +130,9 @@ static int one_hundred = 100; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif +#ifdef CONFIG_SCHED_HMP +static int one_thousand = 1000; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -336,6 +339,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_spill_nr_run", @@ -351,6 +356,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_downmigrate", @@ -358,6 +365,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_init_task_load", @@ -365,6 +374,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_select_prev_cpu_us", @@ -372,6 +383,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, }, { .procname = "sched_enable_colocation", @@ -397,6 +409,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_big_waker_task_load", @@ -404,6 +418,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { .procname = "sched_enable_thread_grouping", @@ -440,6 +456,13 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + /* + * Special handling for sched_freq_aggregate_threshold_pct + * which can be greater than 100. Use 1000 as an upper bound + * value which works for all practical use cases. + */ + .extra2 = &one_thousand, }, { .procname = "sched_boost", diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa909f9fd559..1b0117198a08 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -880,7 +880,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + timer->state |= HRTIMER_STATE_ENQUEUED; return timerqueue_add(&base->active, &timer->node); } @@ -900,11 +900,9 @@ static void __remove_hrtimer(struct hrtimer *timer, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; - timer->state = newstate; - if (!(state & HRTIMER_STATE_ENQUEUED)) - return; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); @@ -921,6 +919,13 @@ static void __remove_hrtimer(struct hrtimer *timer, if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); #endif + +out: + /* + * We need to preserve PINNED state here, otherwise we may end up + * migrating pinned hrtimers as well. + */ + timer->state = newstate | (timer->state & HRTIMER_STATE_PINNED); } /* @@ -949,6 +954,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest state = HRTIMER_STATE_INACTIVE; __remove_hrtimer(timer, base, state, reprogram); + timer->state &= ~HRTIMER_STATE_PINNED; return 1; } return 0; @@ -1002,6 +1008,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, timer_stats_hrtimer_set_start_info(timer); + /* Update pinned state */ + timer->state &= ~HRTIMER_STATE_PINNED; + timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT; + leftmost = enqueue_hrtimer(timer, new_base); if (!leftmost) goto unlock; @@ -1176,8 +1186,8 @@ bool hrtimer_active(const struct hrtimer *timer) cpu_base = READ_ONCE(timer->base->cpu_base); seq = raw_read_seqcount_begin(&cpu_base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || - cpu_base->running == timer) + if (((timer->state & ~HRTIMER_STATE_PINNED) != + HRTIMER_STATE_INACTIVE) || cpu_base->running == timer) return true; } while (read_seqcount_retry(&cpu_base->seq, seq) || @@ -1614,13 +1624,17 @@ static void init_hrtimers_cpu(int cpu) hrtimer_init_hres(cpu_base); } -#ifdef CONFIG_HOTPLUG_CPU - +#if defined(CONFIG_HOTPLUG_CPU) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) + struct hrtimer_clock_base *new_base, + bool remove_pinned) { struct hrtimer *timer; struct timerqueue_node *node; + struct timerqueue_head pinned; + int is_pinned; + + timerqueue_init_head(&pinned); while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); @@ -1633,6 +1647,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + + is_pinned = timer->state & HRTIMER_STATE_PINNED; + if (!remove_pinned && is_pinned) { + timerqueue_add(&pinned, &timer->node); + continue; + } + timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -1644,17 +1665,23 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, */ enqueue_hrtimer(timer, new_base); } + + /* Re-queue pinned timers for non-hotplug usecase */ + while ((node = timerqueue_getnext(&pinned))) { + timer = container_of(node, struct hrtimer, node); + + timerqueue_del(&pinned, &timer->node); + enqueue_hrtimer(timer, old_base); + } } -static void migrate_hrtimers(int scpu) +static void __migrate_hrtimers(int scpu, bool remove_pinned) { struct hrtimer_cpu_base *old_base, *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); - - local_irq_disable(); + local_irq_save(flags); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); /* @@ -1666,7 +1693,7 @@ static void migrate_hrtimers(int scpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + &new_base->clock_base[i], remove_pinned); } raw_spin_unlock(&old_base->lock); @@ -1674,7 +1701,20 @@ static void migrate_hrtimers(int scpu) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); +} + +static void migrate_hrtimers(int scpu) +{ + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + __migrate_hrtimers(scpu, true); +} + +void hrtimer_quiesce_cpu(void *cpup) +{ + __migrate_hrtimers(*(int *)cpup, false); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 51896272fcde..0efb3916f5a4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1620,56 +1620,86 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -#ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +#if defined(CONFIG_HOTPLUG_CPU) +static void migrate_timer_list(struct tvec_base *new_base, + struct hlist_head *head, bool remove_pinned) { struct timer_list *timer; int cpu = new_base->cpu; + struct hlist_node *n; + int is_pinned; - while (!hlist_empty(head)) { - timer = hlist_entry(head->first, struct timer_list, entry); - /* We ignore the accounting on the dying cpu */ - detach_timer(timer, false); + hlist_for_each_entry_safe(timer, n, head, entry) { + is_pinned = timer->flags & TIMER_PINNED_ON_CPU; + if (!remove_pinned && is_pinned) + continue; + + detach_if_pending(timer, get_timer_base(timer->flags), false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } -static void migrate_timers(int cpu) +static void __migrate_timers(int cpu, bool wait, bool remove_pinned) { struct tvec_base *old_base; struct tvec_base *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(cpu)); old_base = per_cpu_ptr(&tvec_bases, cpu); new_base = get_cpu_ptr(&tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock_irq(&new_base->lock); + spin_lock_irqsave(&new_base->lock, flags); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - BUG_ON(old_base->running_timer); + if (wait) { + /* Ensure timers are done running before continuing */ + while (old_base->running_timer) { + spin_unlock(&old_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); + cpu_relax(); + spin_lock_irqsave(&new_base->lock, flags); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + } + } else { + BUG_ON(old_base->running_timer); + } for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); + migrate_timer_list(new_base, old_base->tv1.vec + i, + remove_pinned); for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); + migrate_timer_list(new_base, old_base->tv2.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv3.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv4.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv5.vec + i, + remove_pinned); } - old_base->active_timers = 0; - old_base->all_timers = 0; - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); put_cpu_ptr(&tvec_bases); } +/* Migrate timers from 'cpu' to this_cpu */ +static void migrate_timers(int cpu) +{ + BUG_ON(cpu_online(cpu)); + __migrate_timers(cpu, false, true); +} + +void timer_quiesce_cpu(void *cpup) +{ + __migrate_timers(*(int *)cpup, true, false); +} + static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9270e1ac6460..49fa2e6eea98 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,5 +15,3 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); -EXPORT_TRACEPOINT_SYMBOL(core_ctl_set_busy); -EXPORT_TRACEPOINT_SYMBOL(core_ctl_eval_need); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 029da92fb712..7f21591c8ec5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -13,6 +13,7 @@ #include <linux/mm.h> #include <linux/cpu.h> +#include <linux/device.h> #include <linux/nmi.h> #include <linux/init.h> #include <linux/module.h> @@ -95,6 +96,7 @@ static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(unsigned int, watchdog_en); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); @@ -586,9 +588,17 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) sched_setscheduler(current, policy, ¶m); } -static void watchdog_enable(unsigned int cpu) +/* Must be called with hotplug lock (lock_device_hotplug()) held. */ +void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = raw_cpu_ptr(&watchdog_en); + + lock_device_hotplug_assert(); + + if (*enabled) + return; + *enabled = 1; /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -606,9 +616,17 @@ static void watchdog_enable(unsigned int cpu) __touch_watchdog(); } -static void watchdog_disable(unsigned int cpu) +/* Must be called with hotplug lock (lock_device_hotplug()) held. */ +void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = raw_cpu_ptr(&watchdog_en); + + lock_device_hotplug_assert(); + + if (!*enabled) + return; + *enabled = 0; watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); |
