| author | Linux Build Service Account <lnxbuild@localhost> | 2016-11-19 05:39:11 -0700 |
|---|---|---|
| committer | Linux Build Service Account <lnxbuild@localhost> | 2016-11-19 05:39:11 -0700 |
| commit | 3c45c2a8a2a07a76e2d129d02561d061211e70c8 | (patch) |
| tree | d577a760af70325db6038640d89264ad05b14ca2 | /kernel |
| parent | 9adece3859ce8e9723b43dfc722bcc1bafdcfb2e | (diff) |
| parent | 599e3b86154aa2d3d367e6326b0ffc6e4f76d020 | (diff) |
Promotion of kernel.lnx.4.4-161119.
CRs Change ID Subject
--------------------------------------------------------------------------------------------------------------
1088658 I2f994ae0250ffc8f740ea633324815ae429c74be msm: ipa3: linearize large skbs
1077102 I09359b528b4742f72a76690930f3d0ed90bb2caa msm: mdss: move warnings and errors out of mdss spinlock
1089895 I84185558fa6e80b13d7d0078bda9d75143680941 tcp: take care of truncations done by sk_filter()
1091511 Ia151b2dd5229f07790ac961af298305b24e098fb msm: wlan: update regulatory database
1081957 I24820bd6254002f8a8db9604d230dcbce59b1beb clk: qcom: Add support to be able to slew PLL
1081738 I10a788726358c56df9bfe11f2332e3823d7cd332 ARM: dts: msm: Enable auto GM for WLED in pmicobalt
1077726 I031ca48f0e0c39f1b2cb51081ecd55b086fb4c9b msm: mdss: fix pp timeout during transition from LP1 to
1074985 Ib2268181a617c23d62b5b6f857be5327113b2a67 soc: qcom: smem: Redesign smem memory architecture
1090708 I9cda84d1c199b72ce8b9e2997601bcc7430ddbf3 ARM: dts: msm: Update the console uart gpios for msmfalc
1080245 I3b4cf83e776750d993d53331142223109bf0862e clk: qcom: Add support for debugfs support
1087110 I3694952289c76394af8d40cd89fd2175f49ac127 msm: mdss: Add systrace for readptr_done
1089865 Ia73ab1ba51df7b501d246bb45141018409496d01 ARM: dts: msm: ensure contiguous MSI for PCIe on msmcoba
941978 Idee8691d769218d7e732c9b7f936a2c40946b239 Revert "scsi: ufs: stub UFS shutdown handler"
1091072 I7e9ada5de1f619c6a34a4b2e1764f5e908564ce5 iio: rradc: Update reading USBIN_V channel
1075082 I971e555ec8d02ccf4382e83132a696b065a8ff12 qseecom: improve error checks in qseecom_probe()
1080245 Ib67b3a3409c9e7d8adb710bb524f54f543abf712 clk: add/modify debugfs support for clocks
941978 Id499abc27303bfed72fab4d61abb872bad7d9043 scsi: ufs: error out all issued requests after shutdown
1083537 I73fc02b812f2e6694e2a6aa8bdad2381a5f19406 ASoC: msm: Fix sound card registration failure
1085331 I92e98ab46107fbcfd843898423b41716a204c2ae ARM: dts: msm: Correct interrupt assignments for msmcoba
1073250 Idc9ca896b3fe6c1c6a72a066a6e453d27a3173e8 Asoc: clean up bootup errors
1091147 I30b8488a1c19815601e6a1c5bcbdeed53715f8fa usb: phy: qusb: Make sure QUSB PHY is into proper state
1086292 I6482dc3d21fdc3e570fd53022e2fb9427668d939 msm: mdss: add null check before dereferencing src_fmt
1086292 I4812330453dedacd16dad1d920a2bacc3f67042b msm: mdss: fix race condition in dsi clk off request
1088709 I21e1c029e6b245cfa26a187b35bb1f6845302484 clk: msm: Add the CLKFLAG_NO_RATE_CACHE flag for MM cloc
1082112 I171c91e700c24ecc213ccda705bbe6188d22a43a scsi: ufs: fix sleep in atomic context
1091354 I9f928f0aad6af346de43965755beb039e422047a Revert "defconfig: msm: avoid compilation of MDSS DP dri
1090727 I78d2c27743d30b90a96e3d8df60859f67db7ddb8 ARM: dts: msm: Add ufs regulators for msmfalcon interpos
1090029 I66f6de42b106fa2027285e7393b6f9fc143d00d8 leds: qpnp-flash: Fix the mask in the flash prepare API
1089181 I4a382915a6c3a6b9d445ec1f5d57fb499a011f1a driver: thermal: msm_thermal: Enable Reliability algorit
1079438 Ib14c5b9121190dded5071ff60ecf0be8e5e5c232 ARM: dts: msm: Add physical dimensions for NT35597 panel
1060212 Iabe79bae5f9471c3c6128ed21efd04de00739daa leds: qpnp-flash-v2: Add support for thermal derate feat
1091127 I7220ad565212c325514301e4c59415b807deb99a ARM: dts: msm: Add gladiator support on msmfalcon and ms
1091440 I0eb8b9a357f172984612175d1b03dd872df91b6f diag: Call diagmem_exit only if the mempool is initializ
1090076 Ia85688854f26fe871d5c1253c2d51d75d84deb8f ARM: dts: msm: Add dummy regulator for LCDB bias
1064071 Ic0dedbad372fd9029b932dd99633a650049751ed msm: kgsl: Fix pagetable member of struct kgsl_memdesc
1083537 I3d2765535793d6ef9153cfcab4b44a9adad67e15 ASoC: msm: Add support for USB/WCN/TDM Audio
1091141 I6ce48512df5973bf8a2a3081a3a6f8759aeb499f ARM: dts: msm: Set USB core clock rate for USB2/USB3 for
1060212 Ie7a94f59e58b8f1b0816afda2496449694629205 leds: qpnp-flash-v2: add support to read pmic revid
1080701 If08ff46e72d537254e90707f28c849a86f262853 ARM: dts: msm: specify I2C configuration for msmfalcon
1079442 I822d6280b301b2db6194c845098c935e612ca61c ASoC: wcd934x: Fix adie loopback through sidetone src pa
1089895 Idc52737bc96097a9220dfe47bb76e94ff1026a05 rose: limit sk_filter trim to payload
1091147 Ibfecfe1846d02b959bd249acac3fe4c57b88aaf0 USB: phy: qusb: Turn on vdd along with 1p8/3p3 LDOs when
1090701 I0e06be169edc2eb1d35ef7fc6c41ff1809aebd03 pinctrl: qcom: msmfalcon: Update gpios as per latest gpi
1086292 I422d53d008223a9b0520f499e629f681bb6afa05 mdss: mdp: avoid panic if recovery handler is uninitiali
1060212 I42503ccd2b2dcc62c5c868132d202b9698c9d216 leds: qpnp-flash-v2: change from dev_*() to pr_*() for l
1090076 Ie828c8568ef09c89cff157d16d3cb322647b6f6e ARM: dts: msm: enable mdss power supplies for falcon tra
1074879 I8d224a70cbef162f27078b62b73acaa22670861d sched/hmp: Enhance co-location and scheduler boost featu
1087471 I15323e3ef91401142d3841db59c18fd8fee753fd sched: Remove thread group iteration from colocation
1085170 Ie23d473302d7fbda9b243a150e5c52d025007e4f usb: pd: Stop processing SVDM if handler found
1091540 I61523188f45daca026b90943c845b43a8327f51e qcom-charger: smb2: Disable try.SINK mode in the probe
1081738 Iee99e9d1b999c84ece075d2f17e9cdf6aef9a2ac leds: qpnp-wled: Add support to configure AUTO_GM settin
1081922 I9aa7a000e75b50c6b26970deaba2131c87087b8c msm: mdss: fix autorefresh disable during handoff
1075694 I9cf2f94892bdeb83fab0068902419b1603520364 msm: kgsl: preserve ISENSE registers across GPU power co
1085321 1085649 I3c9422f3a790c0c1633ab64d4213a088faaeb9e5 diag: Set the diag write buffers to busy state on channe
1090311 I96cdcb9e3642906b4afa08d9bde07e123d9b3977 USB: Allow skipping device resume during system resume
1074879 I470bcd0588e038b4a540d337fe6a412f2fa74920 sched: revise boost logic when boost_type is SCHED_BOOST
1087020 I6f9b7a630158355a7f920dcf9cfffe537b1c6a85 ASoC: msm: q6dspv2: fix potentional information leak
1089062 Icb04f6175b66fa46405e77d10fddf06b0051ee5f phy: qcom-ufs: update ufs phy 1-lane settings
1082590 I4cdcbd31b5fa5ceac0eea7c743ea9286f231b80b scsi: ufs: handle LINERESET during hibern8
1081738 I964b3452d0cdb3618b4ab446655ae75fa3a1049d leds: qpnp-wled: Add support to configure auto PFM for p
1080245 I936496e553bc958c10e743fd8a225ffc7fbc0f79 clk: Add support to allow client to print all enabled cl
1079373 Ifd7b2b88e7ab4c952b743fede6e24795069d653a qcom-charger: WA for legacy bit set on hard reboot
1090518 I7f1c0d9d84607821893a1e5d17934dae5acef5f4 clk: qcom: Add support for RCGs with dynamic and fixed s
1089865 I1e74f1b03c3e15880efdac7ff07aca2f628de99d ARM: dts: msm: enable QGIC MSI for PCIe on msmcobalt
1088059 I66cbe48b7f4910228a6af57610a8427fea7fd1f2 msm: mdss: fix incorrect mutex unlocking during NOTIFY_U
1087418 Ia3fb69dca00654dacd8d1faae34715e40e097480 scsi: ufs: enable auto hibern8 only after device initial
1088216 I326eceeddff8e77d346c3365fa46cd539324451f ARM: dts: msm: Add support for USB device for msmfalcon
1060212 Iafb7915e196a18b5f8076dda8fb06a4bd71a8e6e leds: qpnp-flash-v2: Add support for configuring OTST th
1086372 Ia03380dfa4852c80fedb38f3c79f55d8d1a9a7f6 icnss: Reset mpm_wcssaon_config bits before top level re
1080245 I0a202af6f46c7cf164036d65487db5c40aab4063 clk: Add support for list_rates ops for clocks
1091477 I7435f05f20e12a7704ae5d9597b5cdc9b5a61d00 qcom-charger: Change usb_icl votable for PD vote
1089062 Ief5df61d91fbd765c595533b3380a602a2540e5e scsi: ufs-qcom: update clock scaling sequence
1085217 I62de66e9b0bb1eeeac3c94d1ac1037285811b631 msm: ipa3: header file change for wdi-stats
1080674 I15ef73049cee76c6ea5b3916d9281bbd9fdfc563 ARM: dts: msm: specify UART configuration on msmfalcon.
1090525 I48c50bc320425c0db40cd4865e05c6b7a7fb5da3 msm: sde: remove secure camera ctrl_id definition
1061507 Iad71abbed72aa40b5c839260f5c297a885f7d128 ASoC: wcd-mbhc: correct cross connection check
1085064 Ib53902459646e590df4dc7fcb00f833d5e8f41ed usb: pd: Don't suspend charging unless changing voltages
1064071 Ic0dedbad661143977a226d50263c26b5af579ce3 msm: kgsl: Make sure USE_CPU_MAP + MAP_USER_MEM work tog
1090862 987021 I0d1797a4df9ff67f3b162a1b5d26320ca989f54a msm: mdss: hide additional kernel addresses from unprivi
Change-Id: Ic6272ada932975c2562cb87d4a617520002db3d3
CRs-Fixed: 1082112, 1075694, 1091440, 1085331, 1089062, 1081922, 1089895, 1077726, 1090029, 1061507, 1091354, 1074879, 987021, 1086292, 1085217, 1087020, 1080245, 1088709, 1089181, 1085064, 1087471, 1088059, 1080674, 1090862, 1079442, 1087418, 1090727, 1085649, 1064071, 1081738, 1086372, 941978, 1090518, 1090708, 1077102, 1090076, 1085321, 1091477, 1090701, 1090311, 1091511, 1091141, 1074985, 1079438, 1091147, 1075082, 1091127, 1087110, 1082590, 1081957, 1090525, 1085170, 1088658, 1080701, 1083537, 1091540, 1088216, 1079373, 1060212, 1073250, 1089865, 1091072
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/sched/Makefile | 2 |
| -rw-r--r-- | kernel/sched/boost.c | 226 |
| -rw-r--r-- | kernel/sched/core.c | 3 |
| -rw-r--r-- | kernel/sched/fair.c | 153 |
| -rw-r--r-- | kernel/sched/hmp.c | 512 |
| -rw-r--r-- | kernel/sched/rt.c | 12 |
| -rw-r--r-- | kernel/sched/sched.h | 47 |
| -rw-r--r-- | kernel/sched/tune.c | 184 |
| -rw-r--r-- | kernel/sysctl.c | 19 |
9 files changed, 783 insertions, 375 deletions
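The bulk of this promotion moves scheduler boost handling into the new kernel/sched/boost.c and replaces the old on/off refcount with four boost types (NO_BOOST, FULL_THROTTLE_BOOST, CONSERVATIVE_BOOST, RESTRAINED_BOOST). Below is a minimal sketch of how a kernel-side caller might drive that interface, assuming only what the diff that follows adds (sched_set_boost() and the *_BOOST values); the caller itself and its error handling are illustrative, not part of the change.

```c
/*
 * Illustrative only -- not part of this commit. sched_set_boost() and the
 * *_BOOST values come from the diff below (kernel/sched/boost.c and
 * kernel/sched/sched.h); this caller is a hypothetical client.
 */
#include "sched.h"

static int run_boosted_window(void)
{
	int ret;

	/*
	 * Boost can only be toggled on or off: going straight from one
	 * boost type to another is rejected with -EINVAL, as is any call
	 * while HMP scheduling is disabled.
	 */
	ret = sched_set_boost(CONSERVATIVE_BOOST);
	if (ret)
		return ret;

	/* ... performance-critical window ... */

	/* The entity that enabled boost is responsible for clearing it. */
	return sched_set_boost(NO_BOOST);
}
```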
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7d0d34c53e08..7c0382a3eace 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,7 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o -obj-$(CONFIG_SCHED_HMP) += hmp.o +obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/boost.c b/kernel/sched/boost.c new file mode 100644 index 000000000000..fcfda385b74a --- /dev/null +++ b/kernel/sched/boost.c @@ -0,0 +1,226 @@ +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "sched.h" +#include <linux/of.h> +#include <linux/sched/core_ctl.h> +#include <trace/events/sched.h> + +/* + * Scheduler boost is a mechanism to temporarily place tasks on CPUs + * with higher capacity than those where a task would have normally + * ended up with their load characteristics. Any entity enabling + * boost is responsible for disabling it as well. + */ + +unsigned int sysctl_sched_boost; +static enum sched_boost_policy boost_policy; +static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE; +static DEFINE_MUTEX(boost_mutex); +static unsigned int freq_aggr_threshold_backup; + +static inline void boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) + smp_send_reschedule(cpu); +} + +static void boost_kick_cpus(void) +{ + int i; + struct cpumask kick_mask; + + if (boost_policy != SCHED_BOOST_ON_BIG) + return; + + cpumask_andnot(&kick_mask, cpu_online_mask, cpu_isolated_mask); + + for_each_cpu(i, &kick_mask) { + if (cpu_capacity(i) != max_capacity) + boost_kick(i); + } +} + +int got_boost_kick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + return test_bit(BOOST_KICK, &rq->hmp_flags); +} + +void clear_boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(BOOST_KICK, &rq->hmp_flags); +} + +/* + * Scheduler boost type and boost policy might at first seem unrelated, + * however, there exists a connection between them that will allow us + * to use them interchangeably during placement decisions. We'll explain + * the connection here in one possible way so that the implications are + * clear when looking at placement policies. + * + * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED + * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can + * neither be none nor RESTRAINED. 
+ */ +static void set_boost_policy(int type) +{ + if (type == SCHED_BOOST_NONE || type == RESTRAINED_BOOST) { + boost_policy = SCHED_BOOST_NONE; + return; + } + + if (boost_policy_dt) { + boost_policy = boost_policy_dt; + return; + } + + if (min_possible_efficiency != max_possible_efficiency) { + boost_policy = SCHED_BOOST_ON_BIG; + return; + } + + boost_policy = SCHED_BOOST_ON_ALL; +} + +enum sched_boost_policy sched_boost_policy(void) +{ + return boost_policy; +} + +static bool verify_boost_params(int old_val, int new_val) +{ + /* + * Boost can only be turned on or off. There is no possiblity of + * switching from one boost type to another or to set the same + * kind of boost several times. + */ + return !(!!old_val == !!new_val); +} + +static void _sched_set_boost(int old_val, int type) +{ + switch (type) { + case NO_BOOST: + if (old_val == FULL_THROTTLE_BOOST) + core_ctl_set_boost(false); + else if (old_val == CONSERVATIVE_BOOST) + restore_cgroup_boost_settings(); + else + update_freq_aggregate_threshold( + freq_aggr_threshold_backup); + break; + + case FULL_THROTTLE_BOOST: + core_ctl_set_boost(true); + boost_kick_cpus(); + break; + + case CONSERVATIVE_BOOST: + update_cgroup_boost_settings(); + boost_kick_cpus(); + break; + + case RESTRAINED_BOOST: + freq_aggr_threshold_backup = + update_freq_aggregate_threshold(1); + break; + + default: + WARN_ON(1); + return; + } + + set_boost_policy(type); + sysctl_sched_boost = type; + trace_sched_set_boost(type); +} + +void sched_boost_parse_dt(void) +{ + struct device_node *sn; + const char *boost_policy; + + if (!sched_enable_hmp) + return; + + sn = of_find_node_by_path("/sched-hmp"); + if (!sn) + return; + + if (!of_property_read_string(sn, "boost-policy", &boost_policy)) { + if (!strcmp(boost_policy, "boost-on-big")) + boost_policy_dt = SCHED_BOOST_ON_BIG; + else if (!strcmp(boost_policy, "boost-on-all")) + boost_policy_dt = SCHED_BOOST_ON_ALL; + } +} + +int sched_set_boost(int type) +{ + int ret = 0; + + if (!sched_enable_hmp) + return -EINVAL; + + mutex_lock(&boost_mutex); + + if (verify_boost_params(sysctl_sched_boost, type)) + _sched_set_boost(sysctl_sched_boost, type); + else + ret = -EINVAL; + + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + unsigned int old_val; + + if (!sched_enable_hmp) + return -EINVAL; + + mutex_lock(&boost_mutex); + + old_val = *data; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + goto done; + + if (verify_boost_params(old_val, *data)) { + _sched_set_boost(old_val, *data); + } else { + *data = old_val; + ret = -EINVAL; + } + +done: + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost(void) +{ + return sysctl_sched_boost; +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 84563da000cf..a5d101e8a5f2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7846,7 +7846,6 @@ void __init sched_init_smp(void) hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); update_cluster_topology(); - init_sched_hmp_boost_policy(); init_hrtick(); @@ -7895,7 +7894,7 @@ void __init sched_init(void) BUG_ON(num_possible_cpus() > BITS_PER_LONG); - sched_hmp_parse_dt(); + sched_boost_parse_dt(); init_clusters(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1674b1054f83..3db77aff2433 100644 --- a/kernel/sched/fair.c +++ 
b/kernel/sched/fair.c @@ -2596,6 +2596,7 @@ static u32 __compute_runnable_contrib(u64 n) #define SBC_FLAG_COLOC_CLUSTER 0x10000 #define SBC_FLAG_WAKER_CLUSTER 0x20000 #define SBC_FLAG_BACKUP_CLUSTER 0x40000 +#define SBC_FLAG_BOOST_CLUSTER 0x80000 struct cpu_select_env { struct task_struct *p; @@ -2605,7 +2606,7 @@ struct cpu_select_env { u8 need_waker_cluster:1; u8 sync:1; u8 ignore_prev_cpu:1; - enum sched_boost_type boost_type; + enum sched_boost_policy boost_policy; int prev_cpu; DECLARE_BITMAP(candidate_list, NR_CPUS); DECLARE_BITMAP(backup_list, NR_CPUS); @@ -2705,10 +2706,38 @@ select_least_power_cluster(struct cpu_select_env *env) struct sched_cluster *cluster; if (env->rtg) { - env->task_load = scale_load_to_cpu(task_load(env->p), - cluster_first_cpu(env->rtg->preferred_cluster)); - env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; - return env->rtg->preferred_cluster; + int cpu = cluster_first_cpu(env->rtg->preferred_cluster); + + env->task_load = scale_load_to_cpu(task_load(env->p), cpu); + + if (task_load_will_fit(env->p, env->task_load, + cpu, env->boost_policy)) { + env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; + + if (env->boost_policy == SCHED_BOOST_NONE) + return env->rtg->preferred_cluster; + + for_each_sched_cluster(cluster) { + if (cluster != env->rtg->preferred_cluster) { + __set_bit(cluster->id, + env->backup_list); + __clear_bit(cluster->id, + env->candidate_list); + } + } + + return env->rtg->preferred_cluster; + } + + /* + * Since the task load does not fit on the preferred + * cluster anymore, pretend that the task does not + * have any preferred cluster. This allows the waking + * task to get the appropriate CPU it needs as per the + * non co-location placement policy without having to + * wait until the preferred cluster is updated. + */ + env->rtg = NULL; } for_each_sched_cluster(cluster) { @@ -2718,7 +2747,7 @@ select_least_power_cluster(struct cpu_select_env *env) env->task_load = scale_load_to_cpu(task_load(env->p), cpu); if (task_load_will_fit(env->p, env->task_load, cpu, - env->boost_type)) + env->boost_policy)) return cluster; __set_bit(cluster->id, env->backup_list); @@ -2961,7 +2990,14 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, update_spare_capacity(stats, env, i, c->capacity, env->cpu_load); - if (env->boost_type == SCHED_BOOST_ON_ALL || + /* + * need_idle takes precedence over sched boost but when both + * are set, idlest CPU with in all the clusters is selected + * when boost_policy = BOOST_ON_ALL whereas idlest CPU in the + * big cluster is selected within boost_policy = BOOST_ON_BIG. 
+ */ + if ((!env->need_idle && + env->boost_policy != SCHED_BOOST_NONE) || env->need_waker_cluster || sched_cpu_high_irqload(i) || spill_threshold_crossed(env, cpu_rq(i))) @@ -3005,7 +3041,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) struct task_struct *task = env->p; struct sched_cluster *cluster; - if (env->boost_type != SCHED_BOOST_NONE || env->reason || + if (env->boost_policy != SCHED_BOOST_NONE || env->reason || !task->ravg.mark_start || env->need_idle || !sched_short_sleep_task_threshold) return false; @@ -3034,7 +3070,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) cluster = cpu_rq(prev_cpu)->cluster; if (!task_load_will_fit(task, env->task_load, prev_cpu, - sched_boost_type())) { + sched_boost_policy())) { __set_bit(cluster->id, env->backup_list); __clear_bit(cluster->id, env->candidate_list); @@ -3056,7 +3092,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) static inline bool wake_to_waker_cluster(struct cpu_select_env *env) { - return env->boost_type == SCHED_BOOST_NONE && + return env->boost_policy == SCHED_BOOST_NONE && !env->need_idle && !env->reason && env->sync && task_load(current) > sched_big_waker_task_load && task_load(env->p) < sched_small_wakee_task_load; @@ -3098,7 +3134,6 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, .reason = reason, .need_idle = wake_to_idle(p), .need_waker_cluster = 0, - .boost_type = sched_boost_type(), .sync = sync, .prev_cpu = target, .ignore_prev_cpu = 0, @@ -3107,6 +3142,9 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, .sbc_best_cluster_flag = 0, }; + env.boost_policy = task_sched_boost(p) ? + sched_boost_policy() : SCHED_BOOST_NONE; + bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); bitmap_zero(env.backup_list, NR_CPUS); @@ -3178,12 +3216,23 @@ retry: sbc_flag |= env.sbc_best_flag; target = stats.best_cpu; } else { - if (env.rtg) { + if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) { env.rtg = NULL; goto retry; } - find_backup_cluster(&env, &stats); + /* + * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with + * backup_list = little cluster, candidate_list = none and + * stats->best_capacity_cpu points the best spare capacity + * CPU among the CPUs in the big cluster. 
+ */ + if (env.boost_policy == SCHED_BOOST_ON_BIG && + stats.best_capacity_cpu >= 0) + sbc_flag |= SBC_FLAG_BOOST_CLUSTER; + else + find_backup_cluster(&env, &stats); + if (stats.best_capacity_cpu >= 0) { target = stats.best_capacity_cpu; sbc_flag |= SBC_FLAG_BEST_CAP_CPU; @@ -3193,8 +3242,8 @@ retry: out: sbc_flag |= env.sbc_best_cluster_flag; rcu_read_unlock(); - trace_sched_task_load(p, sched_boost(), env.reason, env.sync, - env.need_idle, sbc_flag, target); + trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p), + env.reason, env.sync, env.need_idle, sbc_flag, target); return target; } @@ -3402,11 +3451,9 @@ static inline int migration_needed(struct task_struct *p, int cpu) if (task_will_be_throttled(p)) return 0; - if (sched_boost_type() == SCHED_BOOST_ON_BIG) { - if (cpu_capacity(cpu) != max_capacity) - return UP_MIGRATION; - return 0; - } + if (sched_boost_policy() == SCHED_BOOST_ON_BIG && + cpu_capacity(cpu) != max_capacity && task_sched_boost(p)) + return UP_MIGRATION; if (sched_cpu_high_irqload(cpu)) return IRQLOAD_MIGRATION; @@ -3420,7 +3467,7 @@ static inline int migration_needed(struct task_struct *p, int cpu) return DOWN_MIGRATION; } - if (!grp && !task_will_fit(p, cpu)) { + if (!task_will_fit(p, cpu)) { rcu_read_unlock(); return UP_MIGRATION; } @@ -6648,10 +6695,7 @@ enum fbq_type { regular, remote, all }; #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 -#define LBF_SCHED_BOOST_ACTIVE_BALANCE 0x40 #define LBF_BIG_TASK_ACTIVE_BALANCE 0x80 -#define LBF_HMP_ACTIVE_BALANCE (LBF_SCHED_BOOST_ACTIVE_BALANCE | \ - LBF_BIG_TASK_ACTIVE_BALANCE) #define LBF_IGNORE_BIG_TASKS 0x100 #define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200 #define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400 @@ -6682,6 +6726,7 @@ struct lb_env { enum fbq_type fbq_type; struct list_head tasks; + enum sched_boost_policy boost_policy; }; /* @@ -6826,9 +6871,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu) && - nr_big_tasks(env->src_rq) && !is_big_task(p)) - return 0; + if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) { + if (nr_big_tasks(env->src_rq) && !is_big_task(p)) + return 0; + + if (env->boost_policy == SCHED_BOOST_ON_BIG && + !task_sched_boost(p)) + return 0; + } twf = task_will_fit(p, env->dst_cpu); @@ -6951,8 +7001,7 @@ static int detach_tasks(struct lb_env *env) if (env->imbalance <= 0) return 0; - if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) && - !sched_boost()) + if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu)) env->flags |= LBF_IGNORE_BIG_TASKS; else if (!same_cluster(env->dst_cpu, env->src_cpu)) env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; @@ -7255,8 +7304,10 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) int local_capacity, busiest_capacity; int local_pwr_cost, busiest_pwr_cost; int nr_cpus; + int boost = sched_boost(); - if (!sysctl_sched_restrict_cluster_spill || sched_boost()) + if (!sysctl_sched_restrict_cluster_spill || + boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST) return 0; local_cpu = group_first_cpu(sds->local); @@ -7628,11 +7679,6 @@ static bool update_sd_pick_busiest_active_balance(struct lb_env *env, { if (env->idle != CPU_NOT_IDLE && cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) { - if (sched_boost() && !sds->busiest && sgs->sum_nr_running) { - env->flags |= 
LBF_SCHED_BOOST_ACTIVE_BALANCE; - return true; - } - if (sgs->sum_nr_big_tasks > sds->busiest_stat.sum_nr_big_tasks) { env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE; @@ -8045,7 +8091,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - if (env->flags & LBF_HMP_ACTIVE_BALANCE) + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) goto force_balance; if (bail_inter_cluster_balance(env, &sds)) @@ -8257,7 +8303,7 @@ static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; - if (env->flags & LBF_HMP_ACTIVE_BALANCE) + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) return 1; if (env->idle == CPU_NEWLY_IDLE) { @@ -8348,20 +8394,21 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { - .sd = sd, - .dst_cpu = this_cpu, - .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), - .idle = idle, - .loop_break = sched_nr_migrate_break, - .cpus = cpus, - .fbq_type = all, - .tasks = LIST_HEAD_INIT(env.tasks), - .imbalance = 0, - .flags = 0, - .loop = 0, + .sd = sd, + .dst_cpu = this_cpu, + .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), + .idle = idle, + .loop_break = sched_nr_migrate_break, + .cpus = cpus, + .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), + .imbalance = 0, + .flags = 0, + .loop = 0, .busiest_nr_running = 0, .busiest_grp_capacity = 0, + .boost_policy = sched_boost_policy(), }; /* @@ -8510,7 +8557,7 @@ more_balance: no_move: if (!ld_moved) { - if (!(env.flags & LBF_HMP_ACTIVE_BALANCE)) + if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) schedstat_inc(sd, lb_failed[idle]); /* @@ -8520,7 +8567,7 @@ no_move: * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE && - !(env.flags & LBF_HMP_ACTIVE_BALANCE)) + !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -8797,6 +8844,7 @@ static int active_load_balance_cpu_stop(void *data) .busiest_grp_capacity = 0, .flags = 0, .loop = 0, + .boost_policy = sched_boost_policy(), }; bool moved = false; @@ -9272,7 +9320,8 @@ static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) if (rq->nr_running < 2) return 0; - if (!sysctl_sched_restrict_cluster_spill || sched_boost()) + if (!sysctl_sched_restrict_cluster_spill || + sched_boost_policy() == SCHED_BOOST_ON_ALL) return 1; if (cpu_max_power_cost(cpu) == max_power_cost) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 30391aae0822..968a41e0e81e 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -17,8 +17,6 @@ #include <linux/cpufreq.h> #include <linux/list_sort.h> #include <linux/syscore_ops.h> -#include <linux/of.h> -#include <linux/sched/core_ctl.h> #include "sched.h" @@ -231,52 +229,6 @@ fail: return ret; } -/* - * It is possible that CPUs of the same micro architecture can have slight - * difference in the efficiency due to other factors like cache size. The - * BOOST_ON_BIG policy may not be optimial for such systems. The required - * boost policy can be specified via device tree to handle this. - */ -static int __read_mostly sched_boost_policy = SCHED_BOOST_NONE; - -/* - * This should be called after clusters are populated and - * the respective efficiency values are initialized. - */ -void init_sched_hmp_boost_policy(void) -{ - /* - * Initialize the boost type here if it is not passed from - * device tree. 
- */ - if (sched_boost_policy == SCHED_BOOST_NONE) { - if (max_possible_efficiency != min_possible_efficiency) - sched_boost_policy = SCHED_BOOST_ON_BIG; - else - sched_boost_policy = SCHED_BOOST_ON_ALL; - } -} - -void sched_hmp_parse_dt(void) -{ - struct device_node *sn; - const char *boost_policy; - - if (!sched_enable_hmp) - return; - - sn = of_find_node_by_path("/sched-hmp"); - if (!sn) - return; - - if (!of_property_read_string(sn, "boost-policy", &boost_policy)) { - if (!strcmp(boost_policy, "boost-on-big")) - sched_boost_policy = SCHED_BOOST_ON_BIG; - else if (!strcmp(boost_policy, "boost-on-all")) - sched_boost_policy = SCHED_BOOST_ON_ALL; - } -} - unsigned int max_possible_efficiency = 1; unsigned int min_possible_efficiency = UINT_MAX; @@ -680,29 +632,6 @@ int __init set_sched_enable_hmp(char *str) early_param("sched_enable_hmp", set_sched_enable_hmp); -int got_boost_kick(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - return test_bit(BOOST_KICK, &rq->hmp_flags); -} - -inline void clear_boost_kick(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - clear_bit(BOOST_KICK, &rq->hmp_flags); -} - -inline void boost_kick(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) - smp_send_reschedule(cpu); -} - /* Clear any HMP scheduler related requests pending from or on cpu */ void clear_hmp_request(int cpu) { @@ -840,6 +769,9 @@ min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ /* Window size (in ns) */ __read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; +/* Maximum allowed threshold before freq aggregation must be enabled */ +#define MAX_FREQ_AGGR_THRESH 1000 + /* Temporarily disable window-stats activity on all cpus */ unsigned int __read_mostly sched_disable_window_stats; @@ -919,8 +851,8 @@ static const unsigned int top_tasks_bitmap_size = * C1 busy time = 5 + 5 + 6 = 16ms * */ -static __read_mostly unsigned int sched_freq_aggregate; -__read_mostly unsigned int sysctl_sched_freq_aggregate; +static __read_mostly unsigned int sched_freq_aggregate = 1; +__read_mostly unsigned int sysctl_sched_freq_aggregate = 1; unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct; static unsigned int __read_mostly sched_freq_aggregate_threshold; @@ -937,14 +869,6 @@ unsigned int max_task_load(void) /* Use this knob to turn on or off HMP-aware task placement logic */ unsigned int __read_mostly sched_enable_hmp; -/* - * Scheduler boost is a mechanism to temporarily place tasks on CPUs - * with higher capacity than those where a task would have normally - * ended up with their load characteristics. Any entity enabling - * boost is responsible for disabling it as well. - */ -unsigned int sysctl_sched_boost; - /* A cpu can no longer accommodate more tasks if: * * rq->nr_running > sysctl_sched_spill_nr_run || @@ -996,6 +920,21 @@ unsigned int __read_mostly sched_downmigrate; unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; /* + * Task groups whose aggregate demand on a cpu is more than + * sched_group_upmigrate need to be up-migrated if possible. + */ +unsigned int __read_mostly sched_group_upmigrate; +unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100; + +/* + * Task groups, once up-migrated, will need to drop their aggregate + * demand to less than sched_group_downmigrate before they are "down" + * migrated. 
+ */ +unsigned int __read_mostly sched_group_downmigrate; +unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95; + +/* * The load scale factor of a CPU gets boosted when its max frequency * is restricted due to which the tasks are migrating to higher capacity * CPUs early. The sched_upmigrate threshold is auto-upgraded by @@ -1017,33 +956,46 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; -void update_up_down_migrate(void) +static void +_update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate) { - unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); - unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); unsigned int delta; if (up_down_migrate_scale_factor == 1024) - goto done; + return; - delta = up_migrate - down_migrate; + delta = *up_migrate - *down_migrate; - up_migrate /= NSEC_PER_USEC; - up_migrate *= up_down_migrate_scale_factor; - up_migrate >>= 10; - up_migrate *= NSEC_PER_USEC; + *up_migrate /= NSEC_PER_USEC; + *up_migrate *= up_down_migrate_scale_factor; + *up_migrate >>= 10; + *up_migrate *= NSEC_PER_USEC; - up_migrate = min(up_migrate, sched_ravg_window); + *up_migrate = min(*up_migrate, sched_ravg_window); - down_migrate /= NSEC_PER_USEC; - down_migrate *= up_down_migrate_scale_factor; - down_migrate >>= 10; - down_migrate *= NSEC_PER_USEC; + *down_migrate /= NSEC_PER_USEC; + *down_migrate *= up_down_migrate_scale_factor; + *down_migrate >>= 10; + *down_migrate *= NSEC_PER_USEC; - down_migrate = min(down_migrate, up_migrate - delta); -done: + *down_migrate = min(*down_migrate, *up_migrate - delta); +} + +static void update_up_down_migrate(void) +{ + unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); + unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); + + _update_up_down_migrate(&up_migrate, &down_migrate); sched_upmigrate = up_migrate; sched_downmigrate = down_migrate; + + up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct); + down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct); + + _update_up_down_migrate(&up_migrate, &down_migrate); + sched_group_upmigrate = up_migrate; + sched_group_downmigrate = down_migrate; } void set_hmp_defaults(void) @@ -1134,82 +1086,6 @@ u64 cpu_load_sync(int cpu, int sync) return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); } -static int boost_refcount; -static DEFINE_SPINLOCK(boost_lock); -static DEFINE_MUTEX(boost_mutex); - -static void boost_kick_cpus(void) -{ - int i; - - for_each_online_cpu(i) { - if (cpu_capacity(i) != max_capacity) - boost_kick(i); - } -} - -int sched_boost(void) -{ - return boost_refcount > 0; -} - -int sched_set_boost(int enable) -{ - unsigned long flags; - int ret = 0; - int old_refcount; - - if (!sched_enable_hmp) - return -EINVAL; - - spin_lock_irqsave(&boost_lock, flags); - - old_refcount = boost_refcount; - - if (enable == 1) { - boost_refcount++; - } else if (!enable) { - if (boost_refcount >= 1) - boost_refcount--; - else - ret = -EINVAL; - } else { - ret = -EINVAL; - } - - if (!old_refcount && boost_refcount) - boost_kick_cpus(); - - if (boost_refcount <= 1) - core_ctl_set_boost(boost_refcount == 1); - trace_sched_set_boost(boost_refcount); - spin_unlock_irqrestore(&boost_lock, flags); - - return ret; -} - -int sched_boost_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - - mutex_lock(&boost_mutex); - if (!write) - sysctl_sched_boost = sched_boost(); - 
- ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write) - goto done; - - ret = (sysctl_sched_boost <= 1) ? - sched_set_boost(sysctl_sched_boost) : -EINVAL; - -done: - mutex_unlock(&boost_mutex); - return ret; -} - /* * Task will fit on a cpu if it's bandwidth consumption on that cpu * will be less than sched_upmigrate. A big task that was previously @@ -1219,60 +1095,63 @@ done: * tasks with load close to the upmigrate threshold */ int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, - enum sched_boost_type boost_type) + enum sched_boost_policy boost_policy) { - int upmigrate; + int upmigrate = sched_upmigrate; if (cpu_capacity(cpu) == max_capacity) return 1; - if (boost_type != SCHED_BOOST_ON_BIG) { + if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) + upmigrate = sched_downmigrate; + + if (boost_policy != SCHED_BOOST_ON_BIG) { if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) return 1; - upmigrate = sched_upmigrate; - if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) - upmigrate = sched_downmigrate; - if (task_load < upmigrate) return 1; + } else { + if (task_sched_boost(p) || task_load >= upmigrate) + return 0; + + return 1; } return 0; } -enum sched_boost_type sched_boost_type(void) -{ - if (sched_boost()) - return sched_boost_policy; - - return SCHED_BOOST_NONE; -} - int task_will_fit(struct task_struct *p, int cpu) { u64 tload = scale_load_to_cpu(task_load(p), cpu); - return task_load_will_fit(p, tload, cpu, sched_boost_type()); + return task_load_will_fit(p, tload, cpu, sched_boost_policy()); } -int group_will_fit(struct sched_cluster *cluster, - struct related_thread_group *grp, u64 demand) +static int +group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp, + u64 demand, bool group_boost) { int cpu = cluster_first_cpu(cluster); int prev_capacity = 0; - unsigned int threshold = sched_upmigrate; + unsigned int threshold = sched_group_upmigrate; u64 load; if (cluster->capacity == max_capacity) return 1; + if (group_boost) + return 0; + + if (!demand) + return 1; + if (grp->preferred_cluster) prev_capacity = grp->preferred_cluster->capacity; if (cluster->capacity < prev_capacity) - threshold = sched_downmigrate; + threshold = sched_group_downmigrate; load = scale_load_to_cpu(demand, cpu); if (load < threshold) @@ -1495,6 +1374,23 @@ void post_big_task_count_change(const struct cpumask *cpus) DEFINE_MUTEX(policy_mutex); +unsigned int update_freq_aggregate_threshold(unsigned int threshold) +{ + unsigned int old_threshold; + + mutex_lock(&policy_mutex); + + old_threshold = sysctl_sched_freq_aggregate_threshold_pct; + + sysctl_sched_freq_aggregate_threshold_pct = threshold; + sched_freq_aggregate_threshold = + pct_to_real(sysctl_sched_freq_aggregate_threshold_pct); + + mutex_unlock(&policy_mutex); + + return old_threshold; +} + static inline int invalid_value_freq_input(unsigned int *data) { if (data == &sysctl_sched_freq_aggregate) @@ -1578,7 +1474,9 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write, if (write && (old_val == *data)) goto done; - if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) { + if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct || + sysctl_sched_group_downmigrate_pct > + sysctl_sched_group_upmigrate_pct) { *data = old_val; ret = -EINVAL; goto done; @@ -3110,37 +3008,9 @@ static void reset_all_task_stats(void) { struct task_struct *g, *p; - read_lock(&tasklist_lock); do_each_thread(g, p) { - raw_spin_lock_irq(&p->pi_lock); 
reset_task_stats(p); - raw_spin_unlock_irq(&p->pi_lock); } while_each_thread(g, p); - read_unlock(&tasklist_lock); -} - -static void disable_window_stats(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - sched_disable_window_stats = 1; - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - - local_irq_restore(flags); -} - -/* Called with all cpu's rq->lock held */ -static void enable_window_stats(void) -{ - sched_disable_window_stats = 0; - } enum reset_reason_code { @@ -3166,17 +3036,22 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) unsigned int old = 0, new = 0; struct related_thread_group *grp; - read_lock(&related_thread_group_lock); - - disable_window_stats(); + local_irq_save(flags); - reset_all_task_stats(); + read_lock(&tasklist_lock); - local_irq_save(flags); + read_lock(&related_thread_group_lock); + /* Taking all runqueue locks prevents race with sched_exit(). */ for_each_possible_cpu(cpu) raw_spin_lock(&cpu_rq(cpu)->lock); + sched_disable_window_stats = 1; + + reset_all_task_stats(); + + read_unlock(&tasklist_lock); + list_for_each_entry(grp, &related_thread_groups, list) { int j; @@ -3196,7 +3071,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES; } - enable_window_stats(); + sched_disable_window_stats = 0; for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); @@ -3239,10 +3114,10 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) for_each_possible_cpu(cpu) raw_spin_unlock(&cpu_rq(cpu)->lock); - local_irq_restore(flags); - read_unlock(&related_thread_group_lock); + local_irq_restore(flags); + trace_sched_reset_all_window_stats(window_start, window_size, sched_ktime_clock() - start_ts, reason, old, new); } @@ -3824,13 +3699,13 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus) } /* Return cluster which can offer required capacity for group */ -static struct sched_cluster * -best_cluster(struct related_thread_group *grp, u64 total_demand) +static struct sched_cluster *best_cluster(struct related_thread_group *grp, + u64 total_demand, bool group_boost) { struct sched_cluster *cluster = NULL; for_each_sched_cluster(cluster) { - if (group_will_fit(cluster, grp, total_demand)) + if (group_will_fit(cluster, grp, total_demand, group_boost)) return cluster; } @@ -3841,6 +3716,9 @@ static void _set_preferred_cluster(struct related_thread_group *grp) { struct task_struct *p; u64 combined_demand = 0; + bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG; + bool group_boost = false; + u64 wallclock; if (!sysctl_sched_enable_colocation) { grp->last_update = sched_ktime_clock(); @@ -3848,31 +3726,43 @@ static void _set_preferred_cluster(struct related_thread_group *grp) return; } + if (list_empty(&grp->tasks)) + return; + + wallclock = sched_ktime_clock(); + /* * wakeup of two or more related tasks could race with each other and * could result in multiple calls to _set_preferred_cluster being issued * at same time. 
Avoid overhead in such cases of rechecking preferred * cluster */ - if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10) + if (wallclock - grp->last_update < sched_ravg_window / 10) return; - list_for_each_entry(p, &grp->tasks, grp_list) + list_for_each_entry(p, &grp->tasks, grp_list) { + if (boost_on_big && task_sched_boost(p)) { + group_boost = true; + break; + } + + if (p->ravg.mark_start < wallclock - + (sched_ravg_window * sched_ravg_hist_size)) + continue; + combined_demand += p->ravg.demand; - grp->preferred_cluster = best_cluster(grp, combined_demand); + } + + grp->preferred_cluster = best_cluster(grp, + combined_demand, group_boost); grp->last_update = sched_ktime_clock(); trace_sched_set_preferred_cluster(grp, combined_demand); } void set_preferred_cluster(struct related_thread_group *grp) { - /* - * Prevent possible deadlock with update_children(). Not updating - * the preferred cluster once is not a big deal. - */ - if (!raw_spin_trylock(&grp->lock)) - return; + raw_spin_lock(&grp->lock); _set_preferred_cluster(grp); raw_spin_unlock(&grp->lock); } @@ -3880,6 +3770,8 @@ void set_preferred_cluster(struct related_thread_group *grp) #define ADD_TASK 0 #define REM_TASK 1 +#define DEFAULT_CGROUP_COLOC_ID 1 + static inline void free_group_cputime(struct related_thread_group *grp) { free_percpu(grp->cpu_time); @@ -4116,64 +4008,19 @@ static void free_related_thread_group(struct rcu_head *rcu) kfree(grp); } -/* - * The thread group for a task can change while we are here. However, - * add_new_task_to_grp() will take care of any tasks that we miss here. - * When a parent exits, and a child thread is simultaneously exiting, - * sched_set_group_id() will synchronize those operations. - */ -static void update_children(struct task_struct *leader, - struct related_thread_group *grp, int event) -{ - struct task_struct *child; - struct rq *rq; - unsigned long flags; - - if (!thread_group_leader(leader)) - return; - - if (event == ADD_TASK && !sysctl_sched_enable_thread_grouping) - return; - - if (thread_group_empty(leader)) - return; - - child = next_thread(leader); - - do { - rq = task_rq_lock(child, &flags); - - if (event == REM_TASK && child->grp && grp == child->grp) { - transfer_busy_time(rq, grp, child, event); - list_del_init(&child->grp_list); - rcu_assign_pointer(child->grp, NULL); - } else if (event == ADD_TASK && !child->grp) { - transfer_busy_time(rq, grp, child, event); - list_add(&child->grp_list, &grp->tasks); - rcu_assign_pointer(child->grp, grp); - } - - task_rq_unlock(rq, child, &flags); - } while_each_thread(leader, child); - -} - static void remove_task_from_group(struct task_struct *p) { struct related_thread_group *grp = p->grp; struct rq *rq; int empty_group = 1; - unsigned long flags; raw_spin_lock(&grp->lock); - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); transfer_busy_time(rq, p->grp, p, REM_TASK); list_del_init(&p->grp_list); rcu_assign_pointer(p->grp, NULL); - task_rq_unlock(rq, p, &flags); - - update_children(p, grp, REM_TASK); + __task_rq_unlock(rq); if (!list_empty(&grp->tasks)) { empty_group = 0; @@ -4182,7 +4029,8 @@ static void remove_task_from_group(struct task_struct *p) raw_spin_unlock(&grp->lock); - if (empty_group) { + /* Reserved groups cannot be destroyed */ + if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) { list_del(&grp->list); call_rcu(&grp->rcu, free_related_thread_group); } @@ -4192,7 +4040,6 @@ static int add_task_to_group(struct task_struct *p, struct related_thread_group *grp) { struct rq *rq; - unsigned long 
flags; raw_spin_lock(&grp->lock); @@ -4200,13 +4047,11 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp) * Change p->grp under rq->lock. Will prevent races with read-side * reference of p->grp in various hot-paths */ - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); transfer_busy_time(rq, grp, p, ADD_TASK); list_add(&p->grp_list, &grp->tasks); rcu_assign_pointer(p->grp, grp); - task_rq_unlock(rq, p, &flags); - - update_children(p, grp, ADD_TASK); + __task_rq_unlock(rq); _set_preferred_cluster(grp); @@ -4219,23 +4064,33 @@ void add_new_task_to_grp(struct task_struct *new) { unsigned long flags; struct related_thread_group *grp; - struct task_struct *parent; + struct task_struct *leader = new->group_leader; + unsigned int leader_grp_id = sched_get_group_id(leader); - if (!sysctl_sched_enable_thread_grouping) + if (!sysctl_sched_enable_thread_grouping && + leader_grp_id != DEFAULT_CGROUP_COLOC_ID) return; if (thread_group_leader(new)) return; - parent = new->group_leader; + if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) { + if (!same_schedtune(new, leader)) + return; + } write_lock_irqsave(&related_thread_group_lock, flags); rcu_read_lock(); - grp = task_related_thread_group(parent); + grp = task_related_thread_group(leader); rcu_read_unlock(); - /* Its possible that update_children() already added us to the group */ + /* + * It's possible that someone already added the new task to the + * group. A leader's thread group is updated prior to calling + * this function. It's also possible that the leader has exited + * the group. In either case, there is nothing else to do. + */ if (!grp || new->grp) { write_unlock_irqrestore(&related_thread_group_lock, flags); return; @@ -4250,14 +4105,55 @@ void add_new_task_to_grp(struct task_struct *new) write_unlock_irqrestore(&related_thread_group_lock, flags); } +#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE) +/* + * We create a default colocation group at boot. There is no need to + * synchronize tasks between cgroups at creation time because the + * correct cgroup hierarchy is not available at boot. Therefore cgroup + * colocation is turned off by default even though the colocation group + * itself has been allocated. Furthermore this colocation group cannot + * be destroyted once it has been created. All of this has been as part + * of runtime optimizations. + * + * The job of synchronizing tasks to the colocation group is done when + * the colocation flag in the cgroup is turned on. + */ +static int __init create_default_coloc_group(void) +{ + struct related_thread_group *grp = NULL; + unsigned long flags; + + grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + if (IS_ERR(grp)) { + WARN_ON(1); + return -ENOMEM; + } + + write_lock_irqsave(&related_thread_group_lock, flags); + list_add(&grp->list, &related_thread_groups); + write_unlock_irqrestore(&related_thread_group_lock, flags); + + update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH); + return 0; +} +late_initcall(create_default_coloc_group); + +int sync_cgroup_colocation(struct task_struct *p, bool insert) +{ + unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0; + + return sched_set_group_id(p, grp_id); +} +#endif + int sched_set_group_id(struct task_struct *p, unsigned int group_id) { int rc = 0; unsigned long flags; struct related_thread_group *grp = NULL; - /* Prevents tasks from exiting while we are managing groups. 
*/ - write_lock_irqsave(&related_thread_group_lock, flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); + write_lock(&related_thread_group_lock); /* Switching from one group to another directly is not permitted */ if ((current != p && p->flags & PF_EXITING) || @@ -4272,6 +4168,12 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id) grp = lookup_related_thread_group(group_id); if (!grp) { + /* This is a reserved id */ + if (group_id == DEFAULT_CGROUP_COLOC_ID) { + rc = -EINVAL; + goto done; + } + grp = alloc_related_thread_group(group_id); if (IS_ERR(grp)) { rc = -ENOMEM; @@ -4281,10 +4183,10 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id) list_add(&grp->list, &related_thread_groups); } - BUG_ON(!grp); rc = add_task_to_group(p, grp); done: - write_unlock_irqrestore(&related_thread_group_lock, flags); + write_unlock(&related_thread_group_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); return rc; } @@ -4529,7 +4431,7 @@ bool early_detection_notify(struct rq *rq, u64 wallclock) struct task_struct *p; int loop_max = 10; - if (!sched_boost() || !rq->cfs.h_nr_running) + if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running) return 0; rq->ed_task = NULL; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ba4403e910d8..12a04f30ef77 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1677,8 +1677,13 @@ static int find_lowest_rq_hmp(struct task_struct *task) int prev_cpu = task_cpu(task); u64 cpu_load, min_load = ULLONG_MAX; int i; - int restrict_cluster = sched_boost() ? 0 : - sysctl_sched_restrict_cluster_spill; + int restrict_cluster; + int boost_on_big; + + boost_on_big = sched_boost() == FULL_THROTTLE_BOOST && + sched_boost_policy() == SCHED_BOOST_ON_BIG; + + restrict_cluster = sysctl_sched_restrict_cluster_spill; /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) @@ -1697,6 +1702,9 @@ static int find_lowest_rq_hmp(struct task_struct *task) */ for_each_sched_cluster(cluster) { + if (boost_on_big && cluster->capacity != max_possible_capacity) + continue; + cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); cpumask_andnot(&candidate_mask, &candidate_mask, cpu_isolated_mask); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4289bf6cd642..30838bb9b442 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1061,8 +1061,6 @@ extern unsigned int max_load_scale_factor; extern unsigned int max_possible_capacity; extern unsigned int min_max_possible_capacity; extern unsigned int max_power_cost; -extern unsigned int sched_upmigrate; -extern unsigned int sched_downmigrate; extern unsigned int sched_init_task_load_windows; extern unsigned int up_down_migrate_scale_factor; extern unsigned int sysctl_sched_restrict_cluster_spill; @@ -1106,18 +1104,23 @@ extern void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock); extern unsigned int cpu_temp(int cpu); extern unsigned int nr_eligible_big_tasks(int cpu); -extern void update_up_down_migrate(void); extern int update_preferred_cluster(struct related_thread_group *grp, struct task_struct *p, u32 old_load); extern void set_preferred_cluster(struct related_thread_group *grp); extern void add_new_task_to_grp(struct task_struct *new); +extern unsigned int update_freq_aggregate_threshold(unsigned int threshold); -enum sched_boost_type { +enum sched_boost_policy { SCHED_BOOST_NONE, SCHED_BOOST_ON_BIG, SCHED_BOOST_ON_ALL, }; +#define NO_BOOST 0 +#define FULL_THROTTLE_BOOST 1 +#define CONSERVATIVE_BOOST 2 +#define 
RESTRAINED_BOOST 3 + static inline struct sched_cluster *cpu_cluster(int cpu) { return cpu_rq(cpu)->cluster; @@ -1387,14 +1390,11 @@ extern void set_hmp_defaults(void); extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost); extern unsigned int power_cost(int cpu, u64 demand); extern void reset_all_window_stats(u64 window_start, unsigned int window_size); -extern void boost_kick(int cpu); extern int sched_boost(void); extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, - enum sched_boost_type boost_type); -extern enum sched_boost_type sched_boost_type(void); + enum sched_boost_policy boost_policy); +extern enum sched_boost_policy sched_boost_policy(void); extern int task_will_fit(struct task_struct *p, int cpu); -extern int group_will_fit(struct sched_cluster *cluster, - struct related_thread_group *grp, u64 demand); extern u64 cpu_load(int cpu); extern u64 cpu_load_sync(int cpu, int sync); extern int preferred_cluster(struct sched_cluster *cluster, @@ -1422,10 +1422,32 @@ extern u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, struct cftype *cft); extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 upmigrate_discourage); -extern void sched_hmp_parse_dt(void); -extern void init_sched_hmp_boost_policy(void); +extern void sched_boost_parse_dt(void); extern void clear_top_tasks_bitmap(unsigned long *bitmap); +#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE) +extern bool task_sched_boost(struct task_struct *p); +extern int sync_cgroup_colocation(struct task_struct *p, bool insert); +extern bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2); +extern void update_cgroup_boost_settings(void); +extern void restore_cgroup_boost_settings(void); + +#else +static inline bool +same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2) +{ + return true; +} + +static inline bool task_sched_boost(struct task_struct *p) +{ + return true; +} + +static inline void update_cgroup_boost_settings(void) { } +static inline void restore_cgroup_boost_settings(void) { } +#endif + #else /* CONFIG_SCHED_HMP */ struct hmp_sched_stats; @@ -1615,8 +1637,7 @@ static inline void post_big_task_count_change(void) { } static inline void set_hmp_defaults(void) { } static inline void clear_reserved(int cpu) { } -static inline void sched_hmp_parse_dt(void) {} -static inline void init_sched_hmp_boost_policy(void) {} +static inline void sched_boost_parse_dt(void) {} #define trace_sched_cpu_load(...) #define trace_sched_cpu_load_lb(...) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 4f8182302e5e..ee2af8e0b5ce 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -25,6 +25,33 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; +#ifdef CONFIG_SCHED_HMP + /* Toggle ability to override sched boost enabled */ + bool sched_boost_no_override; + + /* + * Controls whether a cgroup is eligible for sched boost or not. This + * can temporariliy be disabled by the kernel based on the no_override + * flag above. + */ + bool sched_boost_enabled; + + /* + * This tracks the default value of sched_boost_enabled and is used + * restore the value following any temporary changes to that flag. + */ + bool sched_boost_enabled_backup; + + /* + * Controls whether tasks of this cgroup should be colocated with each + * other and tasks of other cgroups that have the same flag turned on. 
+ */ + bool colocate; + + /* Controls whether further updates are allowed to the colocate flag */ + bool colocate_update_disabled; +#endif + }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -54,6 +81,13 @@ static inline struct schedtune *parent_st(struct schedtune *st) static struct schedtune root_schedtune = { .boost = 0, +#ifdef CONFIG_SCHED_HMP + .sched_boost_no_override = false, + .sched_boost_enabled = true, + .sched_boost_enabled_backup = true, + .colocate = false, + .colocate_update_disabled = false, +#endif }; /* @@ -97,6 +131,121 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +#ifdef CONFIG_SCHED_HMP +static inline void init_sched_boost(struct schedtune *st) +{ + st->sched_boost_no_override = false; + st->sched_boost_enabled = true; + st->sched_boost_enabled_backup = st->sched_boost_enabled; + st->colocate = false; + st->colocate_update_disabled = false; +} + +bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2) +{ + return task_schedtune(tsk1) == task_schedtune(tsk2); +} + +void update_cgroup_boost_settings(void) +{ + int i; + + for (i = 0; i < BOOSTGROUPS_COUNT; i++) { + if (!allocated_group[i]) + break; + + if (allocated_group[i]->sched_boost_no_override) + continue; + + allocated_group[i]->sched_boost_enabled = false; + } +} + +void restore_cgroup_boost_settings(void) +{ + int i; + + for (i = 0; i < BOOSTGROUPS_COUNT; i++) { + if (!allocated_group[i]) + break; + + allocated_group[i]->sched_boost_enabled = + allocated_group[i]->sched_boost_enabled_backup; + } +} + +bool task_sched_boost(struct task_struct *p) +{ + struct schedtune *st = task_schedtune(p); + + return st->sched_boost_enabled; +} + +static u64 +sched_boost_override_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->sched_boost_no_override; +} + +static int sched_boost_override_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 override) +{ + struct schedtune *st = css_st(css); + + st->sched_boost_no_override = !!override; + + return 0; +} + +static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->sched_boost_enabled; +} + +static int sched_boost_enabled_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 enable) +{ + struct schedtune *st = css_st(css); + + st->sched_boost_enabled = !!enable; + st->sched_boost_enabled_backup = st->sched_boost_enabled; + + return 0; +} + +static u64 sched_colocate_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->colocate; +} + +static int sched_colocate_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 colocate) +{ + struct schedtune *st = css_st(css); + + if (st->colocate_update_disabled) + return -EPERM; + + st->colocate = !!colocate; + st->colocate_update_disabled = true; + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void init_sched_boost(struct schedtune *st) { } + +#endif /* CONFIG_SCHED_HMP */ + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -121,12 +270,45 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, return 0; } +static void schedtune_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + struct schedtune *st; + bool colocate; + + cgroup_taskset_first(tset, &css); 
+ st = css_st(css); + + colocate = st->colocate; + + cgroup_taskset_for_each(task, css, tset) + sync_cgroup_colocation(task, colocate); +} + static struct cftype files[] = { { .name = "boost", .read_u64 = boost_read, .write_u64 = boost_write, }, +#ifdef CONFIG_SCHED_HMP + { + .name = "sched_boost_no_override", + .read_u64 = sched_boost_override_read, + .write_u64 = sched_boost_override_write, + }, + { + .name = "sched_boost_enabled", + .read_u64 = sched_boost_enabled_read, + .write_u64 = sched_boost_enabled_write, + }, + { + .name = "colocate", + .read_u64 = sched_colocate_read, + .write_u64 = sched_colocate_write, + }, +#endif { } /* terminate */ }; @@ -189,6 +371,7 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css) /* Initialize per CPUs boost group support */ st->idx = idx; + init_sched_boost(st); if (schedtune_boostgroup_init(st)) goto release; @@ -222,6 +405,7 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .legacy_cftypes = files, .early_init = 1, .allow_attach = subsys_cgroup_allow_attach, + .attach = schedtune_attach, }; #endif /* CONFIG_CGROUP_SCHEDTUNE */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 574316f1c344..b7cbd7940f7b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,6 +124,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; @@ -377,6 +378,22 @@ static struct ctl_table kern_table[] = { .extra2 = &one_hundred, }, { + .procname = "sched_group_upmigrate", + .data = &sysctl_sched_group_upmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + }, + { + .procname = "sched_group_downmigrate", + .data = &sysctl_sched_group_downmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + }, + { .procname = "sched_init_task_load", .data = &sysctl_sched_init_task_load_pct, .maxlen = sizeof(unsigned int), @@ -487,6 +504,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_boost_handler, + .extra1 = &zero, + .extra2 = &three, }, #endif /* CONFIG_SCHED_HMP */ #ifdef CONFIG_SCHED_DEBUG |
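For completeness, a small userspace sketch of the new group-migration knobs added in the kernel/sysctl.c hunk above. The entries sit in kern_table, so they are expected to appear under /proc/sys/kernel/; the values are percentages (defaults 100 and 95), and sched_hmp_proc_update_handler rejects a configuration where sched_group_downmigrate exceeds sched_group_upmigrate. The program below is illustrative and not part of the change.

```c
/*
 * Illustrative only -- not part of this commit. Exercises the
 * sched_group_upmigrate / sched_group_downmigrate sysctls introduced in
 * the kernel/sysctl.c hunk above (percent values, defaults 100 and 95).
 */
#include <stdio.h>

static int write_pct(const char *knob, unsigned int pct)
{
	char path[96];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", knob);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%u\n", pct);
	return fclose(f);
}

int main(void)
{
	/*
	 * Raise the up-migrate threshold before the down-migrate one so
	 * that down never exceeds up, which the handler would reject.
	 */
	if (write_pct("sched_group_upmigrate", 110))
		return 1;
	if (write_pct("sched_group_downmigrate", 100))
		return 1;
	return 0;
}
```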
