 Documentation/scheduler/sched-zone.txt |  41
 include/linux/sched/sysctl.h           |   3
 kernel/sched/core.c                    |   8
 kernel/sched/fair.c                    | 115
 kernel/sched/sched.h                   |  14
 kernel/sysctl.c                        |  23
 6 files changed, 129 insertions(+), 75 deletions(-)
diff --git a/Documentation/scheduler/sched-zone.txt b/Documentation/scheduler/sched-zone.txt
index 4372e5c788c7..28313f144882 100644
--- a/Documentation/scheduler/sched-zone.txt
+++ b/Documentation/scheduler/sched-zone.txt
@@ -1238,35 +1238,6 @@ power mode. It ignores the actual D-state that a cluster may be in and assumes
the worst case power cost of the highest D-state. It is a means of biasing task
placement away from idle clusters when necessary.
-
-*** 7.21 sched_lowspill_freq
-
-Default value: 0
-
-Appears at /proc/sys/kernel/sched_lowspill_freq
-
-This is the first of two tunables designed to govern the load balancer behavior
-at various frequency levels. This tunable defines the frequency of the little
-cluster below which the big cluster is not permitted to pull tasks from the
-little cluster as part of load balance. The idea is that below a certain
-frequency, a cluster has enough remaining capacity that may not necessitate
-migration of tasks. This helps in achieving consolidation of workload within
-the little cluster when needed.
-
-*** 7.22 sched_pack_freq
-
-Default value: INT_MAX
-
-Appears at /proc/sys/kernel/sched_pack_freq
-
-This is the second of two tunables designed to govern the load balancer behavior
-at various frequency levels. This tunable defines the frequency of the little
-cluster beyond which the little cluster is now allowed to pull tasks from the
-big cluster as part of load balance. The idea is that above a certain frequency
-threshold the little cluster may not want to pull additional work from another
-cluster. This helps in achieving consolidation of workload within the big
-cluster when needed.
-
*** 7.23 sched_early_detection_duration
Default value: 9500000
@@ -1278,6 +1249,18 @@ tick for it to be eligible for the scheduler's early detection feature
under scheduler boost. For more information on the feature itself please
refer to section 5.2.1.
+*** 7.24 sched_restrict_cluster_spill
+
+Default value: 0
+
+Appears at /proc/sys/kernel/sched_restrict_cluster_spill
+
+This tunable can be used to restrict the higher capacity cluster from pulling
+tasks from the lower capacity cluster in the load balance path. The restriction
+is lifted once all of the CPUs in the lower capacity cluster are above the
+spill threshold. When the clusters have the same capacity, power cost is used
+to break the tie for applying this restriction.
+
=========================
8. HMP SCHEDULER TRACE POINTS
=========================
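
As a rough illustration of the policy described in section 7.24 above, the
standalone sketch below models the decision in plain C. Everything here (the
struct, the helper names, the capacity and load numbers) is invented for the
example; the real logic lives in bail_inter_cluster_balance() in
kernel/sched/fair.c, shown later in this patch.

```c
/*
 * Simplified model of the sched_restrict_cluster_spill policy. All types
 * and helpers are illustrative stand-ins, not the kernel's own.
 */
#include <stdbool.h>
#include <stdio.h>

struct cluster {
	int capacity;		/* max possible capacity */
	int power_cost;		/* max power cost, used as tie-breaker */
	int nr_cpus;
	int cpu_load[8];	/* per-CPU load, illustrative */
	int spill_load;		/* per-CPU spill threshold */
};

/* The restriction is lifted only once every CPU in the cluster is above spill. */
static bool cluster_above_spill(const struct cluster *c)
{
	for (int i = 0; i < c->nr_cpus; i++)
		if (c->cpu_load[i] < c->spill_load)
			return false;
	return true;
}

/* May the local (destination) cluster pull from the busiest cluster? */
static bool allow_pull(const struct cluster *local, const struct cluster *busiest)
{
	bool local_is_higher = local->capacity > busiest->capacity ||
		(local->capacity == busiest->capacity &&
		 local->power_cost > busiest->power_cost);

	if (!local_is_higher)
		return true;	/* the restriction only targets the bigger cluster */

	return cluster_above_spill(busiest);
}

int main(void)
{
	struct cluster little = { .capacity = 512, .power_cost = 100, .nr_cpus = 4,
				  .cpu_load = { 30, 20, 10, 5 }, .spill_load = 90 };
	struct cluster big = { .capacity = 1024, .power_cost = 300, .nr_cpus = 4 };

	printf("big may pull from little: %s\n",
	       allow_pull(&big, &little) ? "yes" : "no");
	return 0;
}
```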
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 395b7c25250b..676502dec830 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -66,11 +66,10 @@ extern unsigned int sysctl_sched_upmigrate_pct;
extern unsigned int sysctl_sched_downmigrate_pct;
extern int sysctl_sched_upmigrate_min_nice;
extern unsigned int sysctl_early_detection_duration;
-extern unsigned int sysctl_sched_lowspill_freq;
-extern unsigned int sysctl_sched_pack_freq;
extern unsigned int sysctl_sched_boost;
extern unsigned int sysctl_sched_select_prev_cpu_us;
extern unsigned int sysctl_sched_enable_colocation;
+extern unsigned int sysctl_sched_restrict_cluster_spill;
#if defined(CONFIG_SCHED_FREQ_INPUT)
extern unsigned int sysctl_sched_new_task_windows;
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da693099cc40..3eb27a016003 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1314,7 +1314,7 @@ static struct sched_cluster init_cluster = {
void update_all_clusters_stats(void)
{
struct sched_cluster *cluster;
- u64 highest_mpc = 0;
+ u64 highest_mpc = 0, lowest_mpc = U64_MAX;
pre_big_task_count_change(cpu_possible_mask);
@@ -1328,9 +1328,13 @@ void update_all_clusters_stats(void)
if (mpc > highest_mpc)
highest_mpc = mpc;
+
+ if (mpc < lowest_mpc)
+ lowest_mpc = mpc;
}
max_possible_capacity = highest_mpc;
+ min_max_possible_capacity = lowest_mpc;
__update_min_max_capacity();
sched_update_freq_max_load(cpu_possible_mask);
@@ -1696,6 +1700,8 @@ unsigned int min_max_freq = 1;
unsigned int max_capacity = 1024; /* max(rq->capacity) */
unsigned int min_capacity = 1024; /* min(rq->capacity) */
unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
+unsigned int
+min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = 10000000;
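
Tracking the lowest cluster capacity alongside the highest is what lets the
scheduler later tell symmetric and asymmetric systems apart via hmp_capable()
(added to kernel/sched/sched.h below). A minimal userspace sketch of that
bookkeeping, with invented capacity values in place of real cluster data:

```c
/*
 * Sketch of the min/max cluster-capacity bookkeeping added above. In the
 * kernel the loop runs over sched_cluster structures inside
 * update_all_clusters_stats(); the values here are illustrative.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t max_possible_capacity;
static uint64_t min_max_possible_capacity = UINT64_MAX;

/* A system is "HMP capable" only if cluster capacities actually differ. */
static bool hmp_capable(void)
{
	return max_possible_capacity != min_max_possible_capacity;
}

int main(void)
{
	uint64_t cluster_capacity[] = { 1024, 512 };	/* big, little (made up) */

	for (size_t i = 0; i < sizeof(cluster_capacity) / sizeof(cluster_capacity[0]); i++) {
		uint64_t mpc = cluster_capacity[i];

		if (mpc > max_possible_capacity)
			max_possible_capacity = mpc;
		if (mpc < min_max_possible_capacity)
			min_max_possible_capacity = mpc;
	}

	printf("hmp_capable: %s\n", hmp_capable() ? "yes" : "no");
	return 0;
}
```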
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1b64a6ae333c..1cb4d18b1039 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2702,9 +2702,6 @@ unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
*/
unsigned int __read_mostly sysctl_sched_enable_power_aware = 0;
-unsigned int __read_mostly sysctl_sched_lowspill_freq;
-unsigned int __read_mostly sysctl_sched_pack_freq = UINT_MAX;
-
/*
* CPUs with load greater than the sched_spill_load_threshold are not
* eligible for task placement. When all CPUs in a cluster achieve a
@@ -2774,6 +2771,8 @@ static unsigned int __read_mostly
sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
+unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
+
void update_up_down_migrate(void)
{
unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
@@ -8053,9 +8052,10 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
{
int local_cpu, busiest_cpu;
int local_capacity, busiest_capacity;
- unsigned int local_freq, busiest_freq, busiest_max_freq;
+ int local_pwr_cost, busiest_pwr_cost;
+ int nr_cpus;
- if (sched_boost())
+ if (!sysctl_sched_restrict_cluster_spill || sched_boost())
return 0;
local_cpu = group_first_cpu(sds->local);
@@ -8063,21 +8063,24 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
local_capacity = cpu_max_possible_capacity(local_cpu);
busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
- local_freq = cpu_cur_freq(local_cpu);
- busiest_freq = cpu_cur_freq(busiest_cpu);
- busiest_max_freq = cpu_max_freq(busiest_cpu);
- if (local_capacity < busiest_capacity) {
- if (local_freq >= sysctl_sched_pack_freq &&
- busiest_freq < busiest_max_freq)
- return 1;
- } else if (local_capacity > busiest_capacity) {
- if (sds->busiest_stat.sum_nr_big_tasks)
- return 0;
+ local_pwr_cost = cpu_max_power_cost(local_cpu);
+ busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
- if (busiest_freq <= sysctl_sched_lowspill_freq)
- return 1;
- }
+ if (local_capacity < busiest_capacity ||
+ (local_capacity == busiest_capacity &&
+ local_pwr_cost <= busiest_pwr_cost))
+ return 0;
+
+ if (local_capacity > busiest_capacity &&
+ sds->busiest_stat.sum_nr_big_tasks)
+ return 0;
+
+ nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+ if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
+ (sds->busiest_stat.sum_nr_running <
+ nr_cpus * sysctl_sched_spill_nr_run))
+ return 1;
return 0;
}
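
The final check in the rewritten bail_inter_cluster_balance() treats the
busiest group as "below spill" while both its aggregate load and its task
count stay under the per-CPU thresholds scaled by the group size. A small
standalone illustration of that arithmetic, with made-up threshold values in
place of sched_spill_load and sysctl_sched_spill_nr_run:

```c
/* Illustration of the group-level spill check; values are invented. */
#include <stdbool.h>
#include <stdio.h>

static bool group_below_spill(unsigned int group_cpu_load,
			      unsigned int sum_nr_running,
			      unsigned int nr_cpus,
			      unsigned int spill_load,
			      unsigned int spill_nr_run)
{
	return group_cpu_load < nr_cpus * spill_load &&
	       sum_nr_running < nr_cpus * spill_nr_run;
}

int main(void)
{
	/* 4-CPU little cluster, per-CPU spill load 90, spill nr_run 10 */
	unsigned int nr_cpus = 4, spill_load = 90, spill_nr_run = 10;

	/* group thresholds are 4 x 90 = 360 and 4 x 10 = 40 */
	printf("load 200, 12 tasks -> below spill: %d\n",
	       group_below_spill(200, 12, nr_cpus, spill_load, spill_nr_run));
	printf("load 380, 12 tasks -> below spill: %d\n",
	       group_below_spill(380, 12, nr_cpus, spill_load, spill_nr_run));
	return 0;
}
```

While the group is below spill the function returns 1 and the higher capacity
cluster skips the pull; once either threshold is crossed, the restriction is
lifted.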
@@ -9690,7 +9693,7 @@ static struct {
} nohz ____cacheline_aligned;
#ifdef CONFIG_SCHED_HMP
-static inline int find_new_hmp_ilb(void)
+static inline int find_new_hmp_ilb(int type)
{
int call_cpu = raw_smp_processor_id();
struct sched_domain *sd;
@@ -9702,7 +9705,12 @@ static inline int find_new_hmp_ilb(void)
for_each_domain(call_cpu, sd) {
for_each_cpu_and(ilb, nohz.idle_cpus_mask,
sched_domain_span(sd)) {
- if (idle_cpu(ilb)) {
+ if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
+ (hmp_capable() &&
+ cpu_max_possible_capacity(ilb) <=
+ cpu_max_possible_capacity(call_cpu)) ||
+ cpu_max_power_cost(ilb) <=
+ cpu_max_power_cost(call_cpu))) {
rcu_read_unlock();
reset_balance_interval(ilb);
return ilb;
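
For a restricted kick, find_new_hmp_ilb() only accepts an idle CPU that is no
"bigger" than the caller: same-or-lower maximum possible capacity on an
asymmetric system, or same-or-lower maximum power cost otherwise. A simplified
model of that eligibility test (the struct and values are illustrative, not
kernel types):

```c
/*
 * Simplified model of the ILB eligibility test added above. In the kernel
 * the checks use cpu_max_possible_capacity(), cpu_max_power_cost() and
 * hmp_capable(); everything here is a stand-in.
 */
#include <stdbool.h>
#include <stdio.h>

#define NOHZ_KICK_ANY		0
#define NOHZ_KICK_RESTRICT	1

struct cpu_info {
	bool idle;
	int max_possible_capacity;
	int max_power_cost;
};

static bool system_is_hmp = true;	/* cluster capacities differ (illustrative) */

static bool ilb_eligible(const struct cpu_info *ilb,
			 const struct cpu_info *caller, int type)
{
	if (!ilb->idle)
		return false;
	if (type != NOHZ_KICK_RESTRICT)
		return true;
	if (system_is_hmp &&
	    ilb->max_possible_capacity <= caller->max_possible_capacity)
		return true;
	return ilb->max_power_cost <= caller->max_power_cost;
}

int main(void)
{
	struct cpu_info little_caller = { true, 512, 100 };
	struct cpu_info big_idle = { true, 1024, 300 };

	printf("restricted kick may target the big CPU: %s\n",
	       ilb_eligible(&big_idle, &little_caller, NOHZ_KICK_RESTRICT) ?
	       "yes" : "no");
	return 0;
}
```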
@@ -9720,12 +9728,12 @@ static inline int find_new_hmp_ilb(void)
}
#endif /* CONFIG_SCHED_HMP */
-static inline int find_new_ilb(void)
+static inline int find_new_ilb(int type)
{
int ilb;
if (sched_enable_hmp)
- return find_new_hmp_ilb();
+ return find_new_hmp_ilb(type);
ilb = cpumask_first(nohz.idle_cpus_mask);
@@ -9740,13 +9748,13 @@ static inline int find_new_ilb(void)
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(type);
if (ilb_cpu >= nr_cpu_ids)
return;
@@ -10031,7 +10039,51 @@ end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
-static inline int _nohz_kick_needed(struct rq *rq, int cpu)
+#ifdef CONFIG_SCHED_HMP
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ struct sched_domain *sd;
+ int i;
+
+ if (rq->nr_running < 2)
+ return 0;
+
+ if (!sysctl_sched_restrict_cluster_spill)
+ return 1;
+
+ if (hmp_capable() && cpu_max_possible_capacity(cpu) ==
+ max_possible_capacity)
+ return 1;
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (cpu_load(i) < sched_spill_load &&
+ cpu_rq(i)->nr_running <
+ sysctl_sched_spill_nr_run) {
+ /* Change the kick type to limit to CPUs that
+ * are of equal or lower capacity.
+ */
+ *type = NOHZ_KICK_RESTRICT;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return 1;
+}
+#else
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ return 0;
+}
+#endif
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
{
unsigned long now = jiffies;
@@ -10042,6 +10094,9 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu)
if (likely(!atomic_read(&nohz.nr_cpus)))
return 0;
+ if (sched_enable_hmp)
+ return _nohz_kick_needed_hmp(rq, cpu, type);
+
if (time_before(now, nohz.next_balance))
return 0;
@@ -10059,7 +10114,7 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu)
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
{
#ifndef CONFIG_SCHED_HMP
struct sched_domain *sd;
@@ -10079,7 +10134,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
set_cpu_sd_state_busy();
nohz_balance_exit_idle(cpu);
- if (_nohz_kick_needed(rq, cpu))
+ if (_nohz_kick_needed(rq, cpu, type))
return true;
#ifndef CONFIG_SCHED_HMP
@@ -10148,6 +10203,8 @@ static void run_rebalance_domains(struct softirq_action *h)
*/
void trigger_load_balance(struct rq *rq)
{
+ int type = NOHZ_KICK_ANY;
+
/* Don't need to rebalance while attached to NULL domain */
if (unlikely(on_null_domain(rq)))
return;
@@ -10155,8 +10212,8 @@ void trigger_load_balance(struct rq *rq)
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
+ if (nohz_kick_needed(rq, &type))
+ nohz_balancer_kick(type);
#endif
}
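
The remaining fair.c hunks only thread the kick type from trigger_load_balance()
down to find_new_ilb(). A condensed, userspace-only view of that call flow,
with every kernel function replaced by a stub of the same name:

```c
/*
 * Condensed view of how the kick type flows through the nohz path after
 * this change. All functions are stand-ins; bodies are reduced to the
 * parts relevant to the type parameter.
 */
#include <stdio.h>

#define NOHZ_KICK_ANY		0
#define NOHZ_KICK_RESTRICT	1

/* _nohz_kick_needed_hmp() decides both whether and how to kick. */
static int nohz_kick_needed(int *type)
{
	*type = NOHZ_KICK_RESTRICT;	/* e.g. some CPU in the domain is below spill */
	return 1;
}

static int find_new_ilb(int type)
{
	/* type limits which idle CPUs qualify, as in find_new_hmp_ilb() */
	return type == NOHZ_KICK_RESTRICT ? 1 : 0;	/* illustrative CPU ids */
}

static void nohz_balancer_kick(int type)
{
	printf("kicking ilb cpu %d (type %d)\n", find_new_ilb(type), type);
}

static void trigger_load_balance(void)
{
	int type = NOHZ_KICK_ANY;

	if (nohz_kick_needed(&type))
		nohz_balancer_kick(type);
}

int main(void)
{
	trigger_load_balance();
	return 0;
}
```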
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6cd1dc3b6267..2390f927f8c2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1036,6 +1036,7 @@ extern unsigned int max_capacity;
extern unsigned int min_capacity;
extern unsigned int max_load_scale_factor;
extern unsigned int max_possible_capacity;
+extern unsigned int min_max_possible_capacity;
extern unsigned int sched_upmigrate;
extern unsigned int sched_downmigrate;
extern unsigned int sched_init_task_load_pelt;
@@ -1101,6 +1102,16 @@ static inline int same_cluster(int src_cpu, int dst_cpu)
return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
}
+static inline int cpu_max_power_cost(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_power_cost;
+}
+
+static inline bool hmp_capable(void)
+{
+ return max_possible_capacity != min_max_possible_capacity;
+}
+
/*
* 'load' is in reference to "best cpu" at its best frequency.
* Scale that in reference to a given cpu, accounting for how bad it is
@@ -2212,6 +2223,9 @@ enum rq_nohz_flag_bits {
NOHZ_BALANCE_KICK,
};
+#define NOHZ_KICK_ANY 0
+#define NOHZ_KICK_RESTRICT 1
+
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 878b64bfcc7a..6d637a744db6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -410,20 +410,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
},
- {
- .procname = "sched_lowspill_freq",
- .data = &sysctl_sched_lowspill_freq,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sched_pack_freq",
- .data = &sysctl_sched_pack_freq,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#if defined(CONFIG_SCHED_FREQ_INPUT) && defined(CONFIG_SCHED_HMP)
{
.procname = "sched_new_task_windows",
@@ -457,6 +443,15 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
{
+ .procname = "sched_restrict_cluster_spill",
+ .data = &sysctl_sched_restrict_cluster_spill,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "sched_boost",
.data = &sysctl_sched_boost,
.maxlen = sizeof(unsigned int),