-rw-r--r--   Documentation/scheduler/sched-zone.txt  |  41
-rw-r--r--   include/linux/sched/sysctl.h            |   3
-rw-r--r--   kernel/sched/core.c                     |   8
-rw-r--r--   kernel/sched/fair.c                     | 115
-rw-r--r--   kernel/sched/sched.h                    |  14
-rw-r--r--   kernel/sysctl.c                         |  23
6 files changed, 129 insertions, 75 deletions
diff --git a/Documentation/scheduler/sched-zone.txt b/Documentation/scheduler/sched-zone.txt
index 4372e5c788c7..28313f144882 100644
--- a/Documentation/scheduler/sched-zone.txt
+++ b/Documentation/scheduler/sched-zone.txt
@@ -1238,35 +1238,6 @@
 power mode. It ignores the actual D-state that a cluster may be in and assumes
 the worst case power cost of the highest D-state. It is means of biasing task
 placement away from idle clusters when necessary.
-
-*** 7.21 sched_lowspill_freq
-
-Default value: 0
-
-Appears at /proc/sys/kernel/sched_lowspill_freq
-
-This is the first of two tunables designed to govern the load balancer behavior
-at various frequency levels. This tunable defines the frequency of the little
-cluster below which the big cluster is not permitted to pull tasks from the
-little cluster as part of load balance. The idea is that below a certain
-frequency, a cluster has enough remaining capacity that may not necessitate
-migration of tasks. This helps in achieving consolidation of workload within
-the little cluster when needed.
-
-*** 7.22 sched_pack_freq
-
-Default value: INT_MAX
-
-Appears at /proc/sys/kernel/sched_pack_freq
-
-This is the second of two tunables designed to govern the load balancer behavior
-at various frequency levels. This tunable defines the frequency of the little
-cluster beyond which the little cluster is now allowed to pull tasks from the
-big cluster as part of load balance. The idea is that above a certain frequency
-threshold the little cluster may not want to pull additional work from another
-cluster. This helps in achieving consolidation of workload within the big
-cluster when needed.
-
 ***7.23 sched_early_detection_duration
 
 Default value: 9500000
@@ -1278,6 +1249,18 @@
 tick for it to be eligible for the scheduler's early detection feature under
 scheduler boost. For more information on the feature itself please refer to
 section 5.2.1.
+*** 7.24 sched_restrict_cluster_spill
+
+Default value: 0
+
+Appears at /proc/sys/kernel/sched_restrict_cluster_spill
+
+This tunable can be used to restrict the higher capacity cluster pulling tasks
+from the lower capacity cluster in the load balance path. The restriction is
+lifted if all of the CPUS in the lower capacity cluster are above spill.
+The power cost is used to break the ties if the capacity of clusters are same
+for applying this restriction.
+
 =========================
 8. HMP SCHEDULER TRACE POINTS
 =========================
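Section 7.24 above documents a simple 0/1 switch exposed through procfs. As a minimal userspace illustration (not part of the patch; it only does something useful on a kernel that carries this change, and writing requires root), the knob can be read and enabled like any other sysctl file:

#include <stdio.h>
#include <stdlib.h>

#define KNOB "/proc/sys/kernel/sched_restrict_cluster_spill"

/* Read the current value of the tunable, or -1 if it is not present. */
static int read_knob(void)
{
        FILE *f = fopen(KNOB, "r");
        int val = -1;

        if (!f)
                return -1;              /* kernel without this patch */
        if (fscanf(f, "%d", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        FILE *f;

        printf("current value: %d\n", read_knob());

        f = fopen(KNOB, "w");           /* needs root */
        if (!f) {
                perror("fopen");
                return EXIT_FAILURE;
        }
        fputs("1\n", f);                /* 1 = restrict big-cluster pulls */
        fclose(f);

        printf("new value: %d\n", read_knob());
        return EXIT_SUCCESS;
}

Writing "1" turns the restriction on; "0" (the default) keeps the old behavior in which the higher capacity cluster may always pull.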
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 395b7c25250b..676502dec830 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -66,11 +66,10 @@ extern unsigned int sysctl_sched_upmigrate_pct;
 extern unsigned int sysctl_sched_downmigrate_pct;
 extern int sysctl_sched_upmigrate_min_nice;
 extern unsigned int sysctl_early_detection_duration;
-extern unsigned int sysctl_sched_lowspill_freq;
-extern unsigned int sysctl_sched_pack_freq;
 extern unsigned int sysctl_sched_boost;
 extern unsigned int sysctl_sched_select_prev_cpu_us;
 extern unsigned int sysctl_sched_enable_colocation;
+extern unsigned int sysctl_sched_restrict_cluster_spill;
 #if defined(CONFIG_SCHED_FREQ_INPUT)
 extern unsigned int sysctl_sched_new_task_windows;
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da693099cc40..3eb27a016003 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1314,7 +1314,7 @@ static struct sched_cluster init_cluster = {
 void update_all_clusters_stats(void)
 {
         struct sched_cluster *cluster;
-        u64 highest_mpc = 0;
+        u64 highest_mpc = 0, lowest_mpc = U64_MAX;
 
         pre_big_task_count_change(cpu_possible_mask);
 
@@ -1328,9 +1328,13 @@ void update_all_clusters_stats(void)
 
                 if (mpc > highest_mpc)
                         highest_mpc = mpc;
+
+                if (mpc < lowest_mpc)
+                        lowest_mpc = mpc;
         }
 
         max_possible_capacity = highest_mpc;
+        min_max_possible_capacity = lowest_mpc;
 
         __update_min_max_capacity();
         sched_update_freq_max_load(cpu_possible_mask);
@@ -1696,6 +1700,8 @@ unsigned int min_max_freq = 1;
 unsigned int max_capacity = 1024; /* max(rq->capacity) */
 unsigned int min_capacity = 1024; /* min(rq->capacity) */
 unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
+unsigned int
+min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
 
 /* Window size (in ns) */
 __read_mostly unsigned int sched_ravg_window = 10000000;
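The core.c hunks above start tracking the smallest per-cluster max possible capacity alongside the largest one; a later hunk in this patch (kernel/sched/sched.h) compares the two in hmp_capable() to decide whether the system is asymmetric. Below is a small standalone sketch of that bookkeeping, with a plain struct standing in for struct sched_cluster and made-up capacity numbers:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the per-cluster state the kernel keeps; values are invented. */
struct cluster {
        const char *name;
        uint64_t max_possible_capacity;
};

int main(void)
{
        struct cluster clusters[] = {
                { "little",  514 },
                { "big",    1024 },
        };
        uint64_t highest_mpc = 0, lowest_mpc = UINT64_MAX;
        size_t i;

        for (i = 0; i < sizeof(clusters) / sizeof(clusters[0]); i++) {
                uint64_t mpc = clusters[i].max_possible_capacity;

                if (mpc > highest_mpc)
                        highest_mpc = mpc;
                if (mpc < lowest_mpc)
                        lowest_mpc = mpc;
        }

        /* hmp_capable(): the system is asymmetric iff the two bounds differ. */
        printf("max_possible_capacity=%llu min_max_possible_capacity=%llu hmp_capable=%d\n",
               (unsigned long long)highest_mpc,
               (unsigned long long)lowest_mpc,
               highest_mpc != lowest_mpc);
        return 0;
}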
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1b64a6ae333c..1cb4d18b1039 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2702,9 +2702,6 @@ unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
  */
 unsigned int __read_mostly sysctl_sched_enable_power_aware = 0;
 
-unsigned int __read_mostly sysctl_sched_lowspill_freq;
-unsigned int __read_mostly sysctl_sched_pack_freq = UINT_MAX;
-
 /*
  * CPUs with load greater than the sched_spill_load_threshold are not
  * eligible for task placement. When all CPUs in a cluster achieve a
@@ -2774,6 +2771,8 @@ static unsigned int __read_mostly sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
 
 unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
 
+unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
+
 void update_up_down_migrate(void)
 {
         unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
@@ -8053,9 +8052,10 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
 {
         int local_cpu, busiest_cpu;
         int local_capacity, busiest_capacity;
-        unsigned int local_freq, busiest_freq, busiest_max_freq;
+        int local_pwr_cost, busiest_pwr_cost;
+        int nr_cpus;
 
-        if (sched_boost())
+        if (!sysctl_sched_restrict_cluster_spill || sched_boost())
                 return 0;
 
         local_cpu = group_first_cpu(sds->local);
@@ -8063,21 +8063,24 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
 
         local_capacity = cpu_max_possible_capacity(local_cpu);
         busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
-        local_freq = cpu_cur_freq(local_cpu);
-        busiest_freq = cpu_cur_freq(busiest_cpu);
-        busiest_max_freq = cpu_max_freq(busiest_cpu);
-
-        if (local_capacity < busiest_capacity) {
-                if (local_freq >= sysctl_sched_pack_freq &&
-                    busiest_freq < busiest_max_freq)
-                        return 1;
-        } else if (local_capacity > busiest_capacity) {
-                if (sds->busiest_stat.sum_nr_big_tasks)
-                        return 0;
+        local_pwr_cost = cpu_max_power_cost(local_cpu);
+        busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
 
-                if (busiest_freq <= sysctl_sched_lowspill_freq)
-                        return 1;
-        }
+        if (local_capacity < busiest_capacity ||
+            (local_capacity == busiest_capacity &&
+             local_pwr_cost <= busiest_pwr_cost))
+                return 0;
+
+        if (local_capacity > busiest_capacity &&
+            sds->busiest_stat.sum_nr_big_tasks)
+                return 0;
+
+        nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+        if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
+            (sds->busiest_stat.sum_nr_running <
+             nr_cpus * sysctl_sched_spill_nr_run))
+                return 1;
 
         return 0;
 }
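Restated outside the kernel, the rewritten bail_inter_cluster_balance() above no longer looks at frequencies at all: it refuses an inter-cluster pull only while the busiest group belongs to a lower capacity (or equal capacity but cheaper) cluster that is still below its spill limits. The sketch below models that decision order; the struct, field, and parameter names are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

/* Simplified inputs; in the kernel these come from lb_env/sd_lb_stats. */
struct group_stat {
        int capacity;                   /* max possible capacity of the cluster */
        int power_cost;                 /* max power cost of the cluster */
        int nr_cpus;
        int sum_nr_big_tasks;
        unsigned long group_cpu_load;
        unsigned int sum_nr_running;
};

/* Returns true when the load balancer should bail out (i.e. not pull). */
static bool should_bail(bool restrict_spill, bool boost,
                        const struct group_stat *local,
                        const struct group_stat *busiest,
                        unsigned long spill_load, unsigned int spill_nr_run)
{
        if (!restrict_spill || boost)
                return false;           /* restriction disabled */

        /* Never restrict pulls toward the lower-capacity (or cheaper) side. */
        if (local->capacity < busiest->capacity ||
            (local->capacity == busiest->capacity &&
             local->power_cost <= busiest->power_cost))
                return false;

        /* Big tasks on the busiest group always justify an up-migration. */
        if (local->capacity > busiest->capacity && busiest->sum_nr_big_tasks)
                return false;

        /* Bail only while the busiest cluster is still below its spill limits. */
        return busiest->group_cpu_load < busiest->nr_cpus * spill_load &&
               busiest->sum_nr_running < busiest->nr_cpus * spill_nr_run;
}

int main(void)
{
        struct group_stat big    = { 1024, 400, 4, 0,   0, 0 };
        struct group_stat little = {  514, 100, 4, 0, 300, 3 };

        printf("bail=%d\n", should_bail(true, false, &big, &little, 100, 10));
        return 0;
}

With the little cluster below both spill thresholds the big cluster backs off (bail=1); once either threshold is crossed, or a big task shows up on the little cluster, the pull is allowed again.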
@@ -9690,7 +9693,7 @@ static struct {
 } nohz ____cacheline_aligned;
 
 #ifdef CONFIG_SCHED_HMP
-static inline int find_new_hmp_ilb(void)
+static inline int find_new_hmp_ilb(int type)
 {
         int call_cpu = raw_smp_processor_id();
         struct sched_domain *sd;
@@ -9702,7 +9705,12 @@ static inline int find_new_hmp_ilb(void)
         for_each_domain(call_cpu, sd) {
                 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
                                                 sched_domain_span(sd)) {
-                        if (idle_cpu(ilb)) {
+                        if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
+                                        (hmp_capable() &&
+                                         cpu_max_possible_capacity(ilb) <=
+                                        cpu_max_possible_capacity(call_cpu)) ||
+                                        cpu_max_power_cost(ilb) <=
+                                        cpu_max_power_cost(call_cpu))) {
                                 rcu_read_unlock();
                                 reset_balance_interval(ilb);
                                 return ilb;
@@ -9720,12 +9728,12 @@ static inline int find_new_hmp_ilb(void)
 }
 #endif /* CONFIG_SCHED_HMP */
 
-static inline int find_new_ilb(void)
+static inline int find_new_ilb(int type)
 {
         int ilb;
 
         if (sched_enable_hmp)
-                return find_new_hmp_ilb();
+                return find_new_hmp_ilb(type);
 
         ilb = cpumask_first(nohz.idle_cpus_mask);
 
@@ -9740,13 +9748,13 @@ static inline int find_new_ilb(void)
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
 {
         int ilb_cpu;
 
         nohz.next_balance++;
 
-        ilb_cpu = find_new_ilb();
+        ilb_cpu = find_new_ilb(type);
 
         if (ilb_cpu >= nr_cpu_ids)
                 return;
@@ -10031,7 +10039,51 @@ end:
         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
 }
 
-static inline int _nohz_kick_needed(struct rq *rq, int cpu)
+#ifdef CONFIG_SCHED_HMP
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+        struct sched_domain *sd;
+        int i;
+
+        if (rq->nr_running < 2)
+                return 0;
+
+        if (!sysctl_sched_restrict_cluster_spill)
+                return 1;
+
+        if (hmp_capable() && cpu_max_possible_capacity(cpu) ==
+                        max_possible_capacity)
+                return 1;
+
+        rcu_read_lock();
+        sd = rcu_dereference_check_sched_domain(rq->sd);
+        if (!sd) {
+                rcu_read_unlock();
+                return 0;
+        }
+
+        for_each_cpu(i, sched_domain_span(sd)) {
+                if (cpu_load(i) < sched_spill_load &&
+                    cpu_rq(i)->nr_running <
+                                sysctl_sched_spill_nr_run) {
+                        /* Change the kick type to limit to CPUs that
+                         * are of equal or lower capacity.
+                         */
+                        *type = NOHZ_KICK_RESTRICT;
+                        break;
+                }
+        }
+        rcu_read_unlock();
+        return 1;
+}
+#else
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+        return 0;
+}
+#endif
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
 {
         unsigned long now = jiffies;
 
@@ -10042,6 +10094,9 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu)
         if (likely(!atomic_read(&nohz.nr_cpus)))
                 return 0;
 
+        if (sched_enable_hmp)
+                return _nohz_kick_needed_hmp(rq, cpu, type);
+
         if (time_before(now, nohz.next_balance))
                 return 0;
 
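The nohz changes above thread a kick type from _nohz_kick_needed() down to the idle-load-balancer selection: when some CPU in the kicking CPU's domain is still below the spill thresholds, the kick is downgraded to NOHZ_KICK_RESTRICT so that only an idle CPU of equal or lower capacity (approximated below by power cost alone) is woken to do the balancing. This is a self-contained model of that flow, with invented cpu_info values standing in for runqueue and cluster state:

#include <stdbool.h>
#include <stdio.h>

#define NOHZ_KICK_ANY           0
#define NOHZ_KICK_RESTRICT      1

struct cpu_info {
        bool idle;
        int max_power_cost;             /* proxy for cluster capacity */
        unsigned long load;
        unsigned int nr_running;
};

/* Decide whether to kick at all and, if so, which kind of kick to use
 * (mirrors _nohz_kick_needed_hmp() in spirit, not in detail). */
static bool kick_needed(const struct cpu_info *cpus, int n, int this_cpu,
                        unsigned long spill_load, unsigned int spill_nr_run,
                        int *type)
{
        int i;

        *type = NOHZ_KICK_ANY;
        if (cpus[this_cpu].nr_running < 2)
                return false;

        for (i = 0; i < n; i++) {
                if (cpus[i].load < spill_load &&
                    cpus[i].nr_running < spill_nr_run) {
                        /* Cluster not spilling yet: only wake a CPU that is
                         * no bigger/costlier than the one asking for help. */
                        *type = NOHZ_KICK_RESTRICT;
                        break;
                }
        }
        return true;
}

/* Pick the first idle CPU the kick type allows (stand-in for the ILB search). */
static int find_ilb(const struct cpu_info *cpus, int n, int this_cpu, int type)
{
        int i;

        for (i = 0; i < n; i++) {
                if (i == this_cpu || !cpus[i].idle)
                        continue;
                if (type == NOHZ_KICK_RESTRICT &&
                    cpus[i].max_power_cost > cpus[this_cpu].max_power_cost)
                        continue;
                return i;
        }
        return -1;
}

int main(void)
{
        struct cpu_info cpus[] = {
                { false, 100, 80, 3 },  /* CPU 0: little, busy (the kicker) */
                { true,  100,  0, 0 },  /* CPU 1: little, idle */
                { true,  400,  0, 0 },  /* CPU 2: big, idle */
        };
        int type;

        if (kick_needed(cpus, 3, 0, 100, 10, &type))
                printf("kick type=%d, chosen ilb=%d\n", type,
                       find_ilb(cpus, 3, 0, type));
        return 0;
}

Here the kick is restricted, so CPU 1 (the idle little CPU) is chosen instead of the big CPU 2; with NOHZ_KICK_ANY the first idle CPU would do.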
@@ -10059,7 +10114,7 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu)
  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *     domain span are idle.
  */
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
 {
 #ifndef CONFIG_SCHED_HMP
         struct sched_domain *sd;
@@ -10079,7 +10134,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
         set_cpu_sd_state_busy();
         nohz_balance_exit_idle(cpu);
 
-        if (_nohz_kick_needed(rq, cpu))
+        if (_nohz_kick_needed(rq, cpu, type))
                 return true;
 
 #ifndef CONFIG_SCHED_HMP
@@ -10148,6 +10203,8 @@ static void run_rebalance_domains(struct softirq_action *h)
  */
 void trigger_load_balance(struct rq *rq)
 {
+        int type = NOHZ_KICK_ANY;
+
         /* Don't need to rebalance while attached to NULL domain */
         if (unlikely(on_null_domain(rq)))
                 return;
@@ -10155,8 +10212,8 @@ void trigger_load_balance(struct rq *rq)
 
         if (time_after_eq(jiffies, rq->next_balance))
                 raise_softirq(SCHED_SOFTIRQ);
 #ifdef CONFIG_NO_HZ_COMMON
-        if (nohz_kick_needed(rq))
-                nohz_balancer_kick();
+        if (nohz_kick_needed(rq, &type))
+                nohz_balancer_kick(type);
 #endif
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6cd1dc3b6267..2390f927f8c2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1036,6 +1036,7 @@ extern unsigned int max_capacity;
 extern unsigned int min_capacity;
 extern unsigned int max_load_scale_factor;
 extern unsigned int max_possible_capacity;
+extern unsigned int min_max_possible_capacity;
 extern unsigned int sched_upmigrate;
 extern unsigned int sched_downmigrate;
 extern unsigned int sched_init_task_load_pelt;
@@ -1101,6 +1102,16 @@ static inline int same_cluster(int src_cpu, int dst_cpu)
         return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
 }
 
+static inline int cpu_max_power_cost(int cpu)
+{
+        return cpu_rq(cpu)->cluster->max_power_cost;
+}
+
+static inline bool hmp_capable(void)
+{
+        return max_possible_capacity != min_max_possible_capacity;
+}
+
 /*
  * 'load' is in reference to "best cpu" at its best frequency.
  * Scale that in reference to a given cpu, accounting for how bad it is
@@ -2212,6 +2223,9 @@ enum rq_nohz_flag_bits {
         NOHZ_BALANCE_KICK,
 };
 
+#define NOHZ_KICK_ANY 0
+#define NOHZ_KICK_RESTRICT 1
+
 #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
 
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 878b64bfcc7a..6d637a744db6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -410,20 +410,6 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = sched_hmp_proc_update_handler,
         },
-        {
-                .procname       = "sched_lowspill_freq",
-                .data           = &sysctl_sched_lowspill_freq,
-                .maxlen         = sizeof(unsigned int),
-                .mode           = 0644,
-                .proc_handler   = proc_dointvec,
-        },
-        {
-                .procname       = "sched_pack_freq",
-                .data           = &sysctl_sched_pack_freq,
-                .maxlen         = sizeof(unsigned int),
-                .mode           = 0644,
-                .proc_handler   = proc_dointvec,
-        },
 #if defined(CONFIG_SCHED_FREQ_INPUT) && defined(CONFIG_SCHED_HMP)
         {
                 .procname       = "sched_new_task_windows",
@@ -457,6 +443,15 @@ static struct ctl_table kern_table[] = {
                 .extra2         = &one,
         },
         {
+                .procname       = "sched_restrict_cluster_spill",
+                .data           = &sysctl_sched_restrict_cluster_spill,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &zero,
+                .extra2         = &one,
+        },
+        {
                 .procname       = "sched_boost",
                 .data           = &sysctl_sched_boost,
                 .maxlen         = sizeof(unsigned int),
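Because the new kernel/sysctl.c entry above uses proc_dointvec_minmax with extra1/extra2 pointing at zero and one, writes outside the 0..1 range should be rejected rather than clamped. A quick userspace check of that (a hypothetical test, not part of the patch; needs root and a kernel carrying this change):

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *knob = "/proc/sys/kernel/sched_restrict_cluster_spill";
        FILE *f = fopen(knob, "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        /* 2 is outside the allowed 0..1 range, so flushing the write should fail. */
        if (fputs("2\n", f) == EOF || fflush(f) != 0)
                printf("out-of-range write rejected: %s\n", strerror(errno));
        else
                printf("unexpected: out-of-range value was accepted\n");
        fclose(f);
        return 0;
}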
