 Documentation/scheduler/sched-zone.txt |  41
 include/linux/sched/sysctl.h           |   3
 kernel/sched/core.c                    |   8
 kernel/sched/fair.c                    | 115
 kernel/sched/sched.h                   |  14
 kernel/sysctl.c                        |  23
 6 files changed, 129 insertions(+), 75 deletions(-)
diff --git a/Documentation/scheduler/sched-zone.txt b/Documentation/scheduler/sched-zone.txt
index 4372e5c788c7..28313f144882 100644
--- a/Documentation/scheduler/sched-zone.txt
+++ b/Documentation/scheduler/sched-zone.txt
@@ -1238,35 +1238,6 @@ power mode. It ignores the actual D-state that a cluster may be in and assumes
the worst case power cost of the highest D-state. It is a means of biasing task
placement away from idle clusters when necessary.
-
-*** 7.21 sched_lowspill_freq
-
-Default value: 0
-
-Appears at /proc/sys/kernel/sched_lowspill_freq
-
-This is the first of two tunables designed to govern the load balancer behavior
-at various frequency levels. This tunable defines the frequency of the little
-cluster below which the big cluster is not permitted to pull tasks from the
-little cluster as part of load balance. The idea is that below a certain
-frequency, a cluster has enough remaining capacity that may not necessitate
-migration of tasks. This helps in achieving consolidation of workload within
-the little cluster when needed.
-
-*** 7.22 sched_pack_freq
-
-Default value: INT_MAX
-
-Appears at /proc/sys/kernel/sched_pack_freq
-
-This is the second of two tunables designed to govern the load balancer behavior
-at various frequency levels. This tunable defines the frequency of the little
-cluster beyond which the little cluster is now allowed to pull tasks from the
-big cluster as part of load balance. The idea is that above a certain frequency
-threshold the little cluster may not want to pull additional work from another
-cluster. This helps in achieving consolidation of workload within the big
-cluster when needed.
-
*** 7.23 sched_early_detection_duration
Default value: 9500000
@@ -1278,6 +1249,18 @@ tick for it to be eligible for the scheduler's early detection feature
under scheduler boost. For more information on the feature itself please
refer to section 5.2.1.
+*** 7.24 sched_restrict_cluster_spill
+
+Default value: 0
+
+Appears at /proc/sys/kernel/sched_restrict_cluster_spill
+
+This tunable can be used to restrict the higher capacity cluster from pulling
+tasks from the lower capacity cluster in the load balance path. The restriction
+is lifted once all of the CPUs in the lower capacity cluster are above the
+spill threshold. When the clusters have the same capacity, power cost is used
+to break the tie for applying this restriction.
+
=========================
8. HMP SCHEDULER TRACE POINTS
=========================
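
As a rough illustration of the policy described in section 7.24 above, the
standalone sketch below models the decision in plain C. Everything here (the
struct, the helper names, the capacity and load numbers) is invented for the
example; the real logic lives in bail_inter_cluster_balance() in
kernel/sched/fair.c, shown later in this patch.

```c
/*
 * Simplified model of the sched_restrict_cluster_spill policy. All types
 * and helpers are illustrative stand-ins, not the kernel's own.
 */
#include <stdbool.h>
#include <stdio.h>

struct cluster {
	int capacity;		/* max possible capacity */
	int power_cost;		/* max power cost, used as tie-breaker */
	int nr_cpus;
	int cpu_load[8];	/* per-CPU load, illustrative */
	int spill_load;		/* per-CPU spill threshold */
};

/* The restriction is lifted only once every CPU in the cluster is above spill. */
static bool cluster_above_spill(const struct cluster *c)
{
	for (int i = 0; i < c->nr_cpus; i++)
		if (c->cpu_load[i] < c->spill_load)
			return false;
	return true;
}

/* May the local (destination) cluster pull from the busiest cluster? */
static bool allow_pull(const struct cluster *local, const struct cluster *busiest)
{
	bool local_is_higher = local->capacity > busiest->capacity ||
		(local->capacity == busiest->capacity &&
		 local->power_cost > busiest->power_cost);

	if (!local_is_higher)
		return true;	/* the restriction only targets the bigger cluster */

	return cluster_above_spill(busiest);
}

int main(void)
{
	struct cluster little = { .capacity = 512, .power_cost = 100, .nr_cpus = 4,
				  .cpu_load = { 30, 20, 10, 5 }, .spill_load = 90 };
	struct cluster big = { .capacity = 1024, .power_cost = 300, .nr_cpus = 4 };

	printf("big may pull from little: %s\n",
	       allow_pull(&big, &little) ? "yes" : "no");
	return 0;
}
```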
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 395b7c25250b..676502dec830 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -66,11 +66,10 @@ extern unsigned int sysctl_sched_upmigrate_pct;
extern unsigned int sysctl_sched_downmigrate_pct;
extern int sysctl_sched_upmigrate_min_nice;
extern unsigned int sysctl_early_detection_duration;
-extern unsigned int sysctl_sched_lowspill_freq;
-extern unsigned int sysctl_sched_pack_freq;
extern unsigned int sysctl_sched_boost;
extern unsigned int sysctl_sched_select_prev_cpu_us;
extern unsigned int sysctl_sched_enable_colocation;
+extern unsigned int sysctl_sched_restrict_cluster_spill;
#if defined(CONFIG_SCHED_FREQ_INPUT)
extern unsigned int sysctl_sched_new_task_windows;
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da693099cc40..3eb27a016003 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1314,7 +1314,7 @@ static struct sched_cluster init_cluster = {
void update_all_clusters_stats(void)
{
struct sched_cluster *cluster;
- u64 highest_mpc = 0;
+ u64 highest_mpc = 0, lowest_mpc = U64_MAX;
pre_big_task_count_change(cpu_possible_mask);
@@ -1328,9 +1328,13 @@ void update_all_clusters_stats(void)
if (mpc > highest_mpc)
highest_mpc = mpc;
+
+ if (mpc < lowest_mpc)
+ lowest_mpc = mpc;
}
max_possible_capacity = highest_mpc;
+ min_max_possible_capacity = lowest_mpc;
__update_min_max_capacity();
sched_update_freq_max_load(cpu_possible_mask);
@@ -1696,6 +1700,8 @@ unsigned int min_max_freq = 1;
unsigned int max_capacity = 1024; /* max(rq->capacity) */
unsigned int min_capacity = 1024; /* min(rq->capacity) */
unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
+unsigned int
+min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = 10000000;
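
Tracking the lowest cluster capacity alongside the highest is what lets the
scheduler later tell symmetric and asymmetric systems apart via hmp_capable()
(added to kernel/sched/sched.h below). A minimal userspace sketch of that
bookkeeping, with invented capacity values in place of real cluster data:

```c
/*
 * Sketch of the min/max cluster-capacity bookkeeping added above. In the
 * kernel the loop runs over sched_cluster structures inside
 * update_all_clusters_stats(); the values here are illustrative.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t max_possible_capacity;
static uint64_t min_max_possible_capacity = UINT64_MAX;

/* A system is "HMP capable" only if cluster capacities actually differ. */
static bool hmp_capable(void)
{
	return max_possible_capacity != min_max_possible_capacity;
}

int main(void)
{
	uint64_t cluster_capacity[] = { 1024, 512 };	/* big, little (made up) */

	for (size_t i = 0; i < sizeof(cluster_capacity) / sizeof(cluster_capacity[0]); i++) {
		uint64_t mpc = cluster_capacity[i];

		if (mpc > max_possible_capacity)
			max_possible_capacity = mpc;
		if (mpc < min_max_possible_capacity)
			min_max_possible_capacity = mpc;
	}

	printf("hmp_capable: %s\n", hmp_capable() ? "yes" : "no");
	return 0;
}
```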
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1b64a6ae333c..1cb4d18b1039 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2702,9 +2702,6 @@ unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
*/
unsigned int __read_mostly sysctl_sched_enable_power_aware = 0;
-unsigned int __read_mostly sysctl_sched_lowspill_freq;
-unsigned int __read_mostly sysctl_sched_pack_freq = UINT_MAX;
-
/*
* CPUs with load greater than the sched_spill_load_threshold are not
* eligible for task placement. When all CPUs in a cluster achieve a
@@ -2774,6 +2771,8 @@ static unsigned int __read_mostly
sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
+unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
+
void update_up_down_migrate(void)
{
unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
@@ -8053,9 +8052,10 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
{
int local_cpu, busiest_cpu;
int local_capacity, busiest_capacity;
- unsigned int local_freq, busiest_freq, busiest_max_freq;
+ int local_pwr_cost, busiest_pwr_cost;
+ int nr_cpus;
- if (sched_boost())
+ if (!sysctl_sched_restrict_cluster_spill || sched_boost())
return 0;
local_cpu = group_first_cpu(sds->local);
@@ -8063,21 +8063,24 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
local_capacity = cpu_max_possible_capacity(local_cpu);
busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
- local_freq = cpu_cur_freq(local_cpu);
- busiest_freq = cpu_cur_freq(busiest_cpu);
- busiest_max_freq = cpu_max_freq(busiest_cpu);
- if (local_capacity < busiest_capacity) {
- if (local_freq >= sysctl_sched_pack_freq &&
- busiest_freq < busiest_max_freq)
- return 1;
- } else if (local_capacity > busiest_capacity) {
- if (sds->busiest_stat.sum_nr_big_tasks)
- return 0;
+ local_pwr_cost = cpu_max_power_cost(local_cpu);
+ busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
- if (busiest_freq <= sysctl_sched_lowspill_freq)
- return 1;
- }
+ if (local_capacity < busiest_capacity ||
+ (local_capacity == busiest_capacity &&
+ local_pwr_cost <= busiest_pwr_cost))
+ return 0;
+
+ if (local_capacity > busiest_capacity &&
+ sds->busiest_stat.sum_nr_big_tasks)
+ return 0;
+
+ nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+ if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
+ (sds->busiest_stat.sum_nr_running <
+ nr_cpus * sysctl_sched_spill_nr_run))
+ return 1;
return 0;
}
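
The final check in the rewritten bail_inter_cluster_balance() treats the
busiest group as "below spill" while both its aggregate load and its task
count stay under the per-CPU thresholds scaled by the group size. A small
standalone illustration of that arithmetic, with made-up threshold values in
place of sched_spill_load and sysctl_sched_spill_nr_run:

```c
/* Illustration of the group-level spill check; values are invented. */
#include <stdbool.h>
#include <stdio.h>

static bool group_below_spill(unsigned int group_cpu_load,
			      unsigned int sum_nr_running,
			      unsigned int nr_cpus,
			      unsigned int spill_load,
			      unsigned int spill_nr_run)
{
	return group_cpu_load < nr_cpus * spill_load &&
	       sum_nr_running < nr_cpus * spill_nr_run;
}

int main(void)
{
	/* 4-CPU little cluster, per-CPU spill load 90, spill nr_run 10 */
	unsigned int nr_cpus = 4, spill_load = 90, spill_nr_run = 10;

	/* group thresholds are 4 x 90 = 360 and 4 x 10 = 40 */
	printf("load 200, 12 tasks -> below spill: %d\n",
	       group_below_spill(200, 12, nr_cpus, spill_load, spill_nr_run));
	printf("load 380, 12 tasks -> below spill: %d\n",
	       group_below_spill(380, 12, nr_cpus, spill_load, spill_nr_run));
	return 0;
}
```

While the group is below spill the function returns 1 and the higher capacity
cluster skips the pull; once either threshold is crossed, the restriction is
lifted.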
@@ -9690,7 +9693,7 @@ static struct {
} nohz ____cacheline_aligned;
#ifdef CONFIG_SCHED_HMP
-static inline int find_new_hmp_ilb(void)
+static inline int find_new_hmp_ilb(int type)
{
int call_cpu = raw_smp_processor_id();
struct sched_domain *sd;
@@ -9702,7 +9705,12 @@ static inline int find_new_hmp_ilb(void)
for_each_domain(call_cpu, sd) {
for_each_cpu_and(ilb, nohz.idle_cpus_mask,
sched_domain_span(sd)) {
- if (idle_cpu(ilb)) {
+ if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
+ (hmp_capable() &&
+ cpu_max_possible_capacity(ilb) <=
+ cpu_max_possible_capacity(call_cpu)) ||
+ cpu_max_power_cost(ilb) <=
+ cpu_max_power_cost(call_cpu))) {
rcu_read_unlock();
reset_balance_interval(ilb);
return ilb;
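
For a restricted kick, find_new_hmp_ilb() only accepts an idle CPU that is no
"bigger" than the caller: same-or-lower maximum possible capacity on an
asymmetric system, or same-or-lower maximum power cost otherwise. A simplified
model of that eligibility test (the struct and values are illustrative, not
kernel types):

```c
/*
 * Simplified model of the ILB eligibility test added above. In the kernel
 * the checks use cpu_max_possible_capacity(), cpu_max_power_cost() and
 * hmp_capable(); everything here is a stand-in.
 */
#include <stdbool.h>
#include <stdio.h>

#define NOHZ_KICK_ANY		0
#define NOHZ_KICK_RESTRICT	1

struct cpu_info {
	bool idle;
	int max_possible_capacity;
	int max_power_cost;
};

static bool system_is_hmp = true;	/* cluster capacities differ (illustrative) */

static bool ilb_eligible(const struct cpu_info *ilb,
			 const struct cpu_info *caller, int type)
{
	if (!ilb->idle)
		return false;
	if (type != NOHZ_KICK_RESTRICT)
		return true;
	if (system_is_hmp &&
	    ilb->max_possible_capacity <= caller->max_possible_capacity)
		return true;
	return ilb->max_power_cost <= caller->max_power_cost;
}

int main(void)
{
	struct cpu_info little_caller = { true, 512, 100 };
	struct cpu_info big_idle = { true, 1024, 300 };

	printf("restricted kick may target the big CPU: %s\n",
	       ilb_eligible(&big_idle, &little_caller, NOHZ_KICK_RESTRICT) ?
	       "yes" : "no");
	return 0;
}
```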
@@ -9720,12 +9728,12 @@ static inline int find_new_hmp_ilb(void)
}
#endif /* CONFIG_SCHED_HMP */
-static inline int find_new_ilb(void)
+static inline int find_new_ilb(int type)
{
int ilb;
if (sched_enable_hmp)
- return find_new_hmp_ilb();
+ return find_new_hmp_ilb(type);
ilb = cpumask_first(nohz.idle_cpus_mask);
@@ -9740,13 +9748,13 @@ static inline int find_new_ilb(void)
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(type);
if (ilb_cpu >= nr_cpu_ids)
return;
@@ -10031,7 +10039,51 @@ end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
-static inline int _nohz_kick_needed(struct rq *rq, int cpu)
+#ifdef CONFIG_SCHED_HMP
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ struct sched_domain *sd;
+ int i;
+
+ if (rq->nr_running < 2)
+ return 0;
+
+ if (!sysctl_sched_restrict_cluster_spill)
+ return 1;
+
+ if (hmp_capable() && cpu_max_possible_capacity(cpu) ==
+ max_possible_capacity)
+ return 1;
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (cpu_load(i) < sched_spill_load &&
+ cpu_rq(i)->nr_running <
+ sysctl_sched_spill_nr_run) {
+ /* Change the kick type to limit to CPUs that
+ * are of equal or lower capacity.
+ */
+ *type = NOHZ_KICK_RESTRICT;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return 1;
+}
+#else
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ return 0;
+}
+#endif
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
{
unsigned long now = jiffies;
@@ -10042,6 +10094,9 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu)
if (likely(!atomic_read(&nohz.nr_cpus)))
return 0;
+ if (sched_enable_hmp)
+ return _nohz_kick_needed_hmp(rq, cpu, type);
+
if (time_before(now, nohz.next_balance))
return 0;
@@ -10059,7 +10114,7 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu)
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
{
#ifndef CONFIG_SCHED_HMP
struct sched_domain *sd;
@@ -10079,7 +10134,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
set_cpu_sd_state_busy();
nohz_balance_exit_idle(cpu);
- if (_nohz_kick_needed(rq, cpu))
+ if (_nohz_kick_needed(rq, cpu, type))
return true;
#ifndef CONFIG_SCHED_HMP
@@ -10148,6 +10203,8 @@ static void run_rebalance_domains(struct softirq_action *h)
*/
void trigger_load_balance(struct rq *rq)
{
+ int type = NOHZ_KICK_ANY;
+
/* Don't need to rebalance while attached to NULL domain */
if (unlikely(on_null_domain(rq)))
return;
@@ -10155,8 +10212,8 @@ void trigger_load_balance(struct rq *rq)
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
+ if (nohz_kick_needed(rq, &type))
+ nohz_balancer_kick(type);
#endif
}
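
The remaining fair.c hunks only thread the kick type from trigger_load_balance()
down to find_new_ilb(). A condensed, userspace-only view of that call flow,
with every kernel function replaced by a stub of the same name:

```c
/*
 * Condensed view of how the kick type flows through the nohz path after
 * this change. All functions are stand-ins; bodies are reduced to the
 * parts relevant to the type parameter.
 */
#include <stdio.h>

#define NOHZ_KICK_ANY		0
#define NOHZ_KICK_RESTRICT	1

/* _nohz_kick_needed_hmp() decides both whether and how to kick. */
static int nohz_kick_needed(int *type)
{
	*type = NOHZ_KICK_RESTRICT;	/* e.g. some CPU in the domain is below spill */
	return 1;
}

static int find_new_ilb(int type)
{
	/* type limits which idle CPUs qualify, as in find_new_hmp_ilb() */
	return type == NOHZ_KICK_RESTRICT ? 1 : 0;	/* illustrative CPU ids */
}

static void nohz_balancer_kick(int type)
{
	printf("kicking ilb cpu %d (type %d)\n", find_new_ilb(type), type);
}

static void trigger_load_balance(void)
{
	int type = NOHZ_KICK_ANY;

	if (nohz_kick_needed(&type))
		nohz_balancer_kick(type);
}

int main(void)
{
	trigger_load_balance();
	return 0;
}
```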
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6cd1dc3b6267..2390f927f8c2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1036,6 +1036,7 @@ extern unsigned int max_capacity;
extern unsigned int min_capacity;
extern unsigned int max_load_scale_factor;
extern unsigned int max_possible_capacity;
+extern unsigned int min_max_possible_capacity;
extern unsigned int sched_upmigrate;
extern unsigned int sched_downmigrate;
extern unsigned int sched_init_task_load_pelt;
@@ -1101,6 +1102,16 @@ static inline int same_cluster(int src_cpu, int dst_cpu)
return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
}
+static inline int cpu_max_power_cost(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_power_cost;
+}
+
+static inline bool hmp_capable(void)
+{
+ return max_possible_capacity != min_max_possible_capacity;
+}
+
/*
* 'load' is in reference to "best cpu" at its best frequency.
* Scale that in reference to a given cpu, accounting for how bad it is
@@ -2212,6 +2223,9 @@ enum rq_nohz_flag_bits {
NOHZ_BALANCE_KICK,
};
+#define NOHZ_KICK_ANY 0
+#define NOHZ_KICK_RESTRICT 1
+
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 878b64bfcc7a..6d637a744db6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -410,20 +410,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
},
- {
- .procname = "sched_lowspill_freq",
- .data = &sysctl_sched_lowspill_freq,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sched_pack_freq",
- .data = &sysctl_sched_pack_freq,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#if defined(CONFIG_SCHED_FREQ_INPUT) && defined(CONFIG_SCHED_HMP)
{
.procname = "sched_new_task_windows",
@@ -457,6 +443,15 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
{
+ .procname = "sched_restrict_cluster_spill",
+ .data = &sysctl_sched_restrict_cluster_spill,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "sched_boost",
.data = &sysctl_sched_boost,
.maxlen = sizeof(unsigned int),