author     Pavankumar Kondeti <pkondeti@codeaurora.org>  2015-12-04 06:34:03 +0530
committer  David Keitel <dkeitel@codeaurora.org>         2016-03-23 21:25:13 -0700
commit     6418f213abdc8823a7aa75fa4e504d3d83effa57 (patch)
tree       6a78e00e43d4bd8ecb8391aed2a532559ec923ac /kernel
parent     3004236139377ad667ce13fdda58f931992fd7cc (diff)
sched: Revise the inter cluster load balance restrictions
The frequency based inter cluster load balance restrictions are not reliable as frequency does not provide a good estimate of the CPU's current load. Replace them with the spill_load and spill_nr_run based checks. The higher capacity cluster is restricted from pulling the tasks from the lower capacity cluster unless all of the lower capacity CPUs are above spill. This behavior can be controlled by a sysctl tunable and it is disabled by default (i.e. no load balance restrictions).

Change-Id: I45c09c8adcb61a8a7d4e08beadf2f97f1805fb42
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
[joonwoop@codeaurora.org: fixed merge conflicts due to omitted changes for CONFIG_SCHED_QHMP.]
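For reference, the tunable added in kernel/sysctl.c below is clamped to 0 or 1 (proc_dointvec_minmax with extra1 = zero, extra2 = one) and defaults to 0, so the new restriction stays off until explicitly enabled. A minimal, illustrative userspace sketch for flipping it at runtime, assuming the patched kernel exposes the entry under /proc/sys/kernel/, could look like:

    /* Illustrative only: enable the spill-based inter-cluster balance
     * restriction introduced by this patch. Write "0" (the default) to
     * restore unrestricted inter-cluster load balancing. */
    #include <stdio.h>

    int main(void)
    {
            const char *path = "/proc/sys/kernel/sched_restrict_cluster_spill";
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return 1;
            }
            fputs("1\n", f);
            fclose(f);
            return 0;
    }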
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c   |   8
-rw-r--r--  kernel/sched/fair.c   | 115
-rw-r--r--  kernel/sched/sched.h  |  14
-rw-r--r--  kernel/sysctl.c       |  23
4 files changed, 116 insertions(+), 44 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da693099cc40..3eb27a016003 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1314,7 +1314,7 @@ static struct sched_cluster init_cluster = {
void update_all_clusters_stats(void)
{
struct sched_cluster *cluster;
- u64 highest_mpc = 0;
+ u64 highest_mpc = 0, lowest_mpc = U64_MAX;
pre_big_task_count_change(cpu_possible_mask);
@@ -1328,9 +1328,13 @@ void update_all_clusters_stats(void)
if (mpc > highest_mpc)
highest_mpc = mpc;
+
+ if (mpc < lowest_mpc)
+ lowest_mpc = mpc;
}
max_possible_capacity = highest_mpc;
+ min_max_possible_capacity = lowest_mpc;
__update_min_max_capacity();
sched_update_freq_max_load(cpu_possible_mask);
@@ -1696,6 +1700,8 @@ unsigned int min_max_freq = 1;
unsigned int max_capacity = 1024; /* max(rq->capacity) */
unsigned int min_capacity = 1024; /* min(rq->capacity) */
unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
+unsigned int
+min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = 10000000;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1b64a6ae333c..1cb4d18b1039 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2702,9 +2702,6 @@ unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
*/
unsigned int __read_mostly sysctl_sched_enable_power_aware = 0;
-unsigned int __read_mostly sysctl_sched_lowspill_freq;
-unsigned int __read_mostly sysctl_sched_pack_freq = UINT_MAX;
-
/*
* CPUs with load greater than the sched_spill_load_threshold are not
* eligible for task placement. When all CPUs in a cluster achieve a
@@ -2774,6 +2771,8 @@ static unsigned int __read_mostly
sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
+unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
+
void update_up_down_migrate(void)
{
unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
@@ -8053,9 +8052,10 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
{
int local_cpu, busiest_cpu;
int local_capacity, busiest_capacity;
- unsigned int local_freq, busiest_freq, busiest_max_freq;
+ int local_pwr_cost, busiest_pwr_cost;
+ int nr_cpus;
- if (sched_boost())
+ if (!sysctl_sched_restrict_cluster_spill || sched_boost())
return 0;
local_cpu = group_first_cpu(sds->local);
@@ -8063,21 +8063,24 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
local_capacity = cpu_max_possible_capacity(local_cpu);
busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
- local_freq = cpu_cur_freq(local_cpu);
- busiest_freq = cpu_cur_freq(busiest_cpu);
- busiest_max_freq = cpu_max_freq(busiest_cpu);
- if (local_capacity < busiest_capacity) {
- if (local_freq >= sysctl_sched_pack_freq &&
- busiest_freq < busiest_max_freq)
- return 1;
- } else if (local_capacity > busiest_capacity) {
- if (sds->busiest_stat.sum_nr_big_tasks)
- return 0;
+ local_pwr_cost = cpu_max_power_cost(local_cpu);
+ busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
- if (busiest_freq <= sysctl_sched_lowspill_freq)
- return 1;
- }
+ if (local_capacity < busiest_capacity ||
+ (local_capacity == busiest_capacity &&
+ local_pwr_cost <= busiest_pwr_cost))
+ return 0;
+
+ if (local_capacity > busiest_capacity &&
+ sds->busiest_stat.sum_nr_big_tasks)
+ return 0;
+
+ nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+ if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
+ (sds->busiest_stat.sum_nr_running <
+ nr_cpus * sysctl_sched_spill_nr_run))
+ return 1;
return 0;
}
@@ -9690,7 +9693,7 @@ static struct {
} nohz ____cacheline_aligned;
#ifdef CONFIG_SCHED_HMP
-static inline int find_new_hmp_ilb(void)
+static inline int find_new_hmp_ilb(int type)
{
int call_cpu = raw_smp_processor_id();
struct sched_domain *sd;
@@ -9702,7 +9705,12 @@ static inline int find_new_hmp_ilb(void)
for_each_domain(call_cpu, sd) {
for_each_cpu_and(ilb, nohz.idle_cpus_mask,
sched_domain_span(sd)) {
- if (idle_cpu(ilb)) {
+ if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
+ (hmp_capable() &&
+ cpu_max_possible_capacity(ilb) <=
+ cpu_max_possible_capacity(call_cpu)) ||
+ cpu_max_power_cost(ilb) <=
+ cpu_max_power_cost(call_cpu))) {
rcu_read_unlock();
reset_balance_interval(ilb);
return ilb;
@@ -9720,12 +9728,12 @@ static inline int find_new_hmp_ilb(void)
}
#endif /* CONFIG_SCHED_HMP */
-static inline int find_new_ilb(void)
+static inline int find_new_ilb(int type)
{
int ilb;
if (sched_enable_hmp)
- return find_new_hmp_ilb();
+ return find_new_hmp_ilb(type);
ilb = cpumask_first(nohz.idle_cpus_mask);
@@ -9740,13 +9748,13 @@ static inline int find_new_ilb(void)
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(type);
if (ilb_cpu >= nr_cpu_ids)
return;
@@ -10031,7 +10039,51 @@ end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
-static inline int _nohz_kick_needed(struct rq *rq, int cpu)
+#ifdef CONFIG_SCHED_HMP
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ struct sched_domain *sd;
+ int i;
+
+ if (rq->nr_running < 2)
+ return 0;
+
+ if (!sysctl_sched_restrict_cluster_spill)
+ return 1;
+
+ if (hmp_capable() && cpu_max_possible_capacity(cpu) ==
+ max_possible_capacity)
+ return 1;
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (cpu_load(i) < sched_spill_load &&
+ cpu_rq(i)->nr_running <
+ sysctl_sched_spill_nr_run) {
+ /* Change the kick type to limit to CPUs that
+ * are of equal or lower capacity.
+ */
+ *type = NOHZ_KICK_RESTRICT;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return 1;
+}
+#else
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ return 0;
+}
+#endif
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
{
unsigned long now = jiffies;
@@ -10042,6 +10094,9 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu)
if (likely(!atomic_read(&nohz.nr_cpus)))
return 0;
+ if (sched_enable_hmp)
+ return _nohz_kick_needed_hmp(rq, cpu, type);
+
if (time_before(now, nohz.next_balance))
return 0;
@@ -10059,7 +10114,7 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu)
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
{
#ifndef CONFIG_SCHED_HMP
struct sched_domain *sd;
@@ -10079,7 +10134,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
set_cpu_sd_state_busy();
nohz_balance_exit_idle(cpu);
- if (_nohz_kick_needed(rq, cpu))
+ if (_nohz_kick_needed(rq, cpu, type))
return true;
#ifndef CONFIG_SCHED_HMP
@@ -10148,6 +10203,8 @@ static void run_rebalance_domains(struct softirq_action *h)
*/
void trigger_load_balance(struct rq *rq)
{
+ int type = NOHZ_KICK_ANY;
+
/* Don't need to rebalance while attached to NULL domain */
if (unlikely(on_null_domain(rq)))
return;
@@ -10155,8 +10212,8 @@ void trigger_load_balance(struct rq *rq)
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
+ if (nohz_kick_needed(rq, &type))
+ nohz_balancer_kick(type);
#endif
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6cd1dc3b6267..2390f927f8c2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1036,6 +1036,7 @@ extern unsigned int max_capacity;
extern unsigned int min_capacity;
extern unsigned int max_load_scale_factor;
extern unsigned int max_possible_capacity;
+extern unsigned int min_max_possible_capacity;
extern unsigned int sched_upmigrate;
extern unsigned int sched_downmigrate;
extern unsigned int sched_init_task_load_pelt;
@@ -1101,6 +1102,16 @@ static inline int same_cluster(int src_cpu, int dst_cpu)
return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
}
+static inline int cpu_max_power_cost(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_power_cost;
+}
+
+static inline bool hmp_capable(void)
+{
+ return max_possible_capacity != min_max_possible_capacity;
+}
+
/*
* 'load' is in reference to "best cpu" at its best frequency.
* Scale that in reference to a given cpu, accounting for how bad it is
@@ -2212,6 +2223,9 @@ enum rq_nohz_flag_bits {
NOHZ_BALANCE_KICK,
};
+#define NOHZ_KICK_ANY 0
+#define NOHZ_KICK_RESTRICT 1
+
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 878b64bfcc7a..6d637a744db6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -410,20 +410,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
},
- {
- .procname = "sched_lowspill_freq",
- .data = &sysctl_sched_lowspill_freq,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sched_pack_freq",
- .data = &sysctl_sched_pack_freq,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#if defined(CONFIG_SCHED_FREQ_INPUT) && defined(CONFIG_SCHED_HMP)
{
.procname = "sched_new_task_windows",
@@ -457,6 +443,15 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
{
+ .procname = "sched_restrict_cluster_spill",
+ .data = &sysctl_sched_restrict_cluster_spill,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "sched_boost",
.data = &sysctl_sched_boost,
.maxlen = sizeof(unsigned int),