author    Syed Rameez Mustafa <rameezmustafa@codeaurora.org>    2014-06-12 10:47:12 -0700
committer David Keitel <dkeitel@codeaurora.org>                 2016-03-23 19:59:32 -0700
commit    a536cf8ac858ac8388272d98f9baf8e7ba25b646 (patch)
tree      3d9e3a8e85c50d225d8f75a6237d40710b018b21 /kernel/sched
parent    8e7389b5c2130b62ceafb7cc9d6a2ae00bfbcf5a (diff)
sched: Introduce spill threshold tunables to manage overcommitment
When the number of tasks intended for a cluster exceeds the number of
mostly idle CPUs in that cluster, the scheduler currently freely uses
CPUs in other clusters if possible. While this is optimal for
performance, the power trade-off can be quite significant. Introduce
spill threshold tunables that govern the extent to which the scheduler
should attempt to contain tasks within a cluster.

Change-Id: I797e6c6b2aa0c3a376dad93758abe1d587663624
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
[rameezmustafa@codeaurora.org: Port to msm-3.18]
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
[joonwoop@codeaurora.org: fixed conflict in nohz_kick_needed()]
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--    kernel/sched/fair.c     241
-rw-r--r--    kernel/sched/sched.h      3
2 files changed, 199 insertions(+), 45 deletions(-)
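In short, with the defaults introduced here (sysctl_sched_spill_load_pct = 100,
sysctl_sched_spill_nr_run = 10), a CPU stops accepting further tasks once placing
another one would push its run-queue depth past 10 or its scaled runnable load past
max_task_load(). Below is a condensed sketch of that condition as implemented by
spill_threshold_crossed() in the diff that follows; cpu_load(), scale_task_load()
and task_load() are helpers from the existing HMP code, and this is an illustration
rather than the literal patch hunk:

    static int spill_threshold_crossed(struct task_struct *p, struct rq *rq, int cpu)
    {
            /* Projected scaled load on this CPU if task p were placed here. */
            u64 total_load = cpu_load(cpu) + scale_task_load(task_load(p), cpu);

            /* Spill once either the load or the run-queue depth threshold is hit. */
            return total_load > sched_spill_load ||
                   rq->nr_running + 1 > sysctl_sched_spill_nr_run;
    }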
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6fe51274c748..f9157f604041 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2566,6 +2566,13 @@ unsigned int max_task_load(void)
/* Use this knob to turn on or off HMP-aware task placement logic */
unsigned int __read_mostly sysctl_sched_enable_hmp_task_placement = 1;
+/* A cpu can no longer accommodate more tasks if:
+ *
+ * rq->nr_running > sysctl_sched_spill_nr_run ||
+ * rq->cumulative_runnable_avg > sched_spill_load
+ */
+unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
+
/*
* A cpu is considered practically idle, if:
*
@@ -2581,6 +2588,13 @@ unsigned int __read_mostly sysctl_sched_mostly_idle_nr_run = 3;
unsigned int __read_mostly sysctl_sched_enable_power_aware = 1;
/*
+ * This specifies the maximum percent power difference between 2
+ * CPUs for them to be considered identical in terms of their
+ * power characteristics (i.e. they are in the same power band).
+ */
+unsigned int __read_mostly sysctl_sched_powerband_limit_pct = 20;
+
+/*
* Conversion of *_pct to absolute form is based on max_task_load().
*
* For example:
@@ -2591,6 +2605,15 @@ unsigned int __read_mostly sched_mostly_idle_load;
unsigned int __read_mostly sysctl_sched_mostly_idle_load_pct = 20;
/*
+ * CPUs with load greater than sched_spill_load are not eligible for
+ * task placement. When all CPUs in a cluster reach a load higher than
+ * this level, tasks become eligible for inter-cluster migration.
+ */
+unsigned int __read_mostly sched_spill_load;
+unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
+
+/*
* Tasks whose bandwidth consumption on a cpu is less than
* sched_small_task are considered as small tasks.
*/
@@ -2632,6 +2655,9 @@ static inline int available_cpu_capacity(int cpu)
void set_hmp_defaults(void)
{
+ sched_spill_load =
+ pct_to_real(sysctl_sched_spill_load_pct);
+
sched_mostly_idle_load =
pct_to_real(sysctl_sched_mostly_idle_load_pct);
@@ -2688,6 +2714,33 @@ static inline int is_small_task(struct task_struct *p)
return load < sched_small_task;
}
+static inline u64 cpu_load(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return scale_task_load(rq->cumulative_runnable_avg, cpu);
+}
+
+static int
+spill_threshold_crossed(struct task_struct *p, struct rq *rq, int cpu)
+{
+ u32 total_load = cpu_load(cpu) + scale_task_load(task_load(p), cpu);
+
+ if (total_load > sched_spill_load ||
+ (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
+ return 1;
+
+ return 0;
+}
+
+int mostly_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return (cpu_load(cpu) <= sched_mostly_idle_load
+ && rq->nr_running <= sysctl_sched_mostly_idle_nr_run);
+}
+
/*
* Task will fit on a cpu if its bandwidth consumption on that cpu
* will be less than sched_upmigrate. A big task that was previously
@@ -2721,11 +2774,37 @@ static int task_will_fit(struct task_struct *p, int cpu)
return 0;
}
+static int eligible_cpu(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (mostly_idle_cpu(cpu))
+ return 1;
+
+ if (rq->capacity != max_capacity)
+ return !spill_threshold_crossed(p, rq, cpu);
+
+ return 0;
+}
+
struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
{
return NULL;
}
+int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost)
+{
+ int delta, cost_limit;
+
+ if (!base_cost || cpu_cost == base_cost)
+ return 0;
+
+ delta = cpu_cost - base_cost;
+ cost_limit = div64_u64((u64)sysctl_sched_powerband_limit_pct *
+ (u64)base_cost, 100);
+ return abs(delta) > cost_limit;
+}
+
unsigned int power_cost_at_freq(int cpu, unsigned int freq)
{
int i = 0;
@@ -2775,24 +2854,13 @@ static unsigned int power_cost(struct task_struct *p, int cpu)
return power_cost_at_freq(cpu, demand);
}
-
-int mostly_idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- u64 total_load;
-
- total_load = scale_task_load(rq->cumulative_runnable_avg, cpu);
-
- return (total_load <= sched_mostly_idle_load
- && rq->nr_running <= sysctl_sched_mostly_idle_nr_run);
-}
-
/* return cheapest cpu that can fit this task */
static int select_best_cpu(struct task_struct *p, int target)
{
int i, best_cpu = -1, fallback_idle_cpu = -1;
int prev_cpu = task_cpu(p);
- int cpu_cost, min_cost = INT_MAX, min_idle_cpu_cost = INT_MAX;
+ int cpu_cost, min_cost = INT_MAX;
+ int load, min_load = INT_MAX, min_fallback_load = INT_MAX;
int small_task = is_small_task(p);
trace_sched_task_load(p);
@@ -2805,28 +2873,42 @@ static int select_best_cpu(struct task_struct *p, int target)
task_will_fit(p, prev_cpu)) {
best_cpu = prev_cpu;
min_cost = power_cost(p, prev_cpu);
+ min_load = cpu_load(prev_cpu);
}
/* Todo : Optimize this loop */
for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
- if (!small_task && !mostly_idle_cpu(i))
- continue;
-
- /* Assume power_cost() returns same number for two
- * cpus that are nearly same in their power
- * rating.
- */
- cpu_cost = power_cost(p, i);
-
if (!task_will_fit(p, i)) {
- if (cpu_cost < min_idle_cpu_cost) {
- min_idle_cpu_cost = cpu_cost;
- fallback_idle_cpu = i;
+ if (mostly_idle_cpu(i)) {
+ load = cpu_load(i);
+ if (load < min_fallback_load) {
+ min_fallback_load = load;
+ fallback_idle_cpu = i;
+ }
}
} else {
- if (cpu_cost < min_cost) {
- min_cost = cpu_cost;
- best_cpu = i;
+ if (eligible_cpu(p, i)) {
+ cpu_cost = power_cost(p, i);
+ load = cpu_load(i);
+
+ if (power_delta_exceeded(cpu_cost, min_cost)) {
+ if (cpu_cost < min_cost) {
+ min_cost = cpu_cost;
+ min_load = load;
+ best_cpu = i;
+ }
+ } else {
+ if (load < min_load) {
+ min_load = load;
+ best_cpu = i;
+ } else if (load == min_load &&
+ cpu_cost < min_cost) {
+ best_cpu = i;
+ }
+
+ if (cpu_cost < min_cost)
+ min_cost = cpu_cost;
+ }
}
}
}
@@ -2919,8 +3001,9 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;
- if ((sysctl_sched_downmigrate_pct >
- sysctl_sched_upmigrate_pct) || *data > 100) {
+ if ((sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) ||
+ (sysctl_sched_mostly_idle_load_pct >
+ sysctl_sched_spill_load_pct) || *data > 100) {
*data = old_val;
return -EINVAL;
}
@@ -2948,20 +3031,24 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
return 0;
}
-static inline int find_new_hmp_ilb(void)
+static inline int find_new_hmp_ilb(int type)
{
int i;
int call_cpu = raw_smp_processor_id();
int best_cpu = nr_cpu_ids;
struct sched_domain *sd;
int min_cost = INT_MAX, cost;
+ struct rq *src_rq = cpu_rq(call_cpu);
+ struct rq *dst_rq;
rcu_read_lock();
/* Pick an idle cpu "closest" to call_cpu */
for_each_domain(call_cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i))
+ dst_rq = cpu_rq(i);
+ if (!idle_cpu(i) || (type == NOHZ_KICK_RESTRICT
+ && dst_rq->capacity > src_rq->capacity))
continue;
cost = power_cost_at_freq(i, min_max_freq);
@@ -3083,7 +3170,7 @@ static inline int select_best_cpu(struct task_struct *p, int target)
return 0;
}
-static inline int find_new_hmp_ilb(void)
+static inline int find_new_hmp_ilb(int type)
{
return 0;
}
@@ -3098,6 +3185,12 @@ static unsigned int power_cost_at_freq(int cpu, unsigned int freq)
return 1;
}
+static inline int
+spill_threshold_crossed(struct task_struct *p, struct rq *rq, int cpu)
+{
+ return 0;
+}
+
static inline int mostly_idle_cpu(int cpu)
{
return 0;
@@ -6742,6 +6835,7 @@ struct sg_lb_stats {
unsigned int sum_nr_running; /* Nr tasks running in the group */
#ifdef CONFIG_SCHED_HMP
unsigned long sum_nr_big_tasks, sum_nr_small_tasks;
+ unsigned long group_cpu_load; /* Scaled load of all CPUs of the group */
#endif
unsigned int idle_cpus;
unsigned int group_weight;
@@ -6794,6 +6888,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
static int
bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
{
+ int nr_cpus;
+
if (sds->local_stat.group_capacity <=
sds->busiest_stat.group_capacity)
return 0;
@@ -6803,9 +6899,10 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
sds->busiest_stat.group_capacity)
return 0;
- if ((sds->busiest_stat.sum_nr_running -
- sds->busiest_stat.sum_nr_small_tasks) <=
- sds->busiest_stat.group_capacity)
+ nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+ if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
+ (sds->busiest_stat.sum_nr_running <
+ nr_cpus * sysctl_sched_spill_nr_run))
return 1;
return 0;
@@ -7118,6 +7215,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
#ifdef CONFIG_SCHED_HMP
sgs->sum_nr_big_tasks += rq->nr_big_tasks;
sgs->sum_nr_small_tasks += rq->nr_small_tasks;
+ sgs->group_cpu_load += cpu_load(i);
#endif
#ifdef CONFIG_NUMA_BALANCING
@@ -8354,12 +8452,12 @@ static struct {
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
-static inline int find_new_ilb(void)
+static inline int find_new_ilb(int type)
{
int ilb;
if (sysctl_sched_enable_hmp_task_placement)
- return find_new_hmp_ilb();
+ return find_new_hmp_ilb(type);
ilb = cpumask_first(nohz.idle_cpus_mask);
@@ -8374,13 +8472,13 @@ static inline int find_new_ilb(void)
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(type);
if (ilb_cpu >= nr_cpu_ids)
return;
@@ -8693,6 +8791,52 @@ end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
+#ifdef CONFIG_SCHED_HMP
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
+{
+ struct sched_domain *sd;
+ int i;
+
+ if (rq->nr_running >= 2 && (rq->nr_running - rq->nr_small_tasks >= 2 ||
+ rq->nr_running > sysctl_sched_mostly_idle_nr_run ||
+ cpu_load(cpu) > sched_mostly_idle_load)) {
+
+ if (rq->capacity == max_capacity)
+ return 1;
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (cpu_load(i) < sched_spill_load) {
+ /* Change the kick type to limit to CPUs that
+ * are of equal or lower capacity.
+ */
+ *type = NOHZ_KICK_RESTRICT;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return 1;
+ }
+
+ return 0;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
+{
+ return (rq->nr_running >= 2);
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
/*
* Current heuristic for kicking the idle load balancer in the presence
* of an idle cpu in the system.
@@ -8704,12 +8848,15 @@ end:
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
{
unsigned long now = jiffies;
+#ifndef CONFIG_SCHED_HMP
struct sched_domain *sd;
struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
+ int nr_busy;
+#endif
+ int cpu = rq->cpu;
bool kick = false;
if (unlikely(rq->idle_balance))
@@ -8732,9 +8879,10 @@ static inline bool nohz_kick_needed(struct rq *rq)
if (time_before(now, nohz.next_balance))
return false;
- if (rq->nr_running >= 2 && !mostly_idle_cpu(cpu))
+ if (_nohz_kick_needed(rq, cpu, type))
return true;
+#ifndef CONFIG_SCHED_HMP
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd) {
@@ -8766,6 +8914,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
unlock:
rcu_read_unlock();
+#endif
return kick;
}
#else
@@ -8799,6 +8948,8 @@ static void run_rebalance_domains(struct softirq_action *h)
*/
void trigger_load_balance(struct rq *rq)
{
+ int type = NOHZ_KICK_ANY;
+
/* Don't need to rebalance while attached to NULL domain */
if (unlikely(on_null_domain(rq)))
return;
@@ -8806,8 +8957,8 @@ void trigger_load_balance(struct rq *rq)
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
+ if (nohz_kick_needed(rq, &type))
+ nohz_balancer_kick(type);
#endif
}
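To make the new power-band comparison in select_best_cpu() concrete, here is a
worked example of power_delta_exceeded() with the default
sysctl_sched_powerband_limit_pct of 20; the power-cost numbers are purely
illustrative, not taken from real power tables:

    /*
     * power_delta_exceeded(115, 100):
     *     delta = 15, cost_limit = 20 * 100 / 100 = 20
     *     abs(15) > 20 is false -> both CPUs fall in the same power band,
     *     so select_best_cpu() prefers the less loaded one and only breaks
     *     a load tie on power cost.
     *
     * power_delta_exceeded(130, 100):
     *     delta = 30, cost_limit = 20
     *     abs(30) > 20 is true -> the CPUs are in different power bands,
     *     so the cheaper CPU wins regardless of load.
     */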
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a0d35bbc2626..67e3de3b460d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1882,6 +1882,9 @@ enum rq_nohz_flag_bits {
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
+#define NOHZ_KICK_ANY 0
+#define NOHZ_KICK_RESTRICT 1
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
DECLARE_PER_CPU(u64, cpu_hardirq_time);
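For reference, the new NOHZ_KICK_* type threads through the nohz balancing path as
follows (a call-flow sketch of the functions changed above, not additional code):

    /*
     * trigger_load_balance()                       type starts as NOHZ_KICK_ANY
     *   -> nohz_kick_needed(rq, &type)
     *        -> _nohz_kick_needed(rq, cpu, type)   may set *type = NOHZ_KICK_RESTRICT
     *                                              when the kicking CPU is not of max
     *                                              capacity and some CPU in its domain
     *                                              is still below sched_spill_load
     *   -> nohz_balancer_kick(type)
     *        -> find_new_ilb(type)
     *             -> find_new_hmp_ilb(type)        with NOHZ_KICK_RESTRICT, skips idle
     *                                              CPUs of higher capacity than the
     *                                              kicking CPU
     */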