diff options
| author | Joonwoo Park <joonwoop@codeaurora.org> | 2015-07-27 16:52:12 -0700 |
|---|---|---|
| committer | David Keitel <dkeitel@codeaurora.org> | 2016-03-23 20:02:25 -0700 |
| commit | b4627e0104c72dd25048fdcd8dd38fad78ad9782 (patch) | |
| tree | fbea7cad871a4c227f6359dc119eab7926bb85fe /kernel/sched | |
| parent | 28f67e5a50d7c1bfc41cd7eb0f940f5daaa347c2 (diff) | |
sched: take into account of governor's frequency max load
At present the HMP scheduler packs tasks onto a busy CPU until the CPU's load is
100% to avoid waking up an idle CPU as much as possible. Such aggressive
packing leads to unintended CPU frequency raises, as the governor raises the busy
CPU's frequency when its load is more than the configured frequency max load,
which can be less than 100%.
Fix this by taking into account the governor's frequency max load and packing tasks
only when the CPU's projected load is less than the max load, to avoid
unnecessary frequency raises.
Change-Id: I4447e5e0c2fa5214ae7a9128f04fd7585ed0dcac
[joonwoop@codeaurora.org: fixed minor conflict in kernel/sched/sched.h]
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
Diffstat (limited to 'kernel/sched')
| -rw-r--r-- | kernel/sched/core.c | 95 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 11 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 8 |
3 files changed, 111 insertions, 3 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ebaeda755c91..cc3ba6ee00d8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1203,7 +1203,6 @@ __read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */ __read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */ static __read_mostly unsigned int sched_io_is_busy; - #endif /* CONFIG_SCHED_FREQ_INPUT */ /* 1 -> use PELT based load stats, 0 -> use window-based load stats */ @@ -1628,6 +1627,78 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, BUG(); } + +u32 __weak get_freq_max_load(int cpu, u32 freq) +{ + /* 100% by default */ + return 100; +} + +DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); + +int sched_update_freq_max_load(const cpumask_t *cpumask) +{ + int i, cpu, ret; + unsigned int freq, max; + struct cpu_pstate_pwr *costs; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct freq_max_load *max_load, *old_max_load; + + if (!per_cpu_info || !sysctl_sched_enable_power_aware) + return 0; + + mutex_lock(&policy_mutex); + for_each_cpu(cpu, cpumask) { + if (!per_cpu_info[cpu].ptable) { + ret = -EINVAL; + goto fail; + } + + old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + + /* + * allocate len + 1 and leave the last power cost as 0 for + * power_cost_at_freq() can stop iterating index when + * per_cpu_info[cpu].len > len of max_load due to race between + * cpu power stats update and get_cpu_pwr_stats(). 
+ */ + max_load = kzalloc(sizeof(struct freq_max_load) + + sizeof(u32) * (per_cpu_info[cpu].len + 1), + GFP_ATOMIC); + if (unlikely(!max_load)) { + ret = -ENOMEM; + goto fail; + } + + i = 0; + costs = per_cpu_info[cpu].ptable; + while (costs[i].freq) { + freq = costs[i].freq; + max = get_freq_max_load(cpu, freq); + max_load->freqs[i] = div64_u64((u64)freq * max, 100); + i++; + } + + rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load); + if (old_max_load) + kfree_rcu(old_max_load, rcu); + } + + mutex_unlock(&policy_mutex); + return 0; + +fail: + for_each_cpu(cpu, cpumask) { + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (max_load) { + rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL); + kfree_rcu(max_load, rcu); + } + } + + mutex_unlock(&policy_mutex); + return ret; +} #else /* CONFIG_SCHED_FREQ_INPUT */ static inline void update_cpu_busy_time(struct task_struct *p, struct rq *rq, @@ -2598,6 +2669,17 @@ static int cpufreq_notifier_trans(struct notifier_block *nb, return 0; } +static int pwr_stats_ready_notifier(struct notifier_block *nb, + unsigned long cpu, void *data) +{ + cpumask_t mask = CPU_MASK_NONE; + + cpumask_set_cpu(cpu, &mask); + sched_update_freq_max_load(&mask); + + return 0; +} + static struct notifier_block notifier_policy_block = { .notifier_call = cpufreq_notifier_policy }; @@ -2606,6 +2688,15 @@ static struct notifier_block notifier_trans_block = { .notifier_call = cpufreq_notifier_trans }; +static struct notifier_block notifier_pwr_stats_ready = { + .notifier_call = pwr_stats_ready_notifier +}; + +int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb) +{ + return -EINVAL; +} + static int register_sched_callback(void) { int ret; @@ -2620,6 +2711,8 @@ static int register_sched_callback(void) ret = cpufreq_register_notifier(¬ifier_trans_block, CPUFREQ_TRANSITION_NOTIFIER); + register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready); + return 0; } diff --git a/kernel/sched/fair.c 
b/kernel/sched/fair.c index a9f3199bdcf6..27e1a3d7bb05 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3063,6 +3063,7 @@ static unsigned int power_cost_at_freq(int cpu, unsigned int freq) int i = 0; struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); struct cpu_pstate_pwr *costs; + struct freq_max_load *max_load; if (!per_cpu_info || !per_cpu_info[cpu].ptable || !sysctl_sched_enable_power_aware) @@ -3075,12 +3076,18 @@ static unsigned int power_cost_at_freq(int cpu, unsigned int freq) costs = per_cpu_info[cpu].ptable; + rcu_read_lock(); + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); while (costs[i].freq != 0) { - if (costs[i].freq >= freq || - costs[i+1].freq == 0) + if (costs[i+1].freq == 0 || + (costs[i].freq >= freq && + (!max_load || max_load->freqs[i] >= freq))) { + rcu_read_unlock(); return costs[i].power; + } i++; } + rcu_read_unlock(); BUG(); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 23376b43777d..2545fe83e8cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -27,6 +27,14 @@ extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); + +struct freq_max_load { + struct rcu_head rcu; + u32 freqs[0]; +}; + +extern DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); + extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP |
