sched: add support for CPU frequency estimation with cycle counter

At present, the scheduler calculates a task's demand from the task's execution time weighted by CPU frequency. The CPU frequency is given by the governor's CPU frequency transition notification. Such a notification may not be available. Provide an API for the CPU clock driver to register callback functions so that the scheduler can access the CPU's cycle counter and estimate the CPU's frequency without the notification. At this point the scheduler assumes the cycle counter always increases, even when the cluster is idle, which might not be true. This will be fixed by a subsequent change for more accurate I/O wait time accounting. CRs-fixed: 1006303 Change-Id: I93b187efd7bc225db80da0184683694f5ab99738 Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
author: Joonwoo Park <joonwoop@codeaurora.org> 2016-03-08 13:46:04 -0800
committer: Kyle Yan <kyan@codeaurora.org> 2016-04-27 19:13:05 -0700
commit: 35f1d99e0a3ad7f1b15ca2085ca92fd545dd01de (patch)
tree: a52e8cb3208ee558621098b70e3b41645a9c1ef3 /kernel
parent: 343dcf1ecc085671982de5de6212dbad827bbf1a (diff)
3 files changed, 169 insertions, 39 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0cdd0cf0718f..3afa3c5d5ebd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -797,6 +797,9 @@ sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
 static ktime_t ktime_last;
 static bool sched_ktime_suspended;
 
+static bool use_cycle_counter;
+static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
+
 u64 sched_ktime_clock(void)
 {
 	if (unlikely(sched_ktime_suspended))
@@ -1398,6 +1401,7 @@ static struct sched_cluster init_cluster = {
 	.max_freq		=	1,
 	.min_freq		=	1,
 	.max_possible_freq	=	1,
+	.cpu_cycle_max_scale_factor	= 1,
 	.dstate			=	0,
 	.dstate_wakeup_energy	=	0,
 	.dstate_wakeup_latency	=	0,
@@ -1546,6 +1550,7 @@ static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
 	cluster->max_freq		=	1;
 	cluster->min_freq		=	1;
 	cluster->max_possible_freq	=	1;
+	cluster->cpu_cycle_max_scale_factor =	1;
 	cluster->dstate			=	0;
 	cluster->dstate_wakeup_energy	=	0;
 	cluster->dstate_wakeup_latency	=	0;
@@ -1612,6 +1617,44 @@ static void init_clusters(void)
 	INIT_LIST_HEAD(&cluster_head);
 }
 
+/*
+ * Recompute cluster->cpu_cycle_max_scale_factor as
+ *	max_possible_freq * NSEC_PER_USEC / get_cpu_cycles_max_per_us(cpu)
+ * where cpu is the one returned by cluster_first_cpu() for @cluster.
+ *
+ * The registered callback is invoked unconditionally, so this must only be
+ * called once cpu_cycle_counter_cb has been populated.  If the driver
+ * reports zero cycles per microsecond, the previous factor is kept rather
+ * than dividing by zero.
+ */
+static inline void
+__update_cpu_cycle_max_possible_freq(struct sched_cluster *cluster)
+{
+	int cpu = cluster_first_cpu(cluster);
+	u64 max_cycles_per_us =
+		cpu_cycle_counter_cb.get_cpu_cycles_max_per_us(cpu);
+
+	if (unlikely(!max_cycles_per_us))
+		return;
+
+	/* Widen before multiplying so the product is computed in 64 bits. */
+	cluster->cpu_cycle_max_scale_factor =
+	    div64_u64((u64)cluster->max_possible_freq * NSEC_PER_USEC,
+		      max_cycles_per_us);
+}
+
+/*
+ * Refresh @cluster's cycle-counter scale factor, but only when
+ * use_cycle_counter is set; otherwise this is a no-op.
+ */
+static inline void
+update_cpu_cycle_max_possible_freq(struct sched_cluster *cluster)
+{
+	if (use_cycle_counter)
+		__update_cpu_cycle_max_possible_freq(cluster);
+}
+
+/*
+ * register_cpu_cycle_counter_cb() - install CPU cycle counter callbacks.
+ * @cb: callbacks to copy; both get_cpu_cycle_counter and
+ *      get_cpu_cycles_max_per_us must be set.
+ *
+ * Copies *@cb into cpu_cycle_counter_cb, recomputes the cycle scale factor
+ * of every cluster and then sets use_cycle_counter, all under cluster_lock.
+ *
+ * Return: 0 on success, -EINVAL if @cb is NULL or a callback is missing.
+ */
+int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
+{
+	struct sched_cluster *cluster = NULL;
+
+	/* Validation touches no shared state, so do it before locking. */
+	if (!cb || !cb->get_cpu_cycle_counter ||
+	    !cb->get_cpu_cycles_max_per_us)
+		return -EINVAL;
+
+	mutex_lock(&cluster_lock);
+	cpu_cycle_counter_cb = *cb;
+	for_each_sched_cluster(cluster)
+		__update_cpu_cycle_max_possible_freq(cluster);
+	/* Set the flag last, once callbacks and scale factors are in place. */
+	use_cycle_counter = true;
+	mutex_unlock(&cluster_lock);
+
+	return 0;
+}
+
 static int __init set_sched_enable_hmp(char *str)
 {
 	int enable_hmp = 0;
@@ -1718,12 +1761,24 @@ static inline void clear_boost_kick(int cpu) { }
 
 static inline void clear_hmp_request(int cpu) { }
 
+int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
+{
+	return 0;	/* stub: @cb is ignored, nothing is registered, success is reported */
+}
+
 #ifdef CONFIG_SMP
 static void update_cluster_topology(void) { }
 #endif
 
 #endif	/* CONFIG_SCHED_HMP */
 
+#define SCHED_MIN_FREQ 1
+
+struct cpu_cycle {
+	u64 cycles;	/* cycle-counter delta, or cpu_cur_freq() when no cycle counter is in use */
+	u64 time;	/* wallclock - mark_start matching that delta, or 1 when no cycle counter is in use */
+};
+
 #if defined(CONFIG_SCHED_HMP)
 
 /*
@@ -1865,19 +1920,17 @@ update_window_start(struct rq *rq, u64 wallclock)
 	rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
 }
 
-static inline u64 scale_exec_time(u64 delta, struct rq *rq)
+#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
+
+static inline u64 scale_exec_time(u64 delta, struct rq *rq,
+				  const struct cpu_cycle *cc)
 {
 	int cpu = cpu_of(rq);
-	unsigned int cur_freq = cpu_cur_freq(cpu);
 	int sf;
 
-	if (unlikely(cur_freq > max_possible_freq))
-		cur_freq = max_possible_freq;
-
-	/* round up div64 */
-	delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
-			  max_possible_freq);
-
+	delta = DIV64_U64_ROUNDUP(delta * cc->cycles *
+				  cpu_cycle_max_scale_factor(cpu),
+				  max_possible_freq * cc->time);
 	sf = DIV_ROUND_UP(cpu_efficiency(cpu) * 1024, max_possible_efficiency);
 
 	delta *= sf;
@@ -2251,7 +2304,8 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
  * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
  */
 static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
-	     int event, u64 wallclock, u64 irqtime)
+				 int event, u64 wallclock, u64 irqtime,
+				 const struct cpu_cycle *cc)
 {
 	int new_window, nr_full_windows = 0;
 	int p_is_curr_task = (p == rq->curr);
@@ -2341,7 +2395,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			delta = wallclock - mark_start;
 		else
 			delta = irqtime;
-		delta = scale_exec_time(delta, rq);
+		delta = scale_exec_time(delta, rq, cc);
 		rq->curr_runnable_sum += delta;
 		if (new_task)
 			rq->nt_curr_runnable_sum += delta;
@@ -2366,14 +2420,15 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		if (!nr_full_windows) {
 			/* A full window hasn't elapsed, account partial
 			 * contribution to previous completed window. */
-			delta = scale_exec_time(window_start - mark_start, rq);
+			delta = scale_exec_time(window_start - mark_start, rq,
+						cc);
 			if (!exiting_task(p))
 				p->ravg.prev_window += delta;
 		} else {
 			/* Since at least one full window has elapsed,
 			 * the contribution to the previous window is the
 			 * full window (window_size). */
-			delta = scale_exec_time(window_size, rq);
+			delta = scale_exec_time(window_size, rq, cc);
 			if (!exiting_task(p))
 				p->ravg.prev_window = delta;
 		}
@@ -2382,7 +2437,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			rq->nt_prev_runnable_sum += delta;
 
 		/* Account piece of busy time in the current window. */
-		delta = scale_exec_time(wallclock - window_start, rq);
+		delta = scale_exec_time(wallclock - window_start, rq, cc);
 		rq->curr_runnable_sum += delta;
 		if (new_task)
 			rq->nt_curr_runnable_sum += delta;
@@ -2408,7 +2463,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		if (!nr_full_windows) {
 			/* A full window hasn't elapsed, account partial
 			 * contribution to previous completed window. */
-			delta = scale_exec_time(window_start - mark_start, rq);
+			delta = scale_exec_time(window_start - mark_start, rq,
+						cc);
 			if (!is_idle_task(p) && !exiting_task(p))
 				p->ravg.prev_window += delta;
 
@@ -2421,7 +2477,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			/* Since at least one full window has elapsed,
 			 * the contribution to the previous window is the
 			 * full window (window_size). */
-			delta = scale_exec_time(window_size, rq);
+			delta = scale_exec_time(window_size, rq, cc);
 			if (!is_idle_task(p) && !exiting_task(p))
 				p->ravg.prev_window = delta;
 
@@ -2439,7 +2495,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		rq->prev_runnable_sum = delta;
 
 		/* Account piece of busy time in the current window. */
-		delta = scale_exec_time(wallclock - window_start, rq);
+		delta = scale_exec_time(wallclock - window_start, rq, cc);
 		rq->curr_runnable_sum = delta;
 		if (new_task)
 			rq->nt_curr_runnable_sum = delta;
@@ -2471,7 +2527,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
 		rq->nt_curr_runnable_sum = 0;
 		if (mark_start > window_start) {
-			rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+			rq->curr_runnable_sum = scale_exec_time(irqtime, rq,
+								cc);
 			return;
 		}
 
@@ -2480,12 +2537,12 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		delta = window_start - mark_start;
 		if (delta > window_size)
 			delta = window_size;
-		delta = scale_exec_time(delta, rq);
+		delta = scale_exec_time(delta, rq, cc);
 		rq->prev_runnable_sum += delta;
 
 		/* Process the remaining IRQ busy time in the current window. */
 		delta = wallclock - window_start;
-		rq->curr_runnable_sum = scale_exec_time(delta, rq);
+		rq->curr_runnable_sum = scale_exec_time(delta, rq, cc);
 
 		return;
 	}
@@ -2515,7 +2572,7 @@ update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
 }
 
 static inline void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
-	     int event, u64 wallclock, u64 irqtime)
+	     int event, u64 wallclock, u64 irqtime, const struct cpu_cycle *cc)
 {
 }
 
@@ -2528,6 +2585,41 @@ static inline u32 predict_and_update_buckets(struct rq *rq,
 
 #endif	/* CONFIG_SCHED_FREQ_INPUT */
 
+/*
+ * Record the current reading of @cpu's cycle counter in p->cpu_cycles.
+ * Does nothing when use_cycle_counter is not set.
+ */
+static void update_task_cpu_cycles(struct task_struct *p, int cpu)
+{
+	if (!use_cycle_counter)
+		return;
+
+	p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+}
+
+/*
+ * Return the cycles consumed and the time elapsed since @p was last sampled.
+ *
+ * Without a cycle counter the result is { cpu_cur_freq(cpu), 1 }.
+ * Otherwise cycles is the counter delta since p->cpu_cycles, time is
+ * @wallclock - p->ravg.mark_start, p->cpu_cycles is advanced to the new
+ * reading and the sample is traced.
+ *
+ * NOTE(review): time is 0 when @wallclock equals p->ravg.mark_start, and
+ * both scale_exec_time() and cpu_cycles_to_freq() divide by it -- confirm
+ * callers can never get here with a zero interval.
+ */
+static struct cpu_cycle
+get_task_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
+		    u64 wallclock)
+{
+	u64 cur_cycles, prev_cycles;
+	struct cpu_cycle cc;
+	int cpu = cpu_of(rq);
+
+	if (!use_cycle_counter) {
+		cc.cycles = cpu_cur_freq(cpu);
+		cc.time = 1;
+		return cc;
+	}
+
+	cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+	prev_cycles = p->cpu_cycles;
+	/*
+	 * u64 subtraction is modulo 2^64, so this is the true delta even when
+	 * the counter has wrapped; "cur + (U64_MAX - prev)" is one cycle short.
+	 */
+	cc.cycles = cur_cycles - prev_cycles;
+	cc.time = wallclock - p->ravg.mark_start;
+	/* A wallclock earlier than mark_start means time went backwards. */
+	BUG_ON((s64)cc.time < 0);
+
+	p->cpu_cycles = cur_cycles;
+
+	trace_sched_get_task_cpu_cycles(cpu, event, cc.cycles, cc.time);
+
+	return cc;
+}
+
 static int account_busy_for_task_demand(struct task_struct *p, int event)
 {
 	/* No need to bother updating task demand for exiting tasks
@@ -2614,9 +2706,9 @@ done:
 }
 
 static void add_to_task_demand(struct rq *rq, struct task_struct *p,
-				u64 delta)
+				u64 delta, const struct cpu_cycle *cc)
 {
-	delta = scale_exec_time(delta, rq);
+	delta = scale_exec_time(delta, rq, cc);
 	p->ravg.sum += delta;
 	if (unlikely(p->ravg.sum > sched_ravg_window))
 		p->ravg.sum = sched_ravg_window;
@@ -2673,7 +2765,8 @@ static void add_to_task_demand(struct rq *rq, struct task_struct *p,
  * depends on it!
  */
 static void update_task_demand(struct task_struct *p, struct rq *rq,
-	     int event, u64 wallclock)
+			       int event, u64 wallclock,
+			       const struct cpu_cycle *cc)
 {
 	u64 mark_start = p->ravg.mark_start;
 	u64 delta, window_start = rq->window_start;
@@ -2696,7 +2789,7 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 	if (!new_window) {
 		/* The simple case - busy time contained within the existing
 		 * window. */
-		add_to_task_demand(rq, p, wallclock - mark_start);
+		add_to_task_demand(rq, p, wallclock - mark_start, cc);
 		return;
 	}
 
@@ -2707,12 +2800,12 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 	window_start -= (u64)nr_full_windows * (u64)window_size;
 
 	/* Process (window_start - mark_start) first */
-	add_to_task_demand(rq, p, window_start - mark_start);
+	add_to_task_demand(rq, p, window_start - mark_start, cc);
 
 	/* Push new sample(s) into task's demand history */
 	update_history(rq, p, p->ravg.sum, 1, event);
 	if (nr_full_windows)
-		update_history(rq, p, scale_exec_time(window_size, rq),
+		update_history(rq, p, scale_exec_time(window_size, rq, cc),
 			       nr_full_windows, event);
 
 	/* Roll window_start back to current to process any remainder
@@ -2721,30 +2814,39 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 
 	/* Process (wallclock - window_start) next */
 	mark_start = window_start;
-	add_to_task_demand(rq, p, wallclock - mark_start);
+	add_to_task_demand(rq, p, wallclock - mark_start, cc);
 }
 
 /* Reflect task activity on its demand and cpu's busy time statistics */
-static void update_task_ravg(struct task_struct *p, struct rq *rq,
-	     int event, u64 wallclock, u64 irqtime)
+static struct cpu_cycle
+update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+		 u64 wallclock, u64 irqtime)
 {
+	struct cpu_cycle cc = { .cycles = SCHED_MIN_FREQ, .time = 1 };
+
 	if (sched_use_pelt || !rq->window_start || sched_disable_window_stats)
-		return;
+		return cc;
 
 	lockdep_assert_held(&rq->lock);
 
 	update_window_start(rq, wallclock);
 
-	if (!p->ravg.mark_start)
+	if (!p->ravg.mark_start) {
+		update_task_cpu_cycles(p, cpu_of(rq));
 		goto done;
+	}
 
-	update_task_demand(p, rq, event, wallclock);
-	update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+	cc = get_task_cpu_cycles(p, rq, event, wallclock);
+	update_task_demand(p, rq, event, wallclock, &cc);
+	update_cpu_busy_time(p, rq, event, wallclock, irqtime, &cc);
 	update_task_pred_demand(rq, p, event);
 done:
-	trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime);
+	trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
+				     cc.cycles, cc.time);
 
 	p->ravg.mark_start = wallclock;
+
+	return cc;
 }
 
 void sched_account_irqtime(int cpu, struct task_struct *curr,
@@ -2812,6 +2914,7 @@ static inline void mark_task_starting(struct task_struct *p)
 	wallclock = sched_ktime_clock();
 	p->ravg.mark_start = p->last_wake_ts = wallclock;
 	p->last_switch_out_ts = 0;
+	update_task_cpu_cycles(p, cpu_of(rq));
 }
 
 static inline void set_window_start(struct rq *rq)
@@ -3029,6 +3132,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
 	int early_detection[cpus];
 	int cpu, i = 0;
 	unsigned int window_size;
+	struct cpu_cycle cc;
 
 	if (unlikely(cpus == 0))
 		return;
@@ -3047,8 +3151,10 @@ void sched_get_cpus_busy(struct sched_load *busy,
 	for_each_cpu(cpu, query_cpus) {
 		rq = cpu_rq(cpu);
 
-		update_task_ravg(rq->curr, rq, TASK_UPDATE,
-				 sched_ktime_clock(), 0);
+		cc = update_task_ravg(rq->curr, rq, TASK_UPDATE,
+				      sched_ktime_clock(), 0);
+		cur_freq[i] = cpu_cycles_to_freq(i, cc.cycles, cc.time);
+
 		load[i] = rq->old_busy_time = rq->prev_runnable_sum;
 		nload[i] = rq->nt_prev_runnable_sum;
 		pload[i] = rq->hmp_stats.pred_demands_sum;
@@ -3066,7 +3172,6 @@ void sched_get_cpus_busy(struct sched_load *busy,
 		notifier_sent[i] = rq->notifier_sent;
 		early_detection[i] = (rq->ed_task != NULL);
 		rq->notifier_sent = 0;
-		cur_freq[i] = cpu_cur_freq(cpu);
 		max_freq[i] = cpu_max_freq(cpu);
 		i++;
 	}
@@ -3212,6 +3317,8 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
 	update_task_ravg(p, task_rq(p), TASK_MIGRATE,
 			 wallclock, 0);
 
+	update_task_cpu_cycles(p, new_cpu);
+
 	new_task = is_new_task(p);
 
 	if (p->ravg.curr_window) {
@@ -3531,6 +3638,7 @@ static int cpufreq_notifier_policy(struct notifier_block *nb,
 
 			sort_clusters();
 			update_all_clusters_stats();
+			update_cpu_cycle_max_possible_freq(cluster);
 			mutex_unlock(&cluster_lock);
 			continue;
 		}
@@ -3685,10 +3793,16 @@ heavy_task_wakeup(struct task_struct *p, struct rq *rq, int event)
 	return 0;
 }
 
-static inline void
+static struct cpu_cycle
 update_task_ravg(struct task_struct *p, struct rq *rq,
 			 int event, u64 wallclock, u64 irqtime)
 {
+	static const struct cpu_cycle cc = {
+		.cycles = SCHED_MIN_FREQ,
+		.time = 1
+	};
+
+	return cc;
 }
 
 static inline void mark_task_starting(struct task_struct *p) {}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 997339470655..aa30f55dc5ee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4808,6 +4808,7 @@ void init_new_task_load(struct task_struct *p)
 	rcu_assign_pointer(p->grp, NULL);
 	INIT_LIST_HEAD(&p->grp_list);
 	memset(&p->ravg, 0, sizeof(struct ravg));
+	p->cpu_cycles = 0;
 
 	if (init_load_pct) {
 		init_load_pelt = div64_u64((u64)init_load_pct *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a11e74c191f3..19033bfc3f8e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -387,6 +387,11 @@ struct sched_cluster {
 	 * max_possible_freq = maximum supported by hardware
 	 */
 	unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
+	/*
+	 * cpu_cycle_max_scale_factor represents number of cycles per NSEC at
+	 * CPU's fmax.
+	 */
+	u32 cpu_cycle_max_scale_factor;
 	bool freq_init_done;
 	int dstate, dstate_wakeup_latency, dstate_wakeup_energy;
 	unsigned int static_cluster_pwr_cost;
@@ -1120,6 +1125,16 @@ static inline int cpu_max_power_cost(int cpu)
 	return cpu_rq(cpu)->cluster->max_power_cost;
 }
 
+/* Cycle-counter scale factor of the cluster that @cpu belongs to. */
+static inline int cpu_cycle_max_scale_factor(int cpu)
+{
+	const struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->cpu_cycle_max_scale_factor;
+}
+
+/*
+ * Convert @cycles counted over @period into a frequency using the scale
+ * factor of @cpu's cluster: cycles * factor / period.
+ *
+ * @period is u64 because the caller passes the u64 cpu_cycle time; a u32
+ * parameter silently truncated long intervals.  @period must be non-zero.
+ * The quotient is truncated to u32 on return.
+ */
+static inline u32 cpu_cycles_to_freq(int cpu, u64 cycles, u64 period)
+{
+	return div64_u64(cycles * cpu_cycle_max_scale_factor(cpu), period);
+}
+
 static inline bool hmp_capable(void)
 {
 	return max_possible_capacity != min_max_possible_capacity;
author	Joonwoo Park <joonwoop@codeaurora.org>	2016-03-08 13:46:04 -0800
committer	Kyle Yan <kyan@codeaurora.org>	2016-04-27 19:13:05 -0700
commit	35f1d99e0a3ad7f1b15ca2085ca92fd545dd01de (patch)
tree	a52e8cb3208ee558621098b70e3b41645a9c1ef3 /kernel
parent	343dcf1ecc085671982de5de6212dbad827bbf1a (diff)