3 files changed, 169 insertions, 39 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0cdd0cf0718f..3afa3c5d5ebd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -797,6 +797,9 @@ sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
 static ktime_t ktime_last;
 static bool sched_ktime_suspended;
 
+static bool use_cycle_counter;
+static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
+
 u64 sched_ktime_clock(void)
 {
 	if (unlikely(sched_ktime_suspended))
@@ -1398,6 +1401,7 @@ static struct sched_cluster init_cluster = {
 	.max_freq		=	1,
 	.min_freq		=	1,
 	.max_possible_freq	=	1,
+	.cpu_cycle_max_scale_factor	= 1,
 	.dstate			=	0,
 	.dstate_wakeup_energy	=	0,
 	.dstate_wakeup_latency	=	0,
@@ -1546,6 +1550,7 @@ static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
 	cluster->max_freq		=	1;
 	cluster->min_freq		=	1;
 	cluster->max_possible_freq	=	1;
+	cluster->cpu_cycle_max_scale_factor =	1;
 	cluster->dstate			=	0;
 	cluster->dstate_wakeup_energy	=	0;
 	cluster->dstate_wakeup_latency	=	0;
@@ -1612,6 +1617,44 @@ static void init_clusters(void)
 	INIT_LIST_HEAD(&cluster_head);
 }
 
+/* Set @cluster's scale factor; the u64 cast stops a 32-bit product wrap. */
+static inline void
+__update_cpu_cycle_max_possible_freq(struct sched_cluster *cluster)
+{
+	cluster->cpu_cycle_max_scale_factor =
+	    div64_u64((u64)cluster->max_possible_freq * NSEC_PER_USEC,
+		      cpu_cycle_counter_cb.get_cpu_cycles_max_per_us(
+						cluster_first_cpu(cluster)));
+}
+
+/* Refresh @cluster's cycle scale factor. Does nothing until a cycle counter
+ * callback is registered, i.e. while use_cycle_counter is still false. */
+static inline void
+update_cpu_cycle_max_possible_freq(struct sched_cluster *cluster)
+{
+	if (use_cycle_counter)
+		__update_cpu_cycle_max_possible_freq(cluster);
+}
+
+/* Install @cb as the CPU cycle counter source and recompute every cluster's
+ * scale factor. Returns 0, or -EINVAL if @cb or one of its hooks is NULL. */
+int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
+{
+	struct sched_cluster *cluster = NULL;
+
+	if (!cb || !cb->get_cpu_cycle_counter || !cb->get_cpu_cycles_max_per_us)
+		return -EINVAL;
+
+	mutex_lock(&cluster_lock);
+	cpu_cycle_counter_cb = *cb;
+	for_each_sched_cluster(cluster)
+		__update_cpu_cycle_max_possible_freq(cluster);
+	use_cycle_counter = true;	/* only once every factor is valid */
+	mutex_unlock(&cluster_lock);
+
+	return 0;
+}
+
 static int __init set_sched_enable_hmp(char *str)
 {
 	int enable_hmp = 0;
@@ -1718,12 +1761,24 @@ static inline void clear_boost_kick(int cpu) { }
 
 static inline void clear_hmp_request(int cpu) { }
 
+int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
+{
+	return 0;	/* stub: @cb is ignored, success is always reported */
+}
+
 #ifdef CONFIG_SMP
 static void update_cluster_topology(void) { }
 #endif
 
 #endif	/* CONFIG_SCHED_HMP */
 
+#define SCHED_MIN_FREQ 1	/* .cycles of the placeholder cpu_cycle */
+
+struct cpu_cycle {
+	u64 cycles;	/* cycle delta, or a frequency when no cycle counter */
+	u64 time;	/* span the cycles cover; 1 when no cycle counter */
+};
+
 #if defined(CONFIG_SCHED_HMP)
 
 /*
@@ -1865,19 +1920,17 @@ update_window_start(struct rq *rq, u64 wallclock)
 	rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
 }
 
-static inline u64 scale_exec_time(u64 delta, struct rq *rq)
+#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + ((Y) - 1), (Y)) /* ceil(X/Y) */
+
+static inline u64 scale_exec_time(u64 delta, struct rq *rq,
+				  const struct cpu_cycle *cc)
 {
 	int cpu = cpu_of(rq);
-	unsigned int cur_freq = cpu_cur_freq(cpu);
 	int sf;
 
-	if (unlikely(cur_freq > max_possible_freq))
-		cur_freq = max_possible_freq;
-
-	/* round up div64 */
-	delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
-			  max_possible_freq);
-
+	delta = DIV64_U64_ROUNDUP(delta * cc->cycles *
+				  cpu_cycle_max_scale_factor(cpu),
+				  max_possible_freq * cc->time);
 	sf = DIV_ROUND_UP(cpu_efficiency(cpu) * 1024, max_possible_efficiency);
 
 	delta *= sf;
@@ -2251,7 +2304,8 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
  * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
  */
 static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
-	     int event, u64 wallclock, u64 irqtime)
+				 int event, u64 wallclock, u64 irqtime,
+				 const struct cpu_cycle *cc)
 {
 	int new_window, nr_full_windows = 0;
 	int p_is_curr_task = (p == rq->curr);
@@ -2341,7 +2395,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			delta = wallclock - mark_start;
 		else
 			delta = irqtime;
-		delta = scale_exec_time(delta, rq);
+		delta = scale_exec_time(delta, rq, cc);
 		rq->curr_runnable_sum += delta;
 		if (new_task)
 			rq->nt_curr_runnable_sum += delta;
@@ -2366,14 +2420,15 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		if (!nr_full_windows) {
 			/* A full window hasn't elapsed, account partial
 			 * contribution to previous completed window. */
-			delta = scale_exec_time(window_start - mark_start, rq);
+			delta = scale_exec_time(window_start - mark_start, rq,
+						cc);
 			if (!exiting_task(p))
 				p->ravg.prev_window += delta;
 		} else {
 			/* Since at least one full window has elapsed,
 			 * the contribution to the previous window is the
 			 * full window (window_size). */
-			delta = scale_exec_time(window_size, rq);
+			delta = scale_exec_time(window_size, rq, cc);
 			if (!exiting_task(p))
 				p->ravg.prev_window = delta;
 		}
@@ -2382,7 +2437,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			rq->nt_prev_runnable_sum += delta;
 
 		/* Account piece of busy time in the current window. */
-		delta = scale_exec_time(wallclock - window_start, rq);
+		delta = scale_exec_time(wallclock - window_start, rq, cc);
 		rq->curr_runnable_sum += delta;
 		if (new_task)
 			rq->nt_curr_runnable_sum += delta;
@@ -2408,7 +2463,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		if (!nr_full_windows) {
 			/* A full window hasn't elapsed, account partial
 			 * contribution to previous completed window. */
-			delta = scale_exec_time(window_start - mark_start, rq);
+			delta = scale_exec_time(window_start - mark_start, rq,
+						cc);
 			if (!is_idle_task(p) && !exiting_task(p))
 				p->ravg.prev_window += delta;
 
@@ -2421,7 +2477,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			/* Since at least one full window has elapsed,
 			 * the contribution to the previous window is the
 			 * full window (window_size). */
-			delta = scale_exec_time(window_size, rq);
+			delta = scale_exec_time(window_size, rq, cc);
 			if (!is_idle_task(p) && !exiting_task(p))
 				p->ravg.prev_window = delta;
 
@@ -2439,7 +2495,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		rq->prev_runnable_sum = delta;
 
 		/* Account piece of busy time in the current window. */
-		delta = scale_exec_time(wallclock - window_start, rq);
+		delta = scale_exec_time(wallclock - window_start, rq, cc);
 		rq->curr_runnable_sum = delta;
 		if (new_task)
 			rq->nt_curr_runnable_sum = delta;
@@ -2471,7 +2527,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
 		rq->nt_curr_runnable_sum = 0;
 		if (mark_start > window_start) {
-			rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+			rq->curr_runnable_sum = scale_exec_time(irqtime, rq,
+								cc);
 			return;
 		}
 
@@ -2480,12 +2537,12 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		delta = window_start - mark_start;
 		if (delta > window_size)
 			delta = window_size;
-		delta = scale_exec_time(delta, rq);
+		delta = scale_exec_time(delta, rq, cc);
 		rq->prev_runnable_sum += delta;
 
 		/* Process the remaining IRQ busy time in the current window. */
 		delta = wallclock - window_start;
-		rq->curr_runnable_sum = scale_exec_time(delta, rq);
+		rq->curr_runnable_sum = scale_exec_time(delta, rq, cc);
 
 		return;
 	}
@@ -2515,7 +2572,7 @@ update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
 }
 
 static inline void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
-	     int event, u64 wallclock, u64 irqtime)
+	     int event, u64 wallclock, u64 irqtime, const struct cpu_cycle *cc)
 {
 }
 
@@ -2528,6 +2585,41 @@ static inline u32 predict_and_update_buckets(struct rq *rq,
 
 #endif	/* CONFIG_SCHED_FREQ_INPUT */
 
+static void update_task_cpu_cycles(struct task_struct *p, int cpu)
+{
+	if (use_cycle_counter)	/* otherwise p->cpu_cycles is left untouched */
+		p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+}
+
+/* Sample @rq's CPU cycle counter: return the cycles counted since @p's last
+ * sample and the time since mark_start; without a counter, {cur_freq, 1}. */
+static struct cpu_cycle
+get_task_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
+		    u64 wallclock)
+{
+	u64 cur_cycles;
+	struct cpu_cycle cc;
+	int cpu = cpu_of(rq);
+
+	if (!use_cycle_counter) {
+		cc.cycles = cpu_cur_freq(cpu);
+		cc.time = 1;
+		return cc;
+	}
+
+	cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+	/* u64 subtraction is modulo 2^64, so counter wrap needs no branch. */
+	cc.cycles = cur_cycles - p->cpu_cycles;
+	cc.time = wallclock - p->ravg.mark_start;
+	BUG_ON((s64)cc.time < 0);	/* wallclock < mark_start */
+
+	p->cpu_cycles = cur_cycles;
+
+	trace_sched_get_task_cpu_cycles(cpu, event, cc.cycles, cc.time);
+
+	return cc;
+}
+
 static int account_busy_for_task_demand(struct task_struct *p, int event)
 {
 	/* No need to bother updating task demand for exiting tasks
@@ -2614,9 +2706,9 @@ done:
 }
 
 static void add_to_task_demand(struct rq *rq, struct task_struct *p,
-				u64 delta)
+				u64 delta, const struct cpu_cycle *cc)
 {
-	delta = scale_exec_time(delta, rq);
+	delta = scale_exec_time(delta, rq, cc);
 	p->ravg.sum += delta;
 	if (unlikely(p->ravg.sum > sched_ravg_window))
 		p->ravg.sum = sched_ravg_window;
@@ -2673,7 +2765,8 @@ static void add_to_task_demand(struct rq *rq, struct task_struct *p,
  * depends on it!
  */
 static void update_task_demand(struct task_struct *p, struct rq *rq,
-	     int event, u64 wallclock)
+			       int event, u64 wallclock,
+			       const struct cpu_cycle *cc)
 {
 	u64 mark_start = p->ravg.mark_start;
 	u64 delta, window_start = rq->window_start;
@@ -2696,7 +2789,7 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 	if (!new_window) {
 		/* The simple case - busy time contained within the existing
 		 * window. */
-		add_to_task_demand(rq, p, wallclock - mark_start);
+		add_to_task_demand(rq, p, wallclock - mark_start, cc);
 		return;
 	}
 
@@ -2707,12 +2800,12 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 	window_start -= (u64)nr_full_windows * (u64)window_size;
 
 	/* Process (window_start - mark_start) first */
-	add_to_task_demand(rq, p, window_start - mark_start);
+	add_to_task_demand(rq, p, window_start - mark_start, cc);
 
 	/* Push new sample(s) into task's demand history */
 	update_history(rq, p, p->ravg.sum, 1, event);
 	if (nr_full_windows)
-		update_history(rq, p, scale_exec_time(window_size, rq),
+		update_history(rq, p, scale_exec_time(window_size, rq, cc),
 			       nr_full_windows, event);
 
 	/* Roll window_start back to current to process any remainder
@@ -2721,30 +2814,39 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 
 	/* Process (wallclock - window_start) next */
 	mark_start = window_start;
-	add_to_task_demand(rq, p, wallclock - mark_start);
+	add_to_task_demand(rq, p, wallclock - mark_start, cc);
 }
 
 /* Reflect task activity on its demand and cpu's busy time statistics */
-static void update_task_ravg(struct task_struct *p, struct rq *rq,
-	     int event, u64 wallclock, u64 irqtime)
+static struct cpu_cycle
+update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+		 u64 wallclock, u64 irqtime)
 {
+	struct cpu_cycle cc = { .cycles = SCHED_MIN_FREQ, .time = 1 };
+
 	if (sched_use_pelt || !rq->window_start || sched_disable_window_stats)
-		return;
+		return cc;
 
 	lockdep_assert_held(&rq->lock);
 
 	update_window_start(rq, wallclock);
 
-	if (!p->ravg.mark_start)
+	if (!p->ravg.mark_start) {
+		update_task_cpu_cycles(p, cpu_of(rq));
 		goto done;
+	}
 
-	update_task_demand(p, rq, event, wallclock);
-	update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+	cc = get_task_cpu_cycles(p, rq, event, wallclock);
+	update_task_demand(p, rq, event, wallclock, &cc);
+	update_cpu_busy_time(p, rq, event, wallclock, irqtime, &cc);
 	update_task_pred_demand(rq, p, event);
 done:
-	trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime);
+	trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
+				     cc.cycles, cc.time);
 
 	p->ravg.mark_start = wallclock;
+
+	return cc;
 }
 
 void sched_account_irqtime(int cpu, struct task_struct *curr,
@@ -2812,6 +2914,7 @@ static inline void mark_task_starting(struct task_struct *p)
 	wallclock = sched_ktime_clock();
 	p->ravg.mark_start = p->last_wake_ts = wallclock;
 	p->last_switch_out_ts = 0;
+	update_task_cpu_cycles(p, cpu_of(rq));
 }
 
 static inline void set_window_start(struct rq *rq)
@@ -3029,6 +3132,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
 	int early_detection[cpus];
 	int cpu, i = 0;
 	unsigned int window_size;
+	struct cpu_cycle cc;
 
 	if (unlikely(cpus == 0))
 		return;
@@ -3047,8 +3151,10 @@ void sched_get_cpus_busy(struct sched_load *busy,
 	for_each_cpu(cpu, query_cpus) {
 		rq = cpu_rq(cpu);
 
-		update_task_ravg(rq->curr, rq, TASK_UPDATE,
-				 sched_ktime_clock(), 0);
+		cc = update_task_ravg(rq->curr, rq, TASK_UPDATE,
+				      sched_ktime_clock(), 0);
+		/* Use cpu's cluster factor; i is only the array slot. */
+		cur_freq[i] = cpu_cycles_to_freq(cpu, cc.cycles, cc.time);
 		load[i] = rq->old_busy_time = rq->prev_runnable_sum;
 		nload[i] = rq->nt_prev_runnable_sum;
 		pload[i] = rq->hmp_stats.pred_demands_sum;
@@ -3066,7 +3172,6 @@ void sched_get_cpus_busy(struct sched_load *busy,
 		notifier_sent[i] = rq->notifier_sent;
 		early_detection[i] = (rq->ed_task != NULL);
 		rq->notifier_sent = 0;
-		cur_freq[i] = cpu_cur_freq(cpu);
 		max_freq[i] = cpu_max_freq(cpu);
 		i++;
 	}
@@ -3212,6 +3317,8 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
 	update_task_ravg(p, task_rq(p), TASK_MIGRATE,
 			 wallclock, 0);
 
+	update_task_cpu_cycles(p, new_cpu);
+
 	new_task = is_new_task(p);
 
 	if (p->ravg.curr_window) {
@@ -3531,6 +3638,7 @@ static int cpufreq_notifier_policy(struct notifier_block *nb,
 
 			sort_clusters();
 			update_all_clusters_stats();
+			update_cpu_cycle_max_possible_freq(cluster);
 			mutex_unlock(&cluster_lock);
 			continue;
 		}
@@ -3685,10 +3793,16 @@ heavy_task_wakeup(struct task_struct *p, struct rq *rq, int event)
 	return 0;
 }
 
-static inline void
+static struct cpu_cycle
 update_task_ravg(struct task_struct *p, struct rq *rq,
 			 int event, u64 wallclock, u64 irqtime)
 {
+	static const struct cpu_cycle cc = {
+		.cycles = SCHED_MIN_FREQ,
+		.time = 1
+	};
+
+	return cc;
 }
 
 static inline void mark_task_starting(struct task_struct *p) {}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 997339470655..aa30f55dc5ee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4808,6 +4808,7 @@ void init_new_task_load(struct task_struct *p)
 	rcu_assign_pointer(p->grp, NULL);
 	INIT_LIST_HEAD(&p->grp_list);
 	memset(&p->ravg, 0, sizeof(struct ravg));
+	p->cpu_cycles = 0;
 
 	if (init_load_pct) {
 		init_load_pelt = div64_u64((u64)init_load_pct *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a11e74c191f3..19033bfc3f8e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -387,6 +387,11 @@ struct sched_cluster {
 	 * max_possible_freq = maximum supported by hardware
 	 */
 	unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
+	/*
+	 * cpu_cycle_max_scale_factor represents number of cycles per NSEC at
+	 * CPU's fmax.
+	 */
+	u32 cpu_cycle_max_scale_factor;
 	bool freq_init_done;
 	int dstate, dstate_wakeup_latency, dstate_wakeup_energy;
 	unsigned int static_cluster_pwr_cost;
@@ -1120,6 +1125,16 @@ static inline int cpu_max_power_cost(int cpu)
 	return cpu_rq(cpu)->cluster->max_power_cost;
 }
 
+static inline u32 cpu_cycle_max_scale_factor(int cpu)
+{
+	return cpu_rq(cpu)->cluster->cpu_cycle_max_scale_factor;
+}
+
+static inline u32 cpu_cycles_to_freq(int cpu, u64 cycles, u64 period)
+{
+	return div64_u64(cycles * cpu_cycle_max_scale_factor(cpu), period);
+}
+
 static inline bool hmp_capable(void)
 {
 	return max_possible_capacity != min_max_possible_capacity;