author     Joonwoo Park <joonwoop@codeaurora.org>    2015-04-17 17:16:02 -0700
committer  David Keitel <dkeitel@codeaurora.org>     2016-03-23 20:02:07 -0700
commit     a4c475e43db22d2db8bdea676536792cc6b0f724 (patch)
tree       adfbed66446eadf8cbb8ba5633ddb6f4f5cebbe1 /kernel
parent     eca78aaf849e531a94483c614ca5e93d8e575532 (diff)
sched: prevent task migration while governor queries CPUs' load
At present, the governor retrieves each CPU's load sequentially. This
leaves a window for a race between the governor's CPU load query and
task migration, which can result in less CPU load being reported than
actually exists. For example, with CPU 0 load = 30% and CPU 1 load = 50%:

  Governor                        Load balancer
  - sched_get_busy(cpu 0) = 30%
                                  - A task 'p' migrates from CPU 1 to
                                    CPU 0 with p->ravg->prev_window = 50.
                                    Now CPU 0's load = 80%, CPU 1's
                                    load = 0%.
  - sched_get_busy(cpu 1) = 0%

The 50% of load that moved from CPU 1 to CPU 0 is never accounted for.

Fix such issues by introducing a new API, sched_get_cpus_busy(), which
lets the governor retrieve the load of a set of CPUs in a single call.
The set of loads is constructed internally while the load balancer is
blocked, so that migration cannot occur in the meantime.

Change-Id: I4fa4dd1195eff26aa603829aca2054871521495e
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
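For reference, a caller-side sketch of what the new API buys. This is
illustrative only and not part of the patch: query_cluster_load(),
cluster_cpus and MAX_CLUSTER_CPUS are made-up names, and it assumes the
MSM HMP scheduler tree where sched_get_busy()/sched_get_cpus_busy() are
declared in <linux/sched.h>.

	#include <linux/bug.h>
	#include <linux/cpumask.h>
	#include <linux/printk.h>
	#include <linux/sched.h>

	/* Assumption: upper bound on the number of CPUs per cluster. */
	#define MAX_CLUSTER_CPUS 4

	static void query_cluster_load(const struct cpumask *cluster_cpus)
	{
		unsigned long busy[MAX_CLUSTER_CPUS];
		int cpu, i = 0;

		if (WARN_ON(cpumask_weight(cluster_cpus) > MAX_CLUSTER_CPUS))
			return;

		/*
		 * Old, racy pattern: one call per CPU, so a migration between
		 * two calls can move window load that then shows up in neither
		 * sample:
		 *
		 *	for_each_cpu(cpu, cluster_cpus)
		 *		busy[i++] = sched_get_busy(cpu);
		 *
		 * New pattern: one call fills the whole array while every
		 * queried CPU's runqueue lock is held, so no task can migrate
		 * between the individual samples.
		 */
		sched_get_cpus_busy(busy, cluster_cpus);

		/* Values are reported in microseconds of window busy time. */
		for_each_cpu(cpu, cluster_cpus)
			pr_debug("cpu%d busy: %lu us\n", cpu, busy[i++]);
	}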
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c  94
1 file changed, 65 insertions(+), 29 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0c6508bebca5..6f47fa3dbf41 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2159,51 +2159,87 @@ scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
return div64_u64(load * (u64)src_freq, (u64)dst_freq);
}
-unsigned long sched_get_busy(int cpu)
+void sched_get_cpus_busy(unsigned long *busy, const struct cpumask *query_cpus)
{
unsigned long flags;
- struct rq *rq = cpu_rq(cpu);
- u64 load;
+ struct rq *rq;
+ const int cpus = cpumask_weight(query_cpus);
+ u64 load[cpus];
+ unsigned int cur_freq[cpus], max_freq[cpus];
+ int notifier_sent[cpus];
+ int cpu, i = 0;
+ unsigned int window_size;
+
+ if (unlikely(cpus == 0))
+ return;
/*
* This function could be called in timer context, and the
* current task may have been executing for a long time. Ensure
* that the window stats are current by doing an update.
*/
- raw_spin_lock_irqsave(&rq->lock, flags);
- update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_clock(), 0);
- load = rq->old_busy_time = rq->prev_runnable_sum;
+ local_irq_save(flags);
+ for_each_cpu(cpu, query_cpus)
+ raw_spin_lock(&cpu_rq(cpu)->lock);
- /*
- * Scale load in reference to rq->max_possible_freq.
- *
- * Note that scale_load_to_cpu() scales load in reference to
- * rq->max_freq
- */
- load = scale_load_to_cpu(load, cpu);
+ window_size = sched_ravg_window;
- if (!rq->notifier_sent) {
- u64 load_at_cur_freq;
-
- load_at_cur_freq = scale_load_to_freq(load, rq->max_freq,
- rq->cur_freq);
- if (load_at_cur_freq > sched_ravg_window)
- load_at_cur_freq = sched_ravg_window;
- load = scale_load_to_freq(load_at_cur_freq,
- rq->cur_freq, rq->max_possible_freq);
- } else {
- load = scale_load_to_freq(load, rq->max_freq,
- rq->max_possible_freq);
+ for_each_cpu(cpu, query_cpus) {
+ rq = cpu_rq(cpu);
+
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_clock(), 0);
+ load[i] = rq->old_busy_time = rq->prev_runnable_sum;
+ /*
+ * Scale load in reference to rq->max_possible_freq.
+ *
+ * Note that scale_load_to_cpu() scales load in reference to
+ * rq->max_freq.
+ */
+ load[i] = scale_load_to_cpu(load[i], cpu);
+
+ notifier_sent[i] = rq->notifier_sent;
rq->notifier_sent = 0;
+ cur_freq[i] = rq->cur_freq;
+ max_freq[i] = rq->max_freq;
+ i++;
}
- load = div64_u64(load, NSEC_PER_USEC);
+ for_each_cpu(cpu, query_cpus)
+ raw_spin_unlock(&(cpu_rq(cpu))->lock);
+ local_irq_restore(flags);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ i = 0;
+ for_each_cpu(cpu, query_cpus) {
+ rq = cpu_rq(cpu);
+
+ if (!notifier_sent[i]) {
+ load[i] = scale_load_to_freq(load[i], max_freq[i],
+ cur_freq[i]);
+ if (load[i] > window_size)
+ load[i] = window_size;
+ load[i] = scale_load_to_freq(load[i], cur_freq[i],
+ rq->max_possible_freq);
+ } else {
+ load[i] = scale_load_to_freq(load[i], max_freq[i],
+ rq->max_possible_freq);
+ }
+
+ busy[i] = div64_u64(load[i], NSEC_PER_USEC);
+
+ trace_sched_get_busy(cpu, busy[i]);
+ i++;
+ }
+}
+
+unsigned long sched_get_busy(int cpu)
+{
+ struct cpumask query_cpu = CPU_MASK_NONE;
+ unsigned long busy;
- trace_sched_get_busy(cpu, load);
+ cpumask_set_cpu(cpu, &query_cpu);
+ sched_get_cpus_busy(&busy, &query_cpu);
- return load;
+ return busy;
}
void sched_set_io_is_busy(int val)