author    Steve Muckle <smuckle@codeaurora.org>    2014-05-06 18:05:50 -0700
committer David Keitel <dkeitel@codeaurora.org>    2016-03-23 19:59:29 -0700
commit    f469bce8e2d2062568f753ca7e9099715f504df8 (patch)
tree      821999bd5edcf830dae9ab4094a04e8bc7c3fb5a
parent    e640249dbade56af7bc968fce2f5ede230602e6e (diff)
sched: add migration load change notifier for frequency guidance
When a task moves between CPUs in two different frequency domains the
cpufreq governor may wish to immediately modify the frequency of both
the source and destination CPUs of the migrating task.

A tunable is provided to establish what size task is considered
"significant" enough to warrant notifying cpufreq.

Also fix a bug that would cause load to not be accounted properly
during wakeup migrations.

Change-Id: Ie8f6b1cc4d43a602840dac18590b42a81327c95a
Signed-off-by: Steve Muckle <smuckle@codeaurora.org>
[rameezmustafa@codeaurora.org: Add double rq locking for set_task_cpu()]
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
-rw-r--r--  include/linux/sched.h         |  2
-rw-r--r--  include/linux/sched/sysctl.h  |  5
-rw-r--r--  kernel/sched/core.c           | 61
-rw-r--r--  kernel/sched/fair.c           |  4
-rw-r--r--  kernel/sched/sched.h          |  4
-rw-r--r--  kernel/sysctl.c               |  9
6 files changed, 78 insertions, 7 deletions
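The hook added here is a standard kernel atomic notifier chain, so a cpufreq
governor consumes it by registering a callback. Below is a minimal consumer
sketch, not part of this patch: the callback and notifier_block names are
hypothetical; only load_alert_notifier_head, the val == 0 argument, and the
cpu number cast into the data pointer come from the change itself.

    #include <linux/init.h>
    #include <linux/notifier.h>
    #include <linux/sched.h>

    /* Hypothetical governor-side consumer of load_alert_notifier_head.
     * The patch fires the chain twice per "significant" inter-domain
     * migration: once with the source cpu and once with the destination
     * cpu passed as the data pointer. */
    static int demo_load_alert_cb(struct notifier_block *nb,
                                  unsigned long val, void *data)
    {
            int cpu = (long)data;

            /* Re-evaluate the frequency of cpu's policy here. */
            return NOTIFY_OK;
    }

    static struct notifier_block demo_load_alert_nb = {
            .notifier_call = demo_load_alert_cb,
    };

    static int __init demo_load_alert_init(void)
    {
            return atomic_notifier_chain_register(&load_alert_notifier_head,
                                                  &demo_load_alert_nb);
    }

Teardown would mirror this with atomic_notifier_chain_unregister().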
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 763eb0312130..2d2a94575eaa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3171,6 +3171,8 @@ struct migration_notify_data {
int load;
};
+extern struct atomic_notifier_head load_alert_notifier_head;
+
extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 66a978ca7a65..b65ee06f80c9 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -47,6 +47,8 @@ extern unsigned int sysctl_sched_window_stats_policy;
extern unsigned int sysctl_sched_init_task_load_pct;
#endif
+extern unsigned int sysctl_sched_task_migrate_notify_pct;
+
#ifdef CONFIG_SCHED_HMP
extern unsigned int sysctl_sched_enable_hmp_task_placement;
extern unsigned int sysctl_sched_mostly_idle_nr_run;
@@ -87,6 +89,9 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
loff_t *ppos);
#endif
+extern int sched_migrate_notify_proc_handler(struct ctl_table *table,
+ int write, void __user *buffer, size_t *lenp, loff_t *ppos);
+
extern int sched_hmp_proc_update_handler(struct ctl_table *table,
int write, void __user *buffer, size_t *lenp, loff_t *ppos);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2b7d83fbd90..a96e2225755a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -92,6 +92,7 @@
#include <trace/events/sched.h>
ATOMIC_NOTIFIER_HEAD(migration_notifier_head);
+ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -1095,6 +1096,29 @@ unsigned int __read_mostly sched_use_pelt;
unsigned int max_possible_efficiency = 1024;
unsigned int min_possible_efficiency = 1024;
+__read_mostly unsigned int sysctl_sched_task_migrate_notify_pct = 25;
+unsigned int sched_task_migrate_notify;
+
+int sched_migrate_notify_proc_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ unsigned int *data = (unsigned int *)table->data;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ if (*data > 100)
+ return -EINVAL;
+
+ sched_task_migrate_notify = div64_u64((u64)*data *
+ (u64)max_task_load(), 100);
+
+ return 0;
+}
+
/*
* Called when new window is starting for a task, to record cpu usage over
* recently concluded window(s). Normally 'samples' should be 1. It can be > 1
@@ -1687,21 +1711,46 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
perf_event_task_migrate(p);
#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
- if (p->on_rq) {
+ if (p->on_rq || p->state == TASK_WAKING) {
struct rq *src_rq = task_rq(p);
struct rq *dest_rq = cpu_rq(new_cpu);
- p->on_rq = 0; /* Fixme */
- update_task_ravg(p, task_rq(p), 0, sched_clock());
- p->on_rq = 1; /* Fixme */
+ /* In the wakeup case the task has already had its
+ * statistics updated (and the rq is not locked). */
+ if (p->state != TASK_WAKING) {
+ p->on_rq = 0; /* todo */
+ update_task_ravg(p, task_rq(p), 0,
+ sched_clock());
+ p->on_rq = 1; /* todo */
+ }
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
update_task_ravg(dest_rq->curr, dest_rq,
- 1, sched_clock());
+ 1, sched_clock());
src_rq->curr_runnable_sum -= p->ravg.sum;
src_rq->prev_runnable_sum -= p->ravg.prev_window;
dest_rq->curr_runnable_sum += p->ravg.sum;
dest_rq->prev_runnable_sum += p->ravg.prev_window;
+
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+
+ /* Is p->ravg.prev_window significant? Trigger a load
+ * alert notifier if so. */
+ if (p->ravg.prev_window > sched_task_migrate_notify &&
+ !cpumask_test_cpu(new_cpu,
+ &src_rq->freq_domain_cpumask)) {
+ atomic_notifier_call_chain(
+ &load_alert_notifier_head, 0,
+ (void *)(long)task_cpu(p));
+ atomic_notifier_call_chain(
+ &load_alert_notifier_head, 0,
+ (void *)(long)new_cpu);
+ }
}
#endif
@@ -7899,6 +7948,8 @@ static int cpufreq_notifier_policy(struct notifier_block *nb,
return 0;
for_each_cpu(i, policy->related_cpus) {
+ cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
+ policy->related_cpus);
cpu_rq(i)->min_freq = policy->min;
cpu_rq(i)->max_freq = policy->max;
cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b04af1c436cc..6fe51274c748 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2551,7 +2551,7 @@ static inline unsigned int task_load(struct task_struct *p)
return p->ravg.demand;
}
-static inline unsigned int max_task_load(void)
+unsigned int max_task_load(void)
{
if (sched_use_pelt)
return LOAD_AVG_MAX;
@@ -6442,7 +6442,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ double_unlock_balance(env->src_rq, env->dst_rq);
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c5d593ba30f2..a0d35bbc2626 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -650,6 +650,8 @@ struct rq {
* max_possible_freq = maximum supported by hardware
*/
unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
+ struct cpumask freq_domain_cpumask;
+
u64 cumulative_runnable_avg;
int efficiency; /* Differentiate cpus with different IPC capability */
int load_scale_factor;
@@ -961,7 +963,7 @@ static inline u64 scale_task_load(u64 load, int cpu)
return load;
}
#endif
-
+unsigned int max_task_load(void);
static inline void
inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 06fe2f6591e7..4560a50a4558 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -292,6 +292,15 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ {
+ .procname = "sched_task_migrate_notify",
+ .data = &sysctl_sched_task_migrate_notify_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_migrate_notify_proc_handler,
+ },
+#endif
#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
{
.procname = "sched_window_stats_policy",